idscrub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/__init__.py +1 -0
- idscrub/locations.py +10 -0
- idscrub/scrub.py +947 -0
- idscrub-0.1.0.dist-info/METADATA +147 -0
- idscrub-0.1.0.dist-info/RECORD +20 -0
- idscrub-0.1.0.dist-info/WHEEL +5 -0
- idscrub-0.1.0.dist-info/licenses/LICENSE +21 -0
- idscrub-0.1.0.dist-info/top_level.txt +3 -0
- notebooks/basic_usage.ipynb +1002 -0
- test/conftest.py +12 -0
- test/test_all.py +39 -0
- test/test_chain.py +54 -0
- test/test_dataframe.py +51 -0
- test/test_huggingface.py +25 -0
- test/test_id.py +24 -0
- test/test_log.py +17 -0
- test/test_persidio.py +44 -0
- test/test_phonenumbers.py +13 -0
- test/test_regex.py +123 -0
- test/test_spacy.py +28 -0
|
@@ -0,0 +1,1002 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"### `idscrub` basic usage example"
|
|
8
|
+
]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"cell_type": "code",
|
|
12
|
+
"execution_count": 1,
|
|
13
|
+
"metadata": {},
|
|
14
|
+
"outputs": [
|
|
15
|
+
{
|
|
16
|
+
"name": "stderr",
|
|
17
|
+
"output_type": "stream",
|
|
18
|
+
"text": [
|
|
19
|
+
"/Users/euansoutter/Documents/code/idscrub/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
20
|
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
|
21
|
+
"INFO: Texts loaded.\n",
|
|
22
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
23
|
+
"100%|██████████| 2/2 [00:00<00:00, 9.48it/s]\n",
|
|
24
|
+
"INFO: 3 presidio person scrubbed.\n",
|
|
25
|
+
"INFO: 1 presidio location scrubbed.\n",
|
|
26
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
27
|
+
"100%|██████████| 2/2 [00:00<00:00, 55.76it/s]\n",
|
|
28
|
+
"INFO: 0 spacy person scrubbed.\n",
|
|
29
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
30
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
31
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
32
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
33
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
34
|
+
"INFO: 0 handles scrubbed.\n",
|
|
35
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
36
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
37
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
38
|
+
"INFO: 1 uk phone numbers scrubbed.\n",
|
|
39
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
40
|
+
"INFO: 1 uk postcodes scrubbed.\n",
|
|
41
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
42
|
+
"INFO: 0 titles scrubbed.\n"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "stdout",
|
|
47
|
+
"output_type": "stream",
|
|
48
|
+
"text": [
|
|
49
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']\n"
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
],
|
|
53
|
+
"source": [
|
|
54
|
+
"from idscrub import IDScrub\n",
|
|
55
|
+
"\n",
|
|
56
|
+
"scrub = IDScrub(\n",
|
|
57
|
+
" [\n",
|
|
58
|
+
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
59
|
+
" \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
|
|
60
|
+
" ]\n",
|
|
61
|
+
")\n",
|
|
62
|
+
"scrubbed_texts = scrub.all()\n",
|
|
63
|
+
"\n",
|
|
64
|
+
"print(scrubbed_texts)"
|
|
65
|
+
]
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"cell_type": "code",
|
|
69
|
+
"execution_count": 2,
|
|
70
|
+
"metadata": {},
|
|
71
|
+
"outputs": [
|
|
72
|
+
{
|
|
73
|
+
"data": {
|
|
74
|
+
"text/html": [
|
|
75
|
+
"<div>\n",
|
|
76
|
+
"<style scoped>\n",
|
|
77
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
78
|
+
" vertical-align: middle;\n",
|
|
79
|
+
" }\n",
|
|
80
|
+
"\n",
|
|
81
|
+
" .dataframe tbody tr th {\n",
|
|
82
|
+
" vertical-align: top;\n",
|
|
83
|
+
" }\n",
|
|
84
|
+
"\n",
|
|
85
|
+
" .dataframe thead th {\n",
|
|
86
|
+
" text-align: right;\n",
|
|
87
|
+
" }\n",
|
|
88
|
+
"</style>\n",
|
|
89
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
90
|
+
" <thead>\n",
|
|
91
|
+
" <tr style=\"text-align: right;\">\n",
|
|
92
|
+
" <th></th>\n",
|
|
93
|
+
" <th>text_id</th>\n",
|
|
94
|
+
" <th>scrubbed_presidio_person</th>\n",
|
|
95
|
+
" <th>scrubbed_presidio_location</th>\n",
|
|
96
|
+
" <th>scrubbed_uk_phone_numbers</th>\n",
|
|
97
|
+
" <th>scrubbed_uk_postcodes</th>\n",
|
|
98
|
+
" </tr>\n",
|
|
99
|
+
" </thead>\n",
|
|
100
|
+
" <tbody>\n",
|
|
101
|
+
" <tr>\n",
|
|
102
|
+
" <th>0</th>\n",
|
|
103
|
+
" <td>1</td>\n",
|
|
104
|
+
" <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
|
|
105
|
+
" <td>None</td>\n",
|
|
106
|
+
" <td>None</td>\n",
|
|
107
|
+
" <td>None</td>\n",
|
|
108
|
+
" </tr>\n",
|
|
109
|
+
" <tr>\n",
|
|
110
|
+
" <th>1</th>\n",
|
|
111
|
+
" <td>2</td>\n",
|
|
112
|
+
" <td>None</td>\n",
|
|
113
|
+
" <td>[Lapland]</td>\n",
|
|
114
|
+
" <td>[+441111111111]</td>\n",
|
|
115
|
+
" <td>[AA11 1AA]</td>\n",
|
|
116
|
+
" </tr>\n",
|
|
117
|
+
" </tbody>\n",
|
|
118
|
+
"</table>\n",
|
|
119
|
+
"</div>"
|
|
120
|
+
],
|
|
121
|
+
"text/plain": [
|
|
122
|
+
" text_id scrubbed_presidio_person \\\n",
|
|
123
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
124
|
+
"1 2 None \n",
|
|
125
|
+
"\n",
|
|
126
|
+
" scrubbed_presidio_location scrubbed_uk_phone_numbers scrubbed_uk_postcodes \n",
|
|
127
|
+
"0 None None None \n",
|
|
128
|
+
"1 [Lapland] [+441111111111] [AA11 1AA] "
|
|
129
|
+
]
|
|
130
|
+
},
|
|
131
|
+
"execution_count": 2,
|
|
132
|
+
"metadata": {},
|
|
133
|
+
"output_type": "execute_result"
|
|
134
|
+
}
|
|
135
|
+
],
|
|
136
|
+
"source": [
|
|
137
|
+
"scrub.get_scrubbed_data()"
|
|
138
|
+
]
|
|
139
|
+
},
|
|
140
|
+
{
|
|
141
|
+
"cell_type": "markdown",
|
|
142
|
+
"metadata": {},
|
|
143
|
+
"source": [
|
|
144
|
+
"### `idscrub` example - chaining methods together"
|
|
145
|
+
]
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
"cell_type": "code",
|
|
149
|
+
"execution_count": 3,
|
|
150
|
+
"metadata": {},
|
|
151
|
+
"outputs": [
|
|
152
|
+
{
|
|
153
|
+
"name": "stderr",
|
|
154
|
+
"output_type": "stream",
|
|
155
|
+
"text": [
|
|
156
|
+
"INFO: Texts loaded.\n",
|
|
157
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
158
|
+
"100%|██████████| 2/2 [00:00<00:00, 25.84it/s]\n",
|
|
159
|
+
"INFO: 3 presidio person scrubbed.\n",
|
|
160
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
161
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
162
|
+
"INFO: Scrubbing custom regex...\n",
|
|
163
|
+
"INFO: 1 custom regex 1 scrubbed.\n",
|
|
164
|
+
"INFO: 1 custom regex 2 scrubbed.\n",
|
|
165
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
166
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
167
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
168
|
+
"INFO: 0 handles scrubbed.\n",
|
|
169
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
170
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
171
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
172
|
+
"INFO: 1 uk phone numbers scrubbed.\n",
|
|
173
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
174
|
+
"INFO: 1 uk postcodes scrubbed.\n",
|
|
175
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
176
|
+
"INFO: 0 titles scrubbed.\n"
|
|
177
|
+
]
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"name": "stdout",
|
|
181
|
+
"output_type": "stream",
|
|
182
|
+
"text": [
|
|
183
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], University of [UNIVERSITY] where I am on secret mission [REDACTED].']\n"
|
|
184
|
+
]
|
|
185
|
+
}
|
|
186
|
+
],
|
|
187
|
+
"source": [
|
|
188
|
+
"from idscrub import IDScrub\n",
|
|
189
|
+
"\n",
|
|
190
|
+
"scrub = IDScrub(\n",
|
|
191
|
+
" [\n",
|
|
192
|
+
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
193
|
+
" \"My number is +441111111111 and I live at AA11 1AA, University of Lapland where I am on secret mission ACHILLES.\",\n",
|
|
194
|
+
" ]\n",
|
|
195
|
+
")\n",
|
|
196
|
+
"\n",
|
|
197
|
+
"scrub.presidio()\n",
|
|
198
|
+
"scrub.google_phone_numbers(region=\"GB\")\n",
|
|
199
|
+
"scrub.custom_regex(\n",
|
|
200
|
+
" custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
|
|
201
|
+
") # Remove specific regex pattern(s). This can also be passed to all().\n",
|
|
202
|
+
"scrubbed_texts = scrub.all_regex()\n",
|
|
203
|
+
"\n",
|
|
204
|
+
"print(scrubbed_texts)"
|
|
205
|
+
]
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"cell_type": "code",
|
|
209
|
+
"execution_count": 4,
|
|
210
|
+
"metadata": {},
|
|
211
|
+
"outputs": [
|
|
212
|
+
{
|
|
213
|
+
"data": {
|
|
214
|
+
"text/html": [
|
|
215
|
+
"<div>\n",
|
|
216
|
+
"<style scoped>\n",
|
|
217
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
218
|
+
" vertical-align: middle;\n",
|
|
219
|
+
" }\n",
|
|
220
|
+
"\n",
|
|
221
|
+
" .dataframe tbody tr th {\n",
|
|
222
|
+
" vertical-align: top;\n",
|
|
223
|
+
" }\n",
|
|
224
|
+
"\n",
|
|
225
|
+
" .dataframe thead th {\n",
|
|
226
|
+
" text-align: right;\n",
|
|
227
|
+
" }\n",
|
|
228
|
+
"</style>\n",
|
|
229
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
230
|
+
" <thead>\n",
|
|
231
|
+
" <tr style=\"text-align: right;\">\n",
|
|
232
|
+
" <th></th>\n",
|
|
233
|
+
" <th>text_id</th>\n",
|
|
234
|
+
" <th>scrubbed_presidio_person</th>\n",
|
|
235
|
+
" <th>scrubbed_custom_regex_1</th>\n",
|
|
236
|
+
" <th>scrubbed_custom_regex_2</th>\n",
|
|
237
|
+
" <th>scrubbed_uk_phone_numbers</th>\n",
|
|
238
|
+
" <th>scrubbed_uk_postcodes</th>\n",
|
|
239
|
+
" </tr>\n",
|
|
240
|
+
" </thead>\n",
|
|
241
|
+
" <tbody>\n",
|
|
242
|
+
" <tr>\n",
|
|
243
|
+
" <th>0</th>\n",
|
|
244
|
+
" <td>1</td>\n",
|
|
245
|
+
" <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
|
|
246
|
+
" <td>None</td>\n",
|
|
247
|
+
" <td>None</td>\n",
|
|
248
|
+
" <td>None</td>\n",
|
|
249
|
+
" <td>None</td>\n",
|
|
250
|
+
" </tr>\n",
|
|
251
|
+
" <tr>\n",
|
|
252
|
+
" <th>1</th>\n",
|
|
253
|
+
" <td>2</td>\n",
|
|
254
|
+
" <td>None</td>\n",
|
|
255
|
+
" <td>[Lapland]</td>\n",
|
|
256
|
+
" <td>[ACHILLES]</td>\n",
|
|
257
|
+
" <td>[+441111111111]</td>\n",
|
|
258
|
+
" <td>[AA11 1AA]</td>\n",
|
|
259
|
+
" </tr>\n",
|
|
260
|
+
" </tbody>\n",
|
|
261
|
+
"</table>\n",
|
|
262
|
+
"</div>"
|
|
263
|
+
],
|
|
264
|
+
"text/plain": [
|
|
265
|
+
" text_id scrubbed_presidio_person scrubbed_custom_regex_1 \\\n",
|
|
266
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
|
|
267
|
+
"1 2 None [Lapland] \n",
|
|
268
|
+
"\n",
|
|
269
|
+
" scrubbed_custom_regex_2 scrubbed_uk_phone_numbers scrubbed_uk_postcodes \n",
|
|
270
|
+
"0 None None None \n",
|
|
271
|
+
"1 [ACHILLES] [+441111111111] [AA11 1AA] "
|
|
272
|
+
]
|
|
273
|
+
},
|
|
274
|
+
"execution_count": 4,
|
|
275
|
+
"metadata": {},
|
|
276
|
+
"output_type": "execute_result"
|
|
277
|
+
}
|
|
278
|
+
],
|
|
279
|
+
"source": [
|
|
280
|
+
"scrub.get_scrubbed_data()"
|
|
281
|
+
]
|
|
282
|
+
},
|
|
283
|
+
{
|
|
284
|
+
"cell_type": "markdown",
|
|
285
|
+
"metadata": {},
|
|
286
|
+
"source": [
|
|
287
|
+
"### `idscrub` example - using Presidio\n",
|
|
288
|
+
"We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use its entity recognition methods."
|
|
289
|
+
]
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"cell_type": "code",
|
|
293
|
+
"execution_count": 5,
|
|
294
|
+
"metadata": {},
|
|
295
|
+
"outputs": [
|
|
296
|
+
{
|
|
297
|
+
"name": "stderr",
|
|
298
|
+
"output_type": "stream",
|
|
299
|
+
"text": [
|
|
300
|
+
"INFO: Texts loaded.\n",
|
|
301
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
302
|
+
"100%|██████████| 2/2 [00:00<00:00, 26.18it/s]\n",
|
|
303
|
+
"INFO: 3 presidio person scrubbed.\n",
|
|
304
|
+
"INFO: 1 presidio iban code scrubbed.\n"
|
|
305
|
+
]
|
|
306
|
+
},
|
|
307
|
+
{
|
|
308
|
+
"name": "stdout",
|
|
309
|
+
"output_type": "stream",
|
|
310
|
+
"text": [
|
|
311
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My IBAN code is [IBAN_CODE]']\n"
|
|
312
|
+
]
|
|
313
|
+
}
|
|
314
|
+
],
|
|
315
|
+
"source": [
|
|
316
|
+
"from idscrub import IDScrub\n",
|
|
317
|
+
"\n",
|
|
318
|
+
"scrub = IDScrub(\n",
|
|
319
|
+
" [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
|
|
320
|
+
")\n",
|
|
321
|
+
"scrubbed_texts = scrub.presidio()\n",
|
|
322
|
+
"\n",
|
|
323
|
+
"print(scrubbed_texts)"
|
|
324
|
+
]
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"cell_type": "code",
|
|
328
|
+
"execution_count": 6,
|
|
329
|
+
"metadata": {},
|
|
330
|
+
"outputs": [
|
|
331
|
+
{
|
|
332
|
+
"data": {
|
|
333
|
+
"text/html": [
|
|
334
|
+
"<div>\n",
|
|
335
|
+
"<style scoped>\n",
|
|
336
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
337
|
+
" vertical-align: middle;\n",
|
|
338
|
+
" }\n",
|
|
339
|
+
"\n",
|
|
340
|
+
" .dataframe tbody tr th {\n",
|
|
341
|
+
" vertical-align: top;\n",
|
|
342
|
+
" }\n",
|
|
343
|
+
"\n",
|
|
344
|
+
" .dataframe thead th {\n",
|
|
345
|
+
" text-align: right;\n",
|
|
346
|
+
" }\n",
|
|
347
|
+
"</style>\n",
|
|
348
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
349
|
+
" <thead>\n",
|
|
350
|
+
" <tr style=\"text-align: right;\">\n",
|
|
351
|
+
" <th></th>\n",
|
|
352
|
+
" <th>text_id</th>\n",
|
|
353
|
+
" <th>scrubbed_presidio_person</th>\n",
|
|
354
|
+
" <th>scrubbed_presidio_iban_code</th>\n",
|
|
355
|
+
" </tr>\n",
|
|
356
|
+
" </thead>\n",
|
|
357
|
+
" <tbody>\n",
|
|
358
|
+
" <tr>\n",
|
|
359
|
+
" <th>0</th>\n",
|
|
360
|
+
" <td>1</td>\n",
|
|
361
|
+
" <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
|
|
362
|
+
" <td>None</td>\n",
|
|
363
|
+
" </tr>\n",
|
|
364
|
+
" <tr>\n",
|
|
365
|
+
" <th>1</th>\n",
|
|
366
|
+
" <td>2</td>\n",
|
|
367
|
+
" <td>None</td>\n",
|
|
368
|
+
" <td>[GB91BKEN10000041610008]</td>\n",
|
|
369
|
+
" </tr>\n",
|
|
370
|
+
" </tbody>\n",
|
|
371
|
+
"</table>\n",
|
|
372
|
+
"</div>"
|
|
373
|
+
],
|
|
374
|
+
"text/plain": [
|
|
375
|
+
" text_id scrubbed_presidio_person \\\n",
|
|
376
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
377
|
+
"1 2 None \n",
|
|
378
|
+
"\n",
|
|
379
|
+
" scrubbed_presidio_iban_code \n",
|
|
380
|
+
"0 None \n",
|
|
381
|
+
"1 [GB91BKEN10000041610008] "
|
|
382
|
+
]
|
|
383
|
+
},
|
|
384
|
+
"execution_count": 6,
|
|
385
|
+
"metadata": {},
|
|
386
|
+
"output_type": "execute_result"
|
|
387
|
+
}
|
|
388
|
+
],
|
|
389
|
+
"source": [
|
|
390
|
+
"scrub.get_scrubbed_data()"
|
|
391
|
+
]
|
|
392
|
+
},
|
|
393
|
+
{
|
|
394
|
+
"cell_type": "markdown",
|
|
395
|
+
"metadata": {},
|
|
396
|
+
"source": [
|
|
397
|
+
"### `idscrub` example - scrubbing a whole dataframe"
|
|
398
|
+
]
|
|
399
|
+
},
|
|
400
|
+
{
|
|
401
|
+
"cell_type": "code",
|
|
402
|
+
"execution_count": 7,
|
|
403
|
+
"metadata": {},
|
|
404
|
+
"outputs": [
|
|
405
|
+
{
|
|
406
|
+
"data": {
|
|
407
|
+
"text/html": [
|
|
408
|
+
"<div>\n",
|
|
409
|
+
"<style scoped>\n",
|
|
410
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
411
|
+
" vertical-align: middle;\n",
|
|
412
|
+
" }\n",
|
|
413
|
+
"\n",
|
|
414
|
+
" .dataframe tbody tr th {\n",
|
|
415
|
+
" vertical-align: top;\n",
|
|
416
|
+
" }\n",
|
|
417
|
+
"\n",
|
|
418
|
+
" .dataframe thead th {\n",
|
|
419
|
+
" text-align: right;\n",
|
|
420
|
+
" }\n",
|
|
421
|
+
"</style>\n",
|
|
422
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
423
|
+
" <thead>\n",
|
|
424
|
+
" <tr style=\"text-align: right;\">\n",
|
|
425
|
+
" <th></th>\n",
|
|
426
|
+
" <th>ID</th>\n",
|
|
427
|
+
" <th>Pride and Prejudice</th>\n",
|
|
428
|
+
" <th>The Adventures of Sherlock Holmes</th>\n",
|
|
429
|
+
" <th>Frankenstein</th>\n",
|
|
430
|
+
" <th>Fake book</th>\n",
|
|
431
|
+
" </tr>\n",
|
|
432
|
+
" </thead>\n",
|
|
433
|
+
" <tbody>\n",
|
|
434
|
+
" <tr>\n",
|
|
435
|
+
" <th>0</th>\n",
|
|
436
|
+
" <td>A</td>\n",
|
|
437
|
+
" <td>Mr. Darcy walked off; and Elizabeth remained w...</td>\n",
|
|
438
|
+
" <td>To Sherlock Holmes she is always the woman.</td>\n",
|
|
439
|
+
" <td>My dear Victor, do not waste your time upon th...</td>\n",
|
|
440
|
+
" <td>The letter to freddie.mercury@queen.com was st...</td>\n",
|
|
441
|
+
" </tr>\n",
|
|
442
|
+
" <tr>\n",
|
|
443
|
+
" <th>1</th>\n",
|
|
444
|
+
" <td>B</td>\n",
|
|
445
|
+
" <td>Mr. Bennet was so odd a mixture of quick parts...</td>\n",
|
|
446
|
+
" <td>You see, but you do not observe.</td>\n",
|
|
447
|
+
" <td>Learn from me, if not by my precepts, at least...</td>\n",
|
|
448
|
+
" <td>She forwarded the memo from Mick Jagger and Da...</td>\n",
|
|
449
|
+
" </tr>\n",
|
|
450
|
+
" <tr>\n",
|
|
451
|
+
" <th>2</th>\n",
|
|
452
|
+
" <td>C</td>\n",
|
|
453
|
+
" <td>Elizabeth's spirits were so high that they cou...</td>\n",
|
|
454
|
+
" <td>The world is full of obvious things which nobo...</td>\n",
|
|
455
|
+
" <td>I had worked hard for nearly two years, for th...</td>\n",
|
|
456
|
+
" <td>The dossier marked confidential came from sere...</td>\n",
|
|
457
|
+
" </tr>\n",
|
|
458
|
+
" <tr>\n",
|
|
459
|
+
" <th>3</th>\n",
|
|
460
|
+
" <td>D</td>\n",
|
|
461
|
+
" <td>The business of her life was to get her daught...</td>\n",
|
|
462
|
+
" <td>I am a brain, Watson. The rest of me is a mere...</td>\n",
|
|
463
|
+
" <td>Nothing is more painful to the human mind than...</td>\n",
|
|
464
|
+
" <td>A message arrived just as the Downing Street c...</td>\n",
|
|
465
|
+
" </tr>\n",
|
|
466
|
+
" <tr>\n",
|
|
467
|
+
" <th>4</th>\n",
|
|
468
|
+
" <td>E</td>\n",
|
|
469
|
+
" <td>She is tolerable; but not handsome enough to t...</td>\n",
|
|
470
|
+
" <td>When you have eliminated the impossible, whate...</td>\n",
|
|
471
|
+
" <td>Beware; for I am fearless, and therefore power...</td>\n",
|
|
472
|
+
" <td>They did not expected a reply from otis.reddin...</td>\n",
|
|
473
|
+
" </tr>\n",
|
|
474
|
+
" </tbody>\n",
|
|
475
|
+
"</table>\n",
|
|
476
|
+
"</div>"
|
|
477
|
+
],
|
|
478
|
+
"text/plain": [
|
|
479
|
+
" ID Pride and Prejudice \\\n",
|
|
480
|
+
"0 A Mr. Darcy walked off; and Elizabeth remained w... \n",
|
|
481
|
+
"1 B Mr. Bennet was so odd a mixture of quick parts... \n",
|
|
482
|
+
"2 C Elizabeth's spirits were so high that they cou... \n",
|
|
483
|
+
"3 D The business of her life was to get her daught... \n",
|
|
484
|
+
"4 E She is tolerable; but not handsome enough to t... \n",
|
|
485
|
+
"\n",
|
|
486
|
+
" The Adventures of Sherlock Holmes \\\n",
|
|
487
|
+
"0 To Sherlock Holmes she is always the woman. \n",
|
|
488
|
+
"1 You see, but you do not observe. \n",
|
|
489
|
+
"2 The world is full of obvious things which nobo... \n",
|
|
490
|
+
"3 I am a brain, Watson. The rest of me is a mere... \n",
|
|
491
|
+
"4 When you have eliminated the impossible, whate... \n",
|
|
492
|
+
"\n",
|
|
493
|
+
" Frankenstein \\\n",
|
|
494
|
+
"0 My dear Victor, do not waste your time upon th... \n",
|
|
495
|
+
"1 Learn from me, if not by my precepts, at least... \n",
|
|
496
|
+
"2 I had worked hard for nearly two years, for th... \n",
|
|
497
|
+
"3 Nothing is more painful to the human mind than... \n",
|
|
498
|
+
"4 Beware; for I am fearless, and therefore power... \n",
|
|
499
|
+
"\n",
|
|
500
|
+
" Fake book \n",
|
|
501
|
+
"0 The letter to freddie.mercury@queen.com was st... \n",
|
|
502
|
+
"1 She forwarded the memo from Mick Jagger and Da... \n",
|
|
503
|
+
"2 The dossier marked confidential came from sere... \n",
|
|
504
|
+
"3 A message arrived just as the Downing Street c... \n",
|
|
505
|
+
"4 They did not expected a reply from otis.reddin... "
|
|
506
|
+
]
|
|
507
|
+
},
|
|
508
|
+
"execution_count": 7,
|
|
509
|
+
"metadata": {},
|
|
510
|
+
"output_type": "execute_result"
|
|
511
|
+
}
|
|
512
|
+
],
|
|
513
|
+
"source": [
|
|
514
|
+
"import pandas as pd\n",
|
|
515
|
+
"\n",
|
|
516
|
+
"data = {\n",
|
|
517
|
+
" \"ID\": [\"A\", \"B\", \"C\", \"D\", \"E\"],\n",
|
|
518
|
+
" \"Pride and Prejudice\": [\n",
|
|
519
|
+
" \"Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.\",\n",
|
|
520
|
+
" \"Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.\",\n",
|
|
521
|
+
" \"Elizabeth's spirits were so high that they could not be damped for long.\",\n",
|
|
522
|
+
" \"The business of her life was to get her daughters married.\",\n",
|
|
523
|
+
" \"She is tolerable; but not handsome enough to tempt me.\",\n",
|
|
524
|
+
" ],\n",
|
|
525
|
+
" \"The Adventures of Sherlock Holmes\": [\n",
|
|
526
|
+
" \"To Sherlock Holmes she is always the woman.\",\n",
|
|
527
|
+
" \"You see, but you do not observe.\",\n",
|
|
528
|
+
" \"The world is full of obvious things which nobody by any chance ever observes.\",\n",
|
|
529
|
+
" \"I am a brain, Watson. The rest of me is a mere appendix.\",\n",
|
|
530
|
+
" \"When you have eliminated the impossible, whatever remains, however improbable, must be the truth.\",\n",
|
|
531
|
+
" ],\n",
|
|
532
|
+
" \"Frankenstein\": [\n",
|
|
533
|
+
" \"My dear Victor, do not waste your time upon this; it is sad trash.\",\n",
|
|
534
|
+
" \"Learn from me, if not by my precepts, at least by my example.\",\n",
|
|
535
|
+
" \"I had worked hard for nearly two years, for the sole purpose of infusing life into an inanimate body.\",\n",
|
|
536
|
+
" \"Nothing is more painful to the human mind than a great and sudden change.\",\n",
|
|
537
|
+
" \"Beware; for I am fearless, and therefore powerful.\",\n",
|
|
538
|
+
" ],\n",
|
|
539
|
+
" \"Fake book\": [\n",
|
|
540
|
+
" \"The letter to freddie.mercury@queen.com was stamped with SW1A 2AA. His IBAN was GB91BKEN10000041610008.\",\n",
|
|
541
|
+
" \"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.\",\n",
|
|
542
|
+
" \"The dossier marked confidential came from serena.williams@tennis.com, with SW19 5AE etched in bold across the envelope.\",\n",
|
|
543
|
+
" \"A message arrived just as the Downing Street clock struck midnight.\",\n",
|
|
544
|
+
" \"They did not expected a reply from otis.redding@dockofthebay.org, especially one routed through EH8 8DX.\",\n",
|
|
545
|
+
" ],\n",
|
|
546
|
+
"}\n",
|
|
547
|
+
"\n",
|
|
548
|
+
"df = pd.DataFrame(data)\n",
|
|
549
|
+
"df"
|
|
550
|
+
]
|
|
551
|
+
},
|
|
552
|
+
{
|
|
553
|
+
"cell_type": "code",
|
|
554
|
+
"execution_count": 8,
|
|
555
|
+
"metadata": {},
|
|
556
|
+
"outputs": [
|
|
557
|
+
{
|
|
558
|
+
"name": "stderr",
|
|
559
|
+
"output_type": "stream",
|
|
560
|
+
"text": [
|
|
561
|
+
" 0%| | 0/5 [00:00<?, ?it/s]INFO: Texts loaded.\n",
|
|
562
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
563
|
+
"100%|██████████| 5/5 [00:00<00:00, 24.83it/s]\n",
|
|
564
|
+
"INFO: 4 presidio person scrubbed.\n",
|
|
565
|
+
"INFO: 4 presidio person scrubbed.\n",
|
|
566
|
+
"INFO: 4 presidio person scrubbed.\n",
|
|
567
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
568
|
+
"100%|██████████| 5/5 [00:00<00:00, 71.71it/s]\n",
|
|
569
|
+
"INFO: 0 spacy person scrubbed.\n",
|
|
570
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
571
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
572
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
573
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
574
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
575
|
+
"INFO: 0 handles scrubbed.\n",
|
|
576
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
577
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
578
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
579
|
+
"INFO: 0 uk phone numbers scrubbed.\n",
|
|
580
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
581
|
+
"INFO: 0 uk postcodes scrubbed.\n",
|
|
582
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
583
|
+
"INFO: 2 titles scrubbed.\n",
|
|
584
|
+
" 40%|████ | 2/5 [00:02<00:03, 1.25s/it]INFO: Texts loaded.\n",
|
|
585
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
586
|
+
"100%|██████████| 5/5 [00:00<00:00, 29.98it/s]\n",
|
|
587
|
+
"INFO: 2 presidio person scrubbed.\n",
|
|
588
|
+
"INFO: 2 presidio person scrubbed.\n",
|
|
589
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
590
|
+
"100%|██████████| 5/5 [00:00<00:00, 96.09it/s]\n",
|
|
591
|
+
"INFO: 0 spacy person scrubbed.\n",
|
|
592
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
593
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
594
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
595
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
596
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
597
|
+
"INFO: 0 handles scrubbed.\n",
|
|
598
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
599
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
600
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
601
|
+
"INFO: 0 uk phone numbers scrubbed.\n",
|
|
602
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
603
|
+
"INFO: 0 uk postcodes scrubbed.\n",
|
|
604
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
605
|
+
"INFO: 0 titles scrubbed.\n",
|
|
606
|
+
" 60%|██████ | 3/5 [00:04<00:03, 1.66s/it]INFO: Texts loaded.\n",
|
|
607
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
608
|
+
"100%|██████████| 5/5 [00:00<00:00, 26.73it/s]\n",
|
|
609
|
+
"INFO: 1 presidio person scrubbed.\n",
|
|
610
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
611
|
+
"100%|██████████| 5/5 [00:00<00:00, 89.71it/s]\n",
|
|
612
|
+
"INFO: 0 spacy person scrubbed.\n",
|
|
613
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
614
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
615
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
616
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
617
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
618
|
+
"INFO: 0 handles scrubbed.\n",
|
|
619
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
620
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
621
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
622
|
+
"INFO: 0 uk phone numbers scrubbed.\n",
|
|
623
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
624
|
+
"INFO: 0 uk postcodes scrubbed.\n",
|
|
625
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
626
|
+
"INFO: 0 titles scrubbed.\n",
|
|
627
|
+
" 80%|████████ | 4/5 [00:07<00:01, 1.91s/it]INFO: Texts loaded.\n",
|
|
628
|
+
"INFO: Scrubbing using Presidio...\n",
|
|
629
|
+
"100%|██████████| 5/5 [00:00<00:00, 21.44it/s]\n",
|
|
630
|
+
"INFO: 3 presidio email address scrubbed.\n",
|
|
631
|
+
"INFO: 2 presidio person scrubbed.\n",
|
|
632
|
+
"INFO: 3 presidio email address scrubbed.\n",
|
|
633
|
+
"INFO: 3 presidio email address scrubbed.\n",
|
|
634
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
635
|
+
"100%|██████████| 5/5 [00:00<00:00, 57.46it/s]\n",
|
|
636
|
+
"INFO: 0 spacy person scrubbed.\n",
|
|
637
|
+
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
638
|
+
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
639
|
+
"INFO: Scrubbing email addresses using regex...\n",
|
|
640
|
+
"INFO: 0 email addresses scrubbed.\n",
|
|
641
|
+
"INFO: Scrubbing @user handles using regex...\n",
|
|
642
|
+
"INFO: 0 handles scrubbed.\n",
|
|
643
|
+
"INFO: Scrubbing IP addresses using regex...\n",
|
|
644
|
+
"INFO: 0 ip addresses scrubbed.\n",
|
|
645
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
646
|
+
"INFO: 0 uk phone numbers scrubbed.\n",
|
|
647
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
648
|
+
"INFO: 4 uk postcodes scrubbed.\n",
|
|
649
|
+
"INFO: Scrubbing titles using regex...\n",
|
|
650
|
+
"INFO: 0 titles scrubbed.\n",
|
|
651
|
+
"100%|██████████| 5/5 [00:09<00:00, 1.91s/it]\n"
|
|
652
|
+
]
|
|
653
|
+
},
|
|
654
|
+
{
|
|
655
|
+
"data": {
|
|
656
|
+
"text/html": [
|
|
657
|
+
"<div>\n",
|
|
658
|
+
"<style scoped>\n",
|
|
659
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
660
|
+
" vertical-align: middle;\n",
|
|
661
|
+
" }\n",
|
|
662
|
+
"\n",
|
|
663
|
+
" .dataframe tbody tr th {\n",
|
|
664
|
+
" vertical-align: top;\n",
|
|
665
|
+
" }\n",
|
|
666
|
+
"\n",
|
|
667
|
+
" .dataframe thead th {\n",
|
|
668
|
+
" text-align: right;\n",
|
|
669
|
+
" }\n",
|
|
670
|
+
"</style>\n",
|
|
671
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
672
|
+
" <thead>\n",
|
|
673
|
+
" <tr style=\"text-align: right;\">\n",
|
|
674
|
+
" <th></th>\n",
|
|
675
|
+
" <th>ID</th>\n",
|
|
676
|
+
" <th>Pride and Prejudice</th>\n",
|
|
677
|
+
" <th>The Adventures of Sherlock Holmes</th>\n",
|
|
678
|
+
" <th>Frankenstein</th>\n",
|
|
679
|
+
" <th>Fake book</th>\n",
|
|
680
|
+
" </tr>\n",
|
|
681
|
+
" </thead>\n",
|
|
682
|
+
" <tbody>\n",
|
|
683
|
+
" <tr>\n",
|
|
684
|
+
" <th>0</th>\n",
|
|
685
|
+
" <td>A</td>\n",
|
|
686
|
+
" <td>[TITLE]. [PERSON] walked off; and [PERSON] rem...</td>\n",
|
|
687
|
+
" <td>To [PERSON] she is always the woman.</td>\n",
|
|
688
|
+
" <td>My dear [PERSON], do not waste your time upon ...</td>\n",
|
|
689
|
+
" <td>The letter to [EMAIL_ADDRESS] was stamped with...</td>\n",
|
|
690
|
+
" </tr>\n",
|
|
691
|
+
" <tr>\n",
|
|
692
|
+
" <th>1</th>\n",
|
|
693
|
+
" <td>B</td>\n",
|
|
694
|
+
" <td>[TITLE]. [PERSON] was so odd a mixture of quic...</td>\n",
|
|
695
|
+
" <td>You see, but you do not observe.</td>\n",
|
|
696
|
+
" <td>Learn from me, if not by my precepts, at least...</td>\n",
|
|
697
|
+
" <td>She forwarded the memo from [PERSON] and [PERS...</td>\n",
|
|
698
|
+
" </tr>\n",
|
|
699
|
+
" <tr>\n",
|
|
700
|
+
" <th>2</th>\n",
|
|
701
|
+
" <td>C</td>\n",
|
|
702
|
+
" <td>[PERSON]'s spirits were so high that they coul...</td>\n",
|
|
703
|
+
" <td>The world is full of obvious things which nobo...</td>\n",
|
|
704
|
+
" <td>I had worked hard for nearly two years, for th...</td>\n",
|
|
705
|
+
" <td>The dossier marked confidential came from [EMA...</td>\n",
|
|
706
|
+
" </tr>\n",
|
|
707
|
+
" <tr>\n",
|
|
708
|
+
" <th>3</th>\n",
|
|
709
|
+
" <td>D</td>\n",
|
|
710
|
+
" <td>The business of her life was to get her daught...</td>\n",
|
|
711
|
+
" <td>I am a brain, [PERSON]. The rest of me is a me...</td>\n",
|
|
712
|
+
" <td>Nothing is more painful to the human mind than...</td>\n",
|
|
713
|
+
" <td>A message arrived just as the Downing Street c...</td>\n",
|
|
714
|
+
" </tr>\n",
|
|
715
|
+
" <tr>\n",
|
|
716
|
+
" <th>4</th>\n",
|
|
717
|
+
" <td>E</td>\n",
|
|
718
|
+
" <td>She is tolerable; but not handsome enough to t...</td>\n",
|
|
719
|
+
" <td>When you have eliminated the impossible, whate...</td>\n",
|
|
720
|
+
" <td>Beware; for I am fearless, and therefore power...</td>\n",
|
|
721
|
+
" <td>They did not expected a reply from [EMAIL_ADDR...</td>\n",
|
|
722
|
+
" </tr>\n",
|
|
723
|
+
" </tbody>\n",
|
|
724
|
+
"</table>\n",
|
|
725
|
+
"</div>"
|
|
726
|
+
],
|
|
727
|
+
"text/plain": [
|
|
728
|
+
" ID Pride and Prejudice \\\n",
|
|
729
|
+
"0 A [TITLE]. [PERSON] walked off; and [PERSON] rem... \n",
|
|
730
|
+
"1 B [TITLE]. [PERSON] was so odd a mixture of quic... \n",
|
|
731
|
+
"2 C [PERSON]'s spirits were so high that they coul... \n",
|
|
732
|
+
"3 D The business of her life was to get her daught... \n",
|
|
733
|
+
"4 E She is tolerable; but not handsome enough to t... \n",
|
|
734
|
+
"\n",
|
|
735
|
+
" The Adventures of Sherlock Holmes \\\n",
|
|
736
|
+
"0 To [PERSON] she is always the woman. \n",
|
|
737
|
+
"1 You see, but you do not observe. \n",
|
|
738
|
+
"2 The world is full of obvious things which nobo... \n",
|
|
739
|
+
"3 I am a brain, [PERSON]. The rest of me is a me... \n",
|
|
740
|
+
"4 When you have eliminated the impossible, whate... \n",
|
|
741
|
+
"\n",
|
|
742
|
+
" Frankenstein \\\n",
|
|
743
|
+
"0 My dear [PERSON], do not waste your time upon ... \n",
|
|
744
|
+
"1 Learn from me, if not by my precepts, at least... \n",
|
|
745
|
+
"2 I had worked hard for nearly two years, for th... \n",
|
|
746
|
+
"3 Nothing is more painful to the human mind than... \n",
|
|
747
|
+
"4 Beware; for I am fearless, and therefore power... \n",
|
|
748
|
+
"\n",
|
|
749
|
+
" Fake book \n",
|
|
750
|
+
"0 The letter to [EMAIL_ADDRESS] was stamped with... \n",
|
|
751
|
+
"1 She forwarded the memo from [PERSON] and [PERS... \n",
|
|
752
|
+
"2 The dossier marked confidential came from [EMA... \n",
|
|
753
|
+
"3 A message arrived just as the Downing Street c... \n",
|
|
754
|
+
"4 They did not expected a reply from [EMAIL_ADDR... "
|
|
755
|
+
]
|
|
756
|
+
},
|
|
757
|
+
"execution_count": 8,
|
|
758
|
+
"metadata": {},
|
|
759
|
+
"output_type": "execute_result"
|
|
760
|
+
}
|
|
761
|
+
],
|
|
762
|
+
"source": [
|
|
763
|
+
"from idscrub import IDScrub\n",
|
|
764
|
+
"\n",
|
|
765
|
+
"scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", scrub_methods=[\"all\"])\n",
|
|
766
|
+
"\n",
|
|
767
|
+
"scrubbed_df"
|
|
768
|
+
]
|
|
769
|
+
},
|
|
770
|
+
{
|
|
771
|
+
"cell_type": "code",
|
|
772
|
+
"execution_count": 9,
|
|
773
|
+
"metadata": {},
|
|
774
|
+
"outputs": [
|
|
775
|
+
{
|
|
776
|
+
"data": {
|
|
777
|
+
"text/html": [
|
|
778
|
+
"<div>\n",
|
|
779
|
+
"<style scoped>\n",
|
|
780
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
781
|
+
" vertical-align: middle;\n",
|
|
782
|
+
" }\n",
|
|
783
|
+
"\n",
|
|
784
|
+
" .dataframe tbody tr th {\n",
|
|
785
|
+
" vertical-align: top;\n",
|
|
786
|
+
" }\n",
|
|
787
|
+
"\n",
|
|
788
|
+
" .dataframe thead th {\n",
|
|
789
|
+
" text-align: right;\n",
|
|
790
|
+
" }\n",
|
|
791
|
+
"</style>\n",
|
|
792
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
793
|
+
" <thead>\n",
|
|
794
|
+
" <tr style=\"text-align: right;\">\n",
|
|
795
|
+
" <th></th>\n",
|
|
796
|
+
" <th>ID</th>\n",
|
|
797
|
+
" <th>column</th>\n",
|
|
798
|
+
" <th>scrubbed_presidio_person</th>\n",
|
|
799
|
+
" <th>scrubbed_titles</th>\n",
|
|
800
|
+
" <th>scrubbed_presidio_email_address</th>\n",
|
|
801
|
+
" <th>scrubbed_presidio_iban_code</th>\n",
|
|
802
|
+
" <th>scrubbed_presidio_url</th>\n",
|
|
803
|
+
" <th>scrubbed_uk_postcodes</th>\n",
|
|
804
|
+
" </tr>\n",
|
|
805
|
+
" </thead>\n",
|
|
806
|
+
" <tbody>\n",
|
|
807
|
+
" <tr>\n",
|
|
808
|
+
" <th>0</th>\n",
|
|
809
|
+
" <td>A</td>\n",
|
|
810
|
+
" <td>Pride and Prejudice</td>\n",
|
|
811
|
+
" <td>[Darcy, Elizabeth]</td>\n",
|
|
812
|
+
" <td>[Mr]</td>\n",
|
|
813
|
+
" <td>None</td>\n",
|
|
814
|
+
" <td>None</td>\n",
|
|
815
|
+
" <td>None</td>\n",
|
|
816
|
+
" <td>None</td>\n",
|
|
817
|
+
" </tr>\n",
|
|
818
|
+
" <tr>\n",
|
|
819
|
+
" <th>1</th>\n",
|
|
820
|
+
" <td>B</td>\n",
|
|
821
|
+
" <td>Pride and Prejudice</td>\n",
|
|
822
|
+
" <td>[Bennet]</td>\n",
|
|
823
|
+
" <td>[Mr]</td>\n",
|
|
824
|
+
" <td>None</td>\n",
|
|
825
|
+
" <td>None</td>\n",
|
|
826
|
+
" <td>None</td>\n",
|
|
827
|
+
" <td>None</td>\n",
|
|
828
|
+
" </tr>\n",
|
|
829
|
+
" <tr>\n",
|
|
830
|
+
" <th>2</th>\n",
|
|
831
|
+
" <td>C</td>\n",
|
|
832
|
+
" <td>Pride and Prejudice</td>\n",
|
|
833
|
+
" <td>[Elizabeth]</td>\n",
|
|
834
|
+
" <td>None</td>\n",
|
|
835
|
+
" <td>None</td>\n",
|
|
836
|
+
" <td>None</td>\n",
|
|
837
|
+
" <td>None</td>\n",
|
|
838
|
+
" <td>None</td>\n",
|
|
839
|
+
" </tr>\n",
|
|
840
|
+
" <tr>\n",
|
|
841
|
+
" <th>3</th>\n",
|
|
842
|
+
" <td>A</td>\n",
|
|
843
|
+
" <td>The Adventures of Sherlock Holmes</td>\n",
|
|
844
|
+
" <td>[Sherlock Holmes]</td>\n",
|
|
845
|
+
" <td>None</td>\n",
|
|
846
|
+
" <td>None</td>\n",
|
|
847
|
+
" <td>None</td>\n",
|
|
848
|
+
" <td>None</td>\n",
|
|
849
|
+
" <td>None</td>\n",
|
|
850
|
+
" </tr>\n",
|
|
851
|
+
" <tr>\n",
|
|
852
|
+
" <th>4</th>\n",
|
|
853
|
+
" <td>D</td>\n",
|
|
854
|
+
" <td>The Adventures of Sherlock Holmes</td>\n",
|
|
855
|
+
" <td>[Watson]</td>\n",
|
|
856
|
+
" <td>None</td>\n",
|
|
857
|
+
" <td>None</td>\n",
|
|
858
|
+
" <td>None</td>\n",
|
|
859
|
+
" <td>None</td>\n",
|
|
860
|
+
" <td>None</td>\n",
|
|
861
|
+
" </tr>\n",
|
|
862
|
+
" <tr>\n",
|
|
863
|
+
" <th>5</th>\n",
|
|
864
|
+
" <td>A</td>\n",
|
|
865
|
+
" <td>Frankenstein</td>\n",
|
|
866
|
+
" <td>[Victor]</td>\n",
|
|
867
|
+
" <td>None</td>\n",
|
|
868
|
+
" <td>None</td>\n",
|
|
869
|
+
" <td>None</td>\n",
|
|
870
|
+
" <td>None</td>\n",
|
|
871
|
+
" <td>None</td>\n",
|
|
872
|
+
" </tr>\n",
|
|
873
|
+
" <tr>\n",
|
|
874
|
+
" <th>6</th>\n",
|
|
875
|
+
" <td>A</td>\n",
|
|
876
|
+
" <td>Fake book</td>\n",
|
|
877
|
+
" <td>None</td>\n",
|
|
878
|
+
" <td>None</td>\n",
|
|
879
|
+
" <td>[freddie.mercury@queen.com]</td>\n",
|
|
880
|
+
" <td>[GB91BKEN10000041610008]</td>\n",
|
|
881
|
+
" <td>[freddie.me, queen.com]</td>\n",
|
|
882
|
+
" <td>[SW1A 2AA]</td>\n",
|
|
883
|
+
" </tr>\n",
|
|
884
|
+
" <tr>\n",
|
|
885
|
+
" <th>7</th>\n",
|
|
886
|
+
" <td>B</td>\n",
|
|
887
|
+
" <td>Fake book</td>\n",
|
|
888
|
+
" <td>[Mick Jagger, David Bowie]</td>\n",
|
|
889
|
+
" <td>None</td>\n",
|
|
890
|
+
" <td>None</td>\n",
|
|
891
|
+
" <td>None</td>\n",
|
|
892
|
+
" <td>None</td>\n",
|
|
893
|
+
" <td>[SW1A 2WH]</td>\n",
|
|
894
|
+
" </tr>\n",
|
|
895
|
+
" <tr>\n",
|
|
896
|
+
" <th>8</th>\n",
|
|
897
|
+
" <td>C</td>\n",
|
|
898
|
+
" <td>Fake book</td>\n",
|
|
899
|
+
" <td>None</td>\n",
|
|
900
|
+
" <td>None</td>\n",
|
|
901
|
+
" <td>[serena.williams@tennis.com]</td>\n",
|
|
902
|
+
" <td>None</td>\n",
|
|
903
|
+
" <td>[tennis.com]</td>\n",
|
|
904
|
+
" <td>[SW19 5AE]</td>\n",
|
|
905
|
+
" </tr>\n",
|
|
906
|
+
" <tr>\n",
|
|
907
|
+
" <th>9</th>\n",
|
|
908
|
+
" <td>E</td>\n",
|
|
909
|
+
" <td>Fake book</td>\n",
|
|
910
|
+
" <td>None</td>\n",
|
|
911
|
+
" <td>None</td>\n",
|
|
912
|
+
" <td>[otis.redding@dockofthebay.org]</td>\n",
|
|
913
|
+
" <td>None</td>\n",
|
|
914
|
+
" <td>[otis.red, dockofthebay.org]</td>\n",
|
|
915
|
+
" <td>[EH8 8DX]</td>\n",
|
|
916
|
+
" </tr>\n",
|
|
917
|
+
" </tbody>\n",
|
|
918
|
+
"</table>\n",
|
|
919
|
+
"</div>"
|
|
920
|
+
],
|
|
921
|
+
"text/plain": [
|
|
922
|
+
" ID column scrubbed_presidio_person \\\n",
|
|
923
|
+
"0 A Pride and Prejudice [Darcy, Elizabeth] \n",
|
|
924
|
+
"1 B Pride and Prejudice [Bennet] \n",
|
|
925
|
+
"2 C Pride and Prejudice [Elizabeth] \n",
|
|
926
|
+
"3 A The Adventures of Sherlock Holmes [Sherlock Holmes] \n",
|
|
927
|
+
"4 D The Adventures of Sherlock Holmes [Watson] \n",
|
|
928
|
+
"5 A Frankenstein [Victor] \n",
|
|
929
|
+
"6 A Fake book None \n",
|
|
930
|
+
"7 B Fake book [Mick Jagger, David Bowie] \n",
|
|
931
|
+
"8 C Fake book None \n",
|
|
932
|
+
"9 E Fake book None \n",
|
|
933
|
+
"\n",
|
|
934
|
+
" scrubbed_titles scrubbed_presidio_email_address \\\n",
|
|
935
|
+
"0 [Mr] None \n",
|
|
936
|
+
"1 [Mr] None \n",
|
|
937
|
+
"2 None None \n",
|
|
938
|
+
"3 None None \n",
|
|
939
|
+
"4 None None \n",
|
|
940
|
+
"5 None None \n",
|
|
941
|
+
"6 None [freddie.mercury@queen.com] \n",
|
|
942
|
+
"7 None None \n",
|
|
943
|
+
"8 None [serena.williams@tennis.com] \n",
|
|
944
|
+
"9 None [otis.redding@dockofthebay.org] \n",
|
|
945
|
+
"\n",
|
|
946
|
+
" scrubbed_presidio_iban_code scrubbed_presidio_url \\\n",
|
|
947
|
+
"0 None None \n",
|
|
948
|
+
"1 None None \n",
|
|
949
|
+
"2 None None \n",
|
|
950
|
+
"3 None None \n",
|
|
951
|
+
"4 None None \n",
|
|
952
|
+
"5 None None \n",
|
|
953
|
+
"6 [GB91BKEN10000041610008] [freddie.me, queen.com] \n",
|
|
954
|
+
"7 None None \n",
|
|
955
|
+
"8 None [tennis.com] \n",
|
|
956
|
+
"9 None [otis.red, dockofthebay.org] \n",
|
|
957
|
+
"\n",
|
|
958
|
+
" scrubbed_uk_postcodes \n",
|
|
959
|
+
"0 None \n",
|
|
960
|
+
"1 None \n",
|
|
961
|
+
"2 None \n",
|
|
962
|
+
"3 None \n",
|
|
963
|
+
"4 None \n",
|
|
964
|
+
"5 None \n",
|
|
965
|
+
"6 [SW1A 2AA] \n",
|
|
966
|
+
"7 [SW1A 2WH] \n",
|
|
967
|
+
"8 [SW19 5AE] \n",
|
|
968
|
+
"9 [EH8 8DX] "
|
|
969
|
+
]
|
|
970
|
+
},
|
|
971
|
+
"execution_count": 9,
|
|
972
|
+
"metadata": {},
|
|
973
|
+
"output_type": "execute_result"
|
|
974
|
+
}
|
|
975
|
+
],
|
|
976
|
+
"source": [
|
|
977
|
+
"scrubbed_data"
|
|
978
|
+
]
|
|
979
|
+
}
|
|
980
|
+
],
|
|
981
|
+
"metadata": {
|
|
982
|
+
"kernelspec": {
|
|
983
|
+
"display_name": "idscrub",
|
|
984
|
+
"language": "python",
|
|
985
|
+
"name": "python3"
|
|
986
|
+
},
|
|
987
|
+
"language_info": {
|
|
988
|
+
"codemirror_mode": {
|
|
989
|
+
"name": "ipython",
|
|
990
|
+
"version": 3
|
|
991
|
+
},
|
|
992
|
+
"file_extension": ".py",
|
|
993
|
+
"mimetype": "text/x-python",
|
|
994
|
+
"name": "python",
|
|
995
|
+
"nbconvert_exporter": "python",
|
|
996
|
+
"pygments_lexer": "ipython3",
|
|
997
|
+
"version": "3.12.12"
|
|
998
|
+
}
|
|
999
|
+
},
|
|
1000
|
+
"nbformat": 4,
|
|
1001
|
+
"nbformat_minor": 4
|
|
1002
|
+
}
|