idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,13 @@
7
7
  "### `idscrub` basic usage examples"
8
8
  ]
9
9
  },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "With a default pipeline:"
15
+ ]
16
+ },
10
17
  {
11
18
  "cell_type": "code",
12
19
  "execution_count": 1,
@@ -17,23 +24,24 @@
17
24
  "output_type": "stream",
18
25
  "text": [
19
26
  "INFO: Texts loaded.\n",
20
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
21
- "100%|██████████| 2/2 [00:00<00:00, 33.83it/s]\n",
22
- "INFO: 1 org scrubbed.\n",
23
- "INFO: 3 person scrubbed.\n",
24
- "INFO: Scrubbing phone numbers using regex...\n",
25
- "INFO: 1 uk_phone_number scrubbed.\n",
26
- "INFO: Scrubbing addresses using regex...\n",
27
- "INFO: 1 uk_address scrubbed.\n",
28
- "INFO: Scrubbing postcodes using regex...\n",
29
- "INFO: 1 uk_postcode scrubbed.\n"
27
+ "INFO: Scrubbing using presidio_entities with default parameters...\n",
28
+ "INFO: Scrubbing using spacy_entities with default parameters...\n",
29
+ "INFO: Scrubbing using email_addresses with default parameters...\n",
30
+ "INFO: Scrubbing using handles with default parameters...\n",
31
+ "INFO: Scrubbing using ip_addresses with default parameters...\n",
32
+ "INFO: Scrubbing using uk_addresses with default parameters...\n",
33
+ "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
34
+ "INFO: Scrubbing using google_phone_numbers with default parameters...\n",
35
+ "INFO: Scrubbing using uk_postcodes with default parameters...\n",
36
+ "INFO: Scrubbing using urls with default parameters...\n",
37
+ "INFO: Scrubbing using titles with default parameters...\n"
30
38
  ]
31
39
  },
32
40
  {
33
41
  "name": "stdout",
34
42
  "output_type": "stream",
35
43
  "text": [
36
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
44
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
37
45
  ]
38
46
  }
39
47
  ],
@@ -47,15 +55,82 @@
47
55
  " ]\n",
48
56
  ")\n",
49
57
  "\n",
50
- "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_entities\", \"uk_phone_numbers\", \"uk_addresses\", \"uk_postcodes\"])\n",
58
+ "scrubbed_texts = scrub.scrub()\n",
51
59
  "\n",
52
60
  "print(scrubbed_texts)"
53
61
  ]
54
62
  },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {},
66
+ "source": [
67
+ "With a custom pipeline:"
68
+ ]
69
+ },
55
70
  {
56
71
  "cell_type": "code",
57
72
  "execution_count": 2,
58
73
  "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stderr",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "INFO: Texts loaded.\n",
80
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON']}...\n",
81
+ "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
82
+ "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
83
+ "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
84
+ "INFO: Scrubbing using email_addresses with default parameters...\n",
85
+ "INFO: Scrubbing using handles with default parameters...\n",
86
+ "INFO: Scrubbing using ip_addresses with default parameters...\n",
87
+ "INFO: Scrubbing using uk_addresses with default parameters...\n",
88
+ "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
89
+ "INFO: Scrubbing using uk_postcodes with default parameters...\n",
90
+ "INFO: Scrubbing using urls with default parameters...\n"
91
+ ]
92
+ },
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
98
+ ]
99
+ }
100
+ ],
101
+ "source": [
102
+ "from idscrub import IDScrub\n",
103
+ "\n",
104
+ "scrub = IDScrub(\n",
105
+ " [\n",
106
+ " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
107
+ " \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
108
+ " ]\n",
109
+ ")\n",
110
+ "\n",
111
+ "pipeline = [\n",
112
+ " {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\"]},\n",
113
+ " {\"method\": \"spacy_entities\", \"entity_types\": [\"ORG\"]},\n",
114
+ " {\"method\": \"google_phone_numbers\", \"region\": \"GB\"},\n",
115
+ " {\"method\": \"titles\", \"strict\": False},\n",
116
+ " {\"method\": \"email_addresses\"},\n",
117
+ " {\"method\": \"handles\"},\n",
118
+ " {\"method\": \"ip_addresses\"},\n",
119
+ " {\"method\": \"uk_addresses\"},\n",
120
+ " {\"method\": \"uk_phone_numbers\"},\n",
121
+ " {\"method\": \"uk_postcodes\"},\n",
122
+ " {\"method\": \"urls\"},\n",
123
+ "]\n",
124
+ "\n",
125
+ "scrubbed_texts = scrub.scrub(pipeline=pipeline)\n",
126
+ "\n",
127
+ "print(scrubbed_texts)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 3,
133
+ "metadata": {},
59
134
  "outputs": [
60
135
  {
61
136
  "data": {
@@ -123,7 +198,7 @@
123
198
  "1 [AA11 1AA] "
124
199
  ]
125
200
  },
126
- "execution_count": 2,
201
+ "execution_count": 3,
127
202
  "metadata": {},
128
203
  "output_type": "execute_result"
129
204
  }
@@ -136,12 +211,19 @@
136
211
  "cell_type": "markdown",
137
212
  "metadata": {},
138
213
  "source": [
139
- "Or scrub `all`:"
214
+ "### `idscrub` example - priority scoring"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "markdown",
219
+ "metadata": {},
220
+ "source": [
221
+ "If multiple different types of personal data have been identified in the same string, such as a handle, an email address and a URL, you can assign one a higher priority score to ensure it is the one that is scrubbed:"
140
222
  ]
141
223
  },
142
224
  {
143
225
  "cell_type": "code",
144
- "execution_count": 3,
226
+ "execution_count": 4,
145
227
  "metadata": {},
146
228
  "outputs": [
147
229
  {
@@ -149,57 +231,52 @@
149
231
  "output_type": "stream",
150
232
  "text": [
151
233
  "INFO: Texts loaded.\n",
152
- "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
153
- "100%|██████████| 2/2 [00:00<00:00, 9.14it/s]\n",
154
- "INFO: 3 person scrubbed.\n",
155
- "INFO: 1 location scrubbed.\n",
156
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
157
- "100%|██████████| 2/2 [00:00<00:00, 42.62it/s]\n",
158
- "INFO: 1 org scrubbed.\n",
159
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
160
- "INFO: 0 phone_number scrubbed.\n",
161
- "INFO: Scrubbing email addresses using regex...\n",
162
- "INFO: 0 email_address scrubbed.\n",
163
- "INFO: Scrubbing @user handles using regex...\n",
164
- "INFO: 0 handle scrubbed.\n",
165
- "INFO: Scrubbing IP addresses using regex...\n",
166
- "INFO: 0 ip_address scrubbed.\n",
167
- "INFO: Scrubbing phone numbers using regex...\n",
168
- "INFO: 1 uk_phone_number scrubbed.\n",
169
- "INFO: Scrubbing addresses using regex...\n",
170
- "INFO: 1 uk_address scrubbed.\n",
171
- "INFO: Scrubbing postcodes using regex...\n",
172
- "INFO: 1 uk_postcode scrubbed.\n",
173
- "INFO: Scrubbing titles using regex...\n",
174
- "INFO: 0 title scrubbed.\n"
234
+ "INFO: Scrubbing using handles with parameters {'priority': 0.1}...\n",
235
+ "INFO: Scrubbing using urls with parameters {'priority': 0.1}...\n",
236
+ "INFO: Scrubbing using email_addresses with parameters {'priority': 0.2}...\n"
175
237
  ]
176
238
  },
177
239
  {
178
240
  "name": "stdout",
179
241
  "output_type": "stream",
180
242
  "text": [
181
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
243
+ "\n",
244
+ "All personal data identified: [('handle', '@person.com'), ('url', 'www.person@person.com'), ('email_address', 'www.person@person.com')]\n",
245
+ "\n",
246
+ "Personal data removed after priority scoring: [('email_address', 'www.person@person.com')]\n",
247
+ "\n",
248
+ "['My email is [EMAIL_ADDRESS]']\n"
182
249
  ]
183
250
  }
184
251
  ],
185
252
  "source": [
186
253
  "from idscrub import IDScrub\n",
187
254
  "\n",
188
- "scrub = IDScrub(\n",
189
- " [\n",
190
- " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
191
- " \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
255
+ "scrub = IDScrub(texts=[\"My email is www.person@person.com\"])\n",
256
+ "\n",
257
+ "scrubbed_texts = scrub.scrub(\n",
258
+ " pipeline=[\n",
259
+ " {\"method\": \"handles\", \"priority\": 0.1},\n",
260
+ " {\"method\": \"urls\", \"priority\": 0.1},\n",
261
+ " {\"method\": \"email_addresses\", \"priority\": 0.2},\n",
192
262
  " ]\n",
193
263
  ")\n",
194
264
  "\n",
195
- "scrubbed_texts = scrub.scrub(scrub_methods=[\"all\"])\n",
196
- "\n",
265
+ "print(f\"\\nAll personal data identified: {[(ident.label, ident.text) for ident in scrub.idents_all]}\\n\")\n",
266
+ "print(f\"Personal data removed after priority scoring: {[(ident.label, ident.text) for ident in scrub.idents]}\\n\")\n",
197
267
  "print(scrubbed_texts)"
198
268
  ]
199
269
  },
270
+ {
271
+ "cell_type": "markdown",
272
+ "metadata": {},
273
+ "source": [
274
+ "To view all of the identified data:"
275
+ ]
276
+ },
200
277
  {
201
278
  "cell_type": "code",
202
- "execution_count": 4,
279
+ "execution_count": 5,
203
280
  "metadata": {},
204
281
  "outputs": [
205
282
  {
@@ -224,72 +301,86 @@
224
301
  " <tr style=\"text-align: right;\">\n",
225
302
  " <th></th>\n",
226
303
  " <th>text_id</th>\n",
227
- " <th>person</th>\n",
228
- " <th>location</th>\n",
229
- " <th>org</th>\n",
230
- " <th>uk_phone_number</th>\n",
231
- " <th>uk_address</th>\n",
232
- " <th>uk_postcode</th>\n",
304
+ " <th>text</th>\n",
305
+ " <th>start</th>\n",
306
+ " <th>end</th>\n",
307
+ " <th>label</th>\n",
308
+ " <th>replacement</th>\n",
309
+ " <th>priority</th>\n",
310
+ " <th>source</th>\n",
233
311
  " </tr>\n",
234
312
  " </thead>\n",
235
313
  " <tbody>\n",
236
314
  " <tr>\n",
237
315
  " <th>0</th>\n",
238
316
  " <td>1</td>\n",
239
- " <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
240
- " <td>None</td>\n",
241
- " <td>None</td>\n",
242
- " <td>None</td>\n",
243
- " <td>None</td>\n",
244
- " <td>None</td>\n",
317
+ " <td>@person.com</td>\n",
318
+ " <td>22</td>\n",
319
+ " <td>33</td>\n",
320
+ " <td>handle</td>\n",
321
+ " <td>[HANDLE]</td>\n",
322
+ " <td>0.1</td>\n",
323
+ " <td>regex</td>\n",
245
324
  " </tr>\n",
246
325
  " <tr>\n",
247
326
  " <th>1</th>\n",
248
- " <td>2</td>\n",
249
- " <td>None</td>\n",
250
- " <td>[Lapland]</td>\n",
251
- " <td>[Department for Business and Trade]</td>\n",
252
- " <td>[+441111111111]</td>\n",
253
- " <td>[15 Elf Road]</td>\n",
254
- " <td>[AA11 1AA]</td>\n",
327
+ " <td>1</td>\n",
328
+ " <td>www.person@person.com</td>\n",
329
+ " <td>12</td>\n",
330
+ " <td>33</td>\n",
331
+ " <td>url</td>\n",
332
+ " <td>[URL]</td>\n",
333
+ " <td>0.1</td>\n",
334
+ " <td>regex</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>2</th>\n",
338
+ " <td>1</td>\n",
339
+ " <td>www.person@person.com</td>\n",
340
+ " <td>12</td>\n",
341
+ " <td>33</td>\n",
342
+ " <td>email_address</td>\n",
343
+ " <td>[EMAIL_ADDRESS]</td>\n",
344
+ " <td>0.2</td>\n",
345
+ " <td>regex</td>\n",
255
346
  " </tr>\n",
256
347
  " </tbody>\n",
257
348
  "</table>\n",
258
349
  "</div>"
259
350
  ],
260
351
  "text/plain": [
261
- " text_id person location \\\n",
262
- "0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
263
- "1 2 None [Lapland] \n",
264
- "\n",
265
- " org uk_phone_number uk_address \\\n",
266
- "0 None None None \n",
267
- "1 [Department for Business and Trade] [+441111111111] [15 Elf Road] \n",
352
+ " text_id text start end label replacement \\\n",
353
+ "0 1 @person.com 22 33 handle [HANDLE] \n",
354
+ "1 1 www.person@person.com 12 33 url [URL] \n",
355
+ "2 1 www.person@person.com 12 33 email_address [EMAIL_ADDRESS] \n",
268
356
  "\n",
269
- " uk_postcode \n",
270
- "0 None \n",
271
- "1 [AA11 1AA] "
357
+ " priority source \n",
358
+ "0 0.1 regex \n",
359
+ "1 0.1 regex \n",
360
+ "2 0.2 regex "
272
361
  ]
273
362
  },
274
- "execution_count": 4,
363
+ "execution_count": 5,
275
364
  "metadata": {},
276
365
  "output_type": "execute_result"
277
366
  }
278
367
  ],
279
368
  "source": [
280
- "scrub.get_scrubbed_data()"
369
+ "scrub.get_all_identified_data()"
281
370
  ]
282
371
  },
283
372
  {
284
373
  "cell_type": "markdown",
285
374
  "metadata": {},
286
375
  "source": [
287
- "### `idscrub` example - chaining methods together"
376
+ "Note that methods which identify multiple entity types, like `spacy_entities` and `presidio_entities`, will have the same priority score applied to each entity type.\n",
377
+ "\n",
378
+ "To assign priority scores based on entity types, you can chain methods together. For example, if you wanted to prioritise email addresses over names when using `presidio_entities`:"
288
379
  ]
289
380
  },
290
381
  {
291
382
  "cell_type": "code",
292
- "execution_count": 5,
383
+ "execution_count": 6,
293
384
  "metadata": {},
294
385
  "outputs": [
295
386
  {
@@ -297,151 +388,47 @@
297
388
  "output_type": "stream",
298
389
  "text": [
299
390
  "INFO: Texts loaded.\n",
300
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
301
- "100%|██████████| 2/2 [00:00<00:00, 42.58it/s]\n",
302
- "INFO: 1 org scrubbed.\n",
303
- "INFO: 3 person scrubbed.\n",
304
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
305
- "INFO: 0 phone_number scrubbed.\n",
306
- "INFO: Scrubbing custom regex...\n",
307
- "INFO: 1 custom_regex_1 scrubbed.\n",
308
- "INFO: 0 custom_regex_2 scrubbed.\n",
309
- "INFO: Scrubbing email addresses using regex...\n",
310
- "INFO: 0 email_address scrubbed.\n",
311
- "INFO: Scrubbing @user handles using regex...\n",
312
- "INFO: 0 handle scrubbed.\n",
313
- "INFO: Scrubbing IP addresses using regex...\n",
314
- "INFO: 0 ip_address scrubbed.\n",
315
- "INFO: Scrubbing phone numbers using regex...\n",
316
- "INFO: 1 uk_phone_number scrubbed.\n",
317
- "INFO: Scrubbing addresses using regex...\n",
318
- "INFO: 1 uk_address scrubbed.\n",
319
- "INFO: Scrubbing postcodes using regex...\n",
320
- "INFO: 1 uk_postcode scrubbed.\n",
321
- "INFO: Scrubbing titles using regex...\n",
322
- "INFO: 0 title scrubbed.\n"
391
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON'], 'priority': 0.1}...\n",
392
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['EMAIL_ADDRESS'], 'priority': 0.2}...\n"
323
393
  ]
324
394
  },
325
395
  {
326
396
  "name": "stdout",
327
397
  "output_type": "stream",
328
398
  "text": [
329
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [UNIVERSITY]']\n"
399
+ " text_id text start end label replacement \\\n",
400
+ "0 1 John Smith@mail.com 0 19 person [PERSON] \n",
401
+ "1 1 Smith@mail.com 5 19 email_address [EMAIL_ADDRESS] \n",
402
+ "\n",
403
+ " priority source \n",
404
+ "0 0.1 presidio \n",
405
+ "1 0.2 presidio \n",
406
+ "['John [EMAIL_ADDRESS]']\n"
330
407
  ]
331
408
  }
332
409
  ],
333
410
  "source": [
334
411
  "from idscrub import IDScrub\n",
335
412
  "\n",
336
- "scrub = IDScrub(\n",
337
- " [\n",
338
- " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
339
- " \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
340
- " ]\n",
341
- ")\n",
342
- "\n",
343
- "scrub.spacy_entities()\n",
344
- "scrub.google_phone_numbers(region=\"GB\")\n",
413
+ "scrub = IDScrub([\"John Smith@mail.com\"])\n",
345
414
  "\n",
346
- "# Remove specific regex pattern(s). This can also be passed to all().\n",
347
- "scrub.custom_regex(\n",
348
- " custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
415
+ "scrubbed_texts = scrub.scrub(\n",
416
+ " pipeline=[\n",
417
+ " {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\"], \"priority\": 0.1},\n",
418
+ " {\"method\": \"presidio_entities\", \"entity_types\": [\"EMAIL_ADDRESS\"], \"priority\": 0.2},\n",
419
+ " ]\n",
349
420
  ")\n",
350
421
  "\n",
351
- "scrubbed_texts = scrub.all_regex()\n",
422
+ "print(scrub.get_all_identified_data())\n",
352
423
  "\n",
353
424
  "print(scrubbed_texts)"
354
425
  ]
355
426
  },
356
- {
357
- "cell_type": "code",
358
- "execution_count": 6,
359
- "metadata": {},
360
- "outputs": [
361
- {
362
- "data": {
363
- "text/html": [
364
- "<div>\n",
365
- "<style scoped>\n",
366
- " .dataframe tbody tr th:only-of-type {\n",
367
- " vertical-align: middle;\n",
368
- " }\n",
369
- "\n",
370
- " .dataframe tbody tr th {\n",
371
- " vertical-align: top;\n",
372
- " }\n",
373
- "\n",
374
- " .dataframe thead th {\n",
375
- " text-align: right;\n",
376
- " }\n",
377
- "</style>\n",
378
- "<table border=\"1\" class=\"dataframe\">\n",
379
- " <thead>\n",
380
- " <tr style=\"text-align: right;\">\n",
381
- " <th></th>\n",
382
- " <th>text_id</th>\n",
383
- " <th>person</th>\n",
384
- " <th>org</th>\n",
385
- " <th>custom_regex_1</th>\n",
386
- " <th>uk_phone_number</th>\n",
387
- " <th>uk_address</th>\n",
388
- " <th>uk_postcode</th>\n",
389
- " </tr>\n",
390
- " </thead>\n",
391
- " <tbody>\n",
392
- " <tr>\n",
393
- " <th>0</th>\n",
394
- " <td>1</td>\n",
395
- " <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
396
- " <td>None</td>\n",
397
- " <td>None</td>\n",
398
- " <td>None</td>\n",
399
- " <td>None</td>\n",
400
- " <td>None</td>\n",
401
- " </tr>\n",
402
- " <tr>\n",
403
- " <th>1</th>\n",
404
- " <td>2</td>\n",
405
- " <td>None</td>\n",
406
- " <td>[Department for Business and Trade]</td>\n",
407
- " <td>[Lapland]</td>\n",
408
- " <td>[+441111111111]</td>\n",
409
- " <td>[15 Elf Road]</td>\n",
410
- " <td>[AA11 1AA]</td>\n",
411
- " </tr>\n",
412
- " </tbody>\n",
413
- "</table>\n",
414
- "</div>"
415
- ],
416
- "text/plain": [
417
- " text_id person \\\n",
418
- "0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
419
- "1 2 None \n",
420
- "\n",
421
- " org custom_regex_1 uk_phone_number \\\n",
422
- "0 None None None \n",
423
- "1 [Department for Business and Trade] [Lapland] [+441111111111] \n",
424
- "\n",
425
- " uk_address uk_postcode \n",
426
- "0 None None \n",
427
- "1 [15 Elf Road] [AA11 1AA] "
428
- ]
429
- },
430
- "execution_count": 6,
431
- "metadata": {},
432
- "output_type": "execute_result"
433
- }
434
- ],
435
- "source": [
436
- "scrub.get_scrubbed_data()"
437
- ]
438
- },
439
427
  {
440
428
  "cell_type": "markdown",
441
429
  "metadata": {},
442
430
  "source": [
443
- "### `idscrub` example - using Presidio\n",
444
- "We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use their entity recognition methods"
431
+ "### `idscrub` example - scrubbing custom regex patterns"
445
432
  ]
446
433
  },
447
434
  {
@@ -454,29 +441,41 @@
454
441
  "output_type": "stream",
455
442
  "text": [
456
443
  "INFO: Texts loaded.\n",
457
- "INFO: Scrubbing Presidio entities `PERSON, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, IBAN_CODE` using SpaCy model `en_core_web_trf`...\n",
458
- "100%|██████████| 2/2 [00:00<00:00, 24.36it/s]\n",
459
- "INFO: 1 iban_code scrubbed.\n",
460
- "INFO: 3 person scrubbed.\n"
444
+ "INFO: Scrubbing using custom_regex with parameters {'patterns': {'university': {'pattern': 'Lapland', 'replacement': '[UNIVERSITY]', 'priority': 1.0}}}...\n"
461
445
  ]
462
446
  },
463
447
  {
464
- "name": "stdout",
465
- "output_type": "stream",
466
- "text": [
467
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My IBAN code is [IBAN_CODE]']\n"
468
- ]
448
+ "data": {
449
+ "text/plain": [
450
+ "['Our names are Hamish McDonald, L. Salah, and Elena Suárez.',\n",
451
+ " 'My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, [UNIVERSITY]']"
452
+ ]
453
+ },
454
+ "execution_count": 7,
455
+ "metadata": {},
456
+ "output_type": "execute_result"
469
457
  }
470
458
  ],
471
459
  "source": [
472
460
  "from idscrub import IDScrub\n",
473
461
  "\n",
474
462
  "scrub = IDScrub(\n",
475
- " [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
463
+ " [\n",
464
+ " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
465
+ " \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
466
+ " ]\n",
476
467
  ")\n",
477
- "scrubbed_texts = scrub.presidio_entities()\n",
478
468
  "\n",
479
- "print(scrubbed_texts)"
469
+ "pipeline = [\n",
470
+ " {\n",
471
+ " \"method\": \"custom_regex\",\n",
472
+ " \"patterns\": {\"university\": {\"pattern\": r\"Lapland\", \"replacement\": \"[UNIVERSITY]\", \"priority\": 1.0}},\n",
473
+ " }\n",
474
+ "]\n",
475
+ "\n",
476
+ "scrubbed_texts = scrub.scrub(pipeline=pipeline)\n",
477
+ "\n",
478
+ "scrubbed_texts"
480
479
  ]
481
480
  },
482
481
  {
@@ -506,35 +505,22 @@
506
505
  " <tr style=\"text-align: right;\">\n",
507
506
  " <th></th>\n",
508
507
  " <th>text_id</th>\n",
509
- " <th>person</th>\n",
510
- " <th>iban_code</th>\n",
508
+ " <th>university</th>\n",
511
509
  " </tr>\n",
512
510
  " </thead>\n",
513
511
  " <tbody>\n",
514
512
  " <tr>\n",
515
513
  " <th>0</th>\n",
516
- " <td>1</td>\n",
517
- " <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
518
- " <td>None</td>\n",
519
- " </tr>\n",
520
- " <tr>\n",
521
- " <th>1</th>\n",
522
514
  " <td>2</td>\n",
523
- " <td>None</td>\n",
524
- " <td>[GB91BKEN10000041610008]</td>\n",
515
+ " <td>[Lapland]</td>\n",
525
516
  " </tr>\n",
526
517
  " </tbody>\n",
527
518
  "</table>\n",
528
519
  "</div>"
529
520
  ],
530
521
  "text/plain": [
531
- " text_id person \\\n",
532
- "0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
533
- "1 2 None \n",
534
- "\n",
535
- " iban_code \n",
536
- "0 None \n",
537
- "1 [GB91BKEN10000041610008] "
522
+ " text_id university\n",
523
+ "0 2 [Lapland]"
538
524
  ]
539
525
  },
540
526
  "execution_count": 8,
@@ -693,7 +679,7 @@
693
679
  " \"Beware; for I am fearless, and therefore powerful.\",\n",
694
680
  " ],\n",
695
681
  " \"Fake book\": [\n",
696
- " \"The letter to freddie.mercury@queen.com was stamped with SW1A 2AA. His IBAN was GB91BKEN10000041610008.\",\n",
682
+ " \"The letter to freddie.mercury@queen.com was stamped with SW1A 2AA. He was British.\",\n",
697
683
  " \"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.\",\n",
698
684
  " \"The dossier marked confidential came from serena.williams@tennis.com, with SW19 5AE etched in bold across the envelope.\",\n",
699
685
  " \"A message arrived just as the Downing Street clock struck midnight.\",\n",
@@ -716,78 +702,44 @@
716
702
  "text": [
717
703
  " 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
718
704
  "INFO: Scrubbing column `Pride and Prejudice`...\n",
719
- "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
720
- "100%|██████████| 5/5 [00:00<00:00, 23.73it/s]\n",
721
- "INFO: 4 person scrubbed.\n",
722
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
723
- "100%|██████████| 5/5 [00:00<00:00, 77.84it/s]\n",
724
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
725
- "INFO: 0 phone_number scrubbed.\n",
726
- "INFO: Scrubbing email addresses using regex...\n",
727
- "INFO: 0 email_address scrubbed.\n",
728
- "INFO: Scrubbing @user handles using regex...\n",
729
- "INFO: 0 handle scrubbed.\n",
730
- "INFO: Scrubbing IP addresses using regex...\n",
731
- "INFO: 0 ip_address scrubbed.\n",
732
- "INFO: Scrubbing phone numbers using regex...\n",
733
- "INFO: 0 uk_phone_number scrubbed.\n",
734
- "INFO: Scrubbing addresses using regex...\n",
735
- "INFO: 0 uk_address scrubbed.\n",
736
- "INFO: Scrubbing postcodes using regex...\n",
737
- "INFO: 0 uk_postcode scrubbed.\n",
738
- "INFO: Scrubbing titles using regex...\n",
739
- "INFO: 2 title scrubbed.\n",
740
- " 33%|███▎ | 1/3 [00:02<00:05, 2.60s/it]INFO: Texts loaded.\n",
705
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
706
+ "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
707
+ "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
708
+ "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
709
+ "INFO: Scrubbing using email_addresses with default parameters...\n",
710
+ "INFO: Scrubbing using handles with default parameters...\n",
711
+ "INFO: Scrubbing using ip_addresses with default parameters...\n",
712
+ "INFO: Scrubbing using uk_addresses with default parameters...\n",
713
+ "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
714
+ "INFO: Scrubbing using uk_postcodes with default parameters...\n",
715
+ "INFO: Scrubbing using urls with default parameters...\n",
716
+ " 33%|███▎ | 1/3 [00:02<00:04, 2.44s/it]INFO: Texts loaded.\n",
741
717
  "INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
742
- "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
743
- "100%|██████████| 5/5 [00:00<00:00, 24.22it/s]\n",
744
- "INFO: 2 person scrubbed.\n",
745
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
746
- "100%|██████████| 5/5 [00:00<00:00, 84.78it/s]\n",
747
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
748
- "INFO: 0 phone_number scrubbed.\n",
749
- "INFO: Scrubbing email addresses using regex...\n",
750
- "INFO: 0 email_address scrubbed.\n",
751
- "INFO: Scrubbing @user handles using regex...\n",
752
- "INFO: 0 handle scrubbed.\n",
753
- "INFO: Scrubbing IP addresses using regex...\n",
754
- "INFO: 0 ip_address scrubbed.\n",
755
- "INFO: Scrubbing phone numbers using regex...\n",
756
- "INFO: 0 uk_phone_number scrubbed.\n",
757
- "INFO: Scrubbing addresses using regex...\n",
758
- "INFO: 0 uk_address scrubbed.\n",
759
- "INFO: Scrubbing postcodes using regex...\n",
760
- "INFO: 0 uk_postcode scrubbed.\n",
761
- "INFO: Scrubbing titles using regex...\n",
762
- "INFO: 0 title scrubbed.\n",
763
- " 67%|██████▋ | 2/3 [00:05<00:02, 2.49s/it]INFO: Texts loaded.\n",
718
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
719
+ "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
720
+ "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
721
+ "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
722
+ "INFO: Scrubbing using email_addresses with default parameters...\n",
723
+ "INFO: Scrubbing using handles with default parameters...\n",
724
+ "INFO: Scrubbing using ip_addresses with default parameters...\n",
725
+ "INFO: Scrubbing using uk_addresses with default parameters...\n",
726
+ "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
727
+ "INFO: Scrubbing using uk_postcodes with default parameters...\n",
728
+ "INFO: Scrubbing using urls with default parameters...\n",
729
+ " 67%|██████▋ | 2/3 [00:04<00:02, 2.44s/it]INFO: Texts loaded.\n",
764
730
  "INFO: Scrubbing column `Fake book`...\n",
765
- "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
766
- "100%|██████████| 5/5 [00:00<00:00, 13.41it/s]\n",
767
- "INFO: 1 iban_code scrubbed.\n",
768
- "INFO: 5 url scrubbed.\n",
769
- "INFO: 2 person scrubbed.\n",
770
- "INFO: 3 email_address scrubbed.\n",
771
- "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
772
- "100%|██████████| 5/5 [00:00<00:00, 64.57it/s]\n",
773
- "INFO: 1 org scrubbed.\n",
774
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
775
- "INFO: 0 phone_number scrubbed.\n",
776
- "INFO: Scrubbing email addresses using regex...\n",
777
- "INFO: 3 email_address scrubbed.\n",
778
- "INFO: Scrubbing @user handles using regex...\n",
779
- "INFO: 0 handle scrubbed.\n",
780
- "INFO: Scrubbing IP addresses using regex...\n",
781
- "INFO: 0 ip_address scrubbed.\n",
782
- "INFO: Scrubbing phone numbers using regex...\n",
783
- "INFO: 0 uk_phone_number scrubbed.\n",
784
- "INFO: Scrubbing addresses using regex...\n",
785
- "INFO: 0 uk_address scrubbed.\n",
786
- "INFO: Scrubbing postcodes using regex...\n",
787
- "INFO: 4 uk_postcode scrubbed.\n",
788
- "INFO: Scrubbing titles using regex...\n",
789
- "INFO: 0 title scrubbed.\n",
790
- "100%|██████████| 3/3 [00:07<00:00, 2.53s/it]\n"
731
+ "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
732
+ "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
733
+ "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
734
+ "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
735
+ "INFO: Scrubbing using email_addresses with default parameters...\n",
736
+ "INFO: Scrubbing using handles with default parameters...\n",
737
+ "INFO: Scrubbing using ip_addresses with default parameters...\n",
738
+ "INFO: Scrubbing using uk_addresses with default parameters...\n",
739
+ "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
740
+ "INFO: Scrubbing using uk_postcodes with default parameters...\n",
741
+ "INFO: Scrubbing using urls with default parameters...\n",
742
+ "100%|██████████| 3/3 [00:07<00:00, 2.51s/it]\n"
791
743
  ]
792
744
  },
793
745
  {
@@ -901,7 +853,21 @@
901
853
  "source": [
902
854
  "from idscrub import IDScrub\n",
903
855
  "\n",
904
- "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], scrub_methods=[\"all\"])\n",
856
+ "pipeline = [\n",
857
+ " {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\", \"NRP\"]},\n",
858
+ " {\"method\": \"spacy_entities\", \"entity_types\": [\"ORG\"]},\n",
859
+ " {\"method\": \"google_phone_numbers\", \"region\": \"GB\"},\n",
860
+ " {\"method\": \"titles\", \"strict\": False},\n",
861
+ " {\"method\": \"email_addresses\"},\n",
862
+ " {\"method\": \"handles\"},\n",
863
+ " {\"method\": \"ip_addresses\"},\n",
864
+ " {\"method\": \"uk_addresses\"},\n",
865
+ " {\"method\": \"uk_phone_numbers\"},\n",
866
+ " {\"method\": \"uk_postcodes\"},\n",
867
+ " {\"method\": \"urls\"},\n",
868
+ "]\n",
869
+ "\n",
870
+ "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], pipeline=pipeline)\n",
905
871
  "\n",
906
872
  "scrubbed_df"
907
873
  ]
@@ -936,11 +902,10 @@
936
902
  " <th>column</th>\n",
937
903
  " <th>person</th>\n",
938
904
  " <th>title</th>\n",
905
+ " <th>nrp</th>\n",
939
906
  " <th>email_address</th>\n",
940
- " <th>iban_code</th>\n",
941
- " <th>url</th>\n",
942
- " <th>org</th>\n",
943
907
  " <th>uk_postcode</th>\n",
908
+ " <th>org</th>\n",
944
909
  " </tr>\n",
945
910
  " </thead>\n",
946
911
  " <tbody>\n",
@@ -954,7 +919,6 @@
954
919
  " <td>None</td>\n",
955
920
  " <td>None</td>\n",
956
921
  " <td>None</td>\n",
957
- " <td>None</td>\n",
958
922
  " </tr>\n",
959
923
  " <tr>\n",
960
924
  " <th>1</th>\n",
@@ -966,7 +930,6 @@
966
930
  " <td>None</td>\n",
967
931
  " <td>None</td>\n",
968
932
  " <td>None</td>\n",
969
- " <td>None</td>\n",
970
933
  " </tr>\n",
971
934
  " <tr>\n",
972
935
  " <th>2</th>\n",
@@ -978,7 +941,6 @@
978
941
  " <td>None</td>\n",
979
942
  " <td>None</td>\n",
980
943
  " <td>None</td>\n",
981
- " <td>None</td>\n",
982
944
  " </tr>\n",
983
945
  " <tr>\n",
984
946
  " <th>3</th>\n",
@@ -990,7 +952,6 @@
990
952
  " <td>None</td>\n",
991
953
  " <td>None</td>\n",
992
954
  " <td>None</td>\n",
993
- " <td>None</td>\n",
994
955
  " </tr>\n",
995
956
  " <tr>\n",
996
957
  " <th>4</th>\n",
@@ -1002,7 +963,6 @@
1002
963
  " <td>None</td>\n",
1003
964
  " <td>None</td>\n",
1004
965
  " <td>None</td>\n",
1005
- " <td>None</td>\n",
1006
966
  " </tr>\n",
1007
967
  " <tr>\n",
1008
968
  " <th>5</th>\n",
@@ -1010,11 +970,10 @@
1010
970
  " <td>Fake book</td>\n",
1011
971
  " <td>None</td>\n",
1012
972
  " <td>None</td>\n",
973
+ " <td>[British]</td>\n",
1013
974
  " <td>[freddie.mercury@queen.com]</td>\n",
1014
- " <td>[GB91BKEN10000041610008]</td>\n",
1015
- " <td>[freddie.me, queen.com]</td>\n",
1016
- " <td>None</td>\n",
1017
975
  " <td>[SW1A 2AA]</td>\n",
976
+ " <td>None</td>\n",
1018
977
  " </tr>\n",
1019
978
  " <tr>\n",
1020
979
  " <th>6</th>\n",
@@ -1024,9 +983,8 @@
1024
983
  " <td>None</td>\n",
1025
984
  " <td>None</td>\n",
1026
985
  " <td>None</td>\n",
1027
- " <td>None</td>\n",
1028
- " <td>None</td>\n",
1029
986
  " <td>[SW1A 2WH]</td>\n",
987
+ " <td>None</td>\n",
1030
988
  " </tr>\n",
1031
989
  " <tr>\n",
1032
990
  " <th>7</th>\n",
@@ -1034,34 +992,31 @@
1034
992
  " <td>Fake book</td>\n",
1035
993
  " <td>None</td>\n",
1036
994
  " <td>None</td>\n",
1037
- " <td>[serena.williams@tennis.com]</td>\n",
1038
- " <td>None</td>\n",
1039
- " <td>[tennis.com]</td>\n",
1040
995
  " <td>None</td>\n",
996
+ " <td>[serena.williams@tennis.com]</td>\n",
1041
997
  " <td>[SW19 5AE]</td>\n",
998
+ " <td>None</td>\n",
1042
999
  " </tr>\n",
1043
1000
  " <tr>\n",
1044
1001
  " <th>8</th>\n",
1045
- " <td>E</td>\n",
1002
+ " <td>D</td>\n",
1046
1003
  " <td>Fake book</td>\n",
1047
1004
  " <td>None</td>\n",
1048
1005
  " <td>None</td>\n",
1049
- " <td>[otis.redding@dockofthebay.org]</td>\n",
1050
1006
  " <td>None</td>\n",
1051
- " <td>[otis.red, dockofthebay.org]</td>\n",
1052
1007
  " <td>None</td>\n",
1053
- " <td>[EH8 8DX]</td>\n",
1008
+ " <td>None</td>\n",
1009
+ " <td>[Downing Street]</td>\n",
1054
1010
  " </tr>\n",
1055
1011
  " <tr>\n",
1056
1012
  " <th>9</th>\n",
1057
- " <td>D</td>\n",
1013
+ " <td>E</td>\n",
1058
1014
  " <td>Fake book</td>\n",
1059
1015
  " <td>None</td>\n",
1060
1016
  " <td>None</td>\n",
1061
1017
  " <td>None</td>\n",
1062
- " <td>None</td>\n",
1063
- " <td>None</td>\n",
1064
- " <td>[Downing Street]</td>\n",
1018
+ " <td>[otis.redding@dockofthebay.org]</td>\n",
1019
+ " <td>[EH8 8DX]</td>\n",
1065
1020
  " <td>None</td>\n",
1066
1021
  " </tr>\n",
1067
1022
  " </tbody>\n",
@@ -1078,32 +1033,20 @@
1078
1033
  "5 A Fake book None None \n",
1079
1034
  "6 B Fake book [Mick Jagger, David Bowie] None \n",
1080
1035
  "7 C Fake book None None \n",
1081
- "8 E Fake book None None \n",
1082
- "9 D Fake book None None \n",
1083
- "\n",
1084
- " email_address iban_code \\\n",
1085
- "0 None None \n",
1086
- "1 None None \n",
1087
- "2 None None \n",
1088
- "3 None None \n",
1089
- "4 None None \n",
1090
- "5 [freddie.mercury@queen.com] [GB91BKEN10000041610008] \n",
1091
- "6 None None \n",
1092
- "7 [serena.williams@tennis.com] None \n",
1093
- "8 [otis.redding@dockofthebay.org] None \n",
1094
- "9 None None \n",
1036
+ "8 D Fake book None None \n",
1037
+ "9 E Fake book None None \n",
1095
1038
  "\n",
1096
- " url org uk_postcode \n",
1097
- "0 None None None \n",
1098
- "1 None None None \n",
1099
- "2 None None None \n",
1100
- "3 None None None \n",
1101
- "4 None None None \n",
1102
- "5 [freddie.me, queen.com] None [SW1A 2AA] \n",
1103
- "6 None None [SW1A 2WH] \n",
1104
- "7 [tennis.com] None [SW19 5AE] \n",
1105
- "8 [otis.red, dockofthebay.org] None [EH8 8DX] \n",
1106
- "9 None [Downing Street] None "
1039
+ " nrp email_address uk_postcode org \n",
1040
+ "0 None None None None \n",
1041
+ "1 None None None None \n",
1042
+ "2 None None None None \n",
1043
+ "3 None None None None \n",
1044
+ "4 None None None None \n",
1045
+ "5 [British] [freddie.mercury@queen.com] [SW1A 2AA] None \n",
1046
+ "6 None None [SW1A 2WH] None \n",
1047
+ "7 None [serena.williams@tennis.com] [SW19 5AE] None \n",
1048
+ "8 None None None [Downing Street] \n",
1049
+ "9 None [otis.redding@dockofthebay.org] [EH8 8DX] None "
1107
1050
  ]
1108
1051
  },
1109
1052
  "execution_count": 11,