idscrub 0.2.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +73 -88
- {idscrub-0.2.2.dist-info → idscrub-1.0.1.dist-info}/METADATA +19 -2
- idscrub-1.0.1.dist-info/RECORD +22 -0
- notebooks/basic_usage.ipynb +153 -161
- test/conftest.py +10 -0
- test/test_all.py +3 -3
- test/test_chain.py +7 -7
- test/test_dataframe.py +114 -5
- test/test_huggingface.py +1 -1
- test/test_label.py +17 -0
- test/test_log.py +3 -3
- test/test_persidio.py +2 -2
- test/test_regex.py +8 -8
- test/test_scrub.py +4 -4
- test/test_spacy.py +1 -3
- idscrub-0.2.2.dist-info/RECORD +0 -21
- {idscrub-0.2.2.dist-info → idscrub-1.0.1.dist-info}/WHEEL +0 -0
- {idscrub-0.2.2.dist-info → idscrub-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {idscrub-0.2.2.dist-info → idscrub-1.0.1.dist-info}/top_level.txt +0 -0
notebooks/basic_usage.ipynb
CHANGED
|
@@ -18,12 +18,12 @@
|
|
|
18
18
|
"text": [
|
|
19
19
|
"INFO: Texts loaded.\n",
|
|
20
20
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
21
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
22
|
-
"INFO: 3
|
|
21
|
+
"100%|██████████| 2/2 [00:00<00:00, 42.63it/s]\n",
|
|
22
|
+
"INFO: 3 person scrubbed.\n",
|
|
23
23
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
24
|
-
"INFO: 1
|
|
25
|
-
"INFO: Scrubbing
|
|
26
|
-
"INFO: 1
|
|
24
|
+
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
25
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
26
|
+
"INFO: 1 uk_postcode scrubbed.\n"
|
|
27
27
|
]
|
|
28
28
|
},
|
|
29
29
|
{
|
|
@@ -76,9 +76,9 @@
|
|
|
76
76
|
" <tr style=\"text-align: right;\">\n",
|
|
77
77
|
" <th></th>\n",
|
|
78
78
|
" <th>text_id</th>\n",
|
|
79
|
-
" <th>
|
|
80
|
-
" <th>
|
|
81
|
-
" <th>
|
|
79
|
+
" <th>person</th>\n",
|
|
80
|
+
" <th>uk_phone_number</th>\n",
|
|
81
|
+
" <th>uk_postcode</th>\n",
|
|
82
82
|
" </tr>\n",
|
|
83
83
|
" </thead>\n",
|
|
84
84
|
" <tbody>\n",
|
|
@@ -101,13 +101,13 @@
|
|
|
101
101
|
"</div>"
|
|
102
102
|
],
|
|
103
103
|
"text/plain": [
|
|
104
|
-
" text_id
|
|
105
|
-
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
106
|
-
"1 2 None \n",
|
|
104
|
+
" text_id person uk_phone_number \\\n",
|
|
105
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
|
|
106
|
+
"1 2 None [+441111111111] \n",
|
|
107
107
|
"\n",
|
|
108
|
-
"
|
|
109
|
-
"0
|
|
110
|
-
"1
|
|
108
|
+
" uk_postcode \n",
|
|
109
|
+
"0 None \n",
|
|
110
|
+
"1 [AA11 1AA] "
|
|
111
111
|
]
|
|
112
112
|
},
|
|
113
113
|
"execution_count": 2,
|
|
@@ -137,26 +137,26 @@
|
|
|
137
137
|
"text": [
|
|
138
138
|
"INFO: Texts loaded.\n",
|
|
139
139
|
"INFO: Scrubbing using Presidio...\n",
|
|
140
|
-
"100%|██████████| 2/2 [00:00<00:00, 14.
|
|
141
|
-
"INFO: 3
|
|
142
|
-
"INFO: 1
|
|
140
|
+
"100%|██████████| 2/2 [00:00<00:00, 14.67it/s]\n",
|
|
141
|
+
"INFO: 3 person scrubbed.\n",
|
|
142
|
+
"INFO: 1 location scrubbed.\n",
|
|
143
143
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
144
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
145
|
-
"INFO:
|
|
144
|
+
"100%|██████████| 2/2 [00:00<00:00, 48.96it/s]\n",
|
|
145
|
+
"INFO: 3 person scrubbed.\n",
|
|
146
146
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
147
|
-
"INFO: 0
|
|
147
|
+
"INFO: 0 phone_number scrubbed.\n",
|
|
148
148
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
149
|
-
"INFO: 0
|
|
149
|
+
"INFO: 0 email_address scrubbed.\n",
|
|
150
150
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
151
|
-
"INFO: 0
|
|
151
|
+
"INFO: 0 handle scrubbed.\n",
|
|
152
152
|
"INFO: Scrubbing IP addresses using regex...\n",
|
|
153
|
-
"INFO: 0
|
|
153
|
+
"INFO: 0 ip_address scrubbed.\n",
|
|
154
154
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
155
|
-
"INFO: 1
|
|
156
|
-
"INFO: Scrubbing
|
|
157
|
-
"INFO: 1
|
|
155
|
+
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
156
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
157
|
+
"INFO: 1 uk_postcode scrubbed.\n",
|
|
158
158
|
"INFO: Scrubbing titles using regex...\n",
|
|
159
|
-
"INFO: 0
|
|
159
|
+
"INFO: 0 title scrubbed.\n"
|
|
160
160
|
]
|
|
161
161
|
},
|
|
162
162
|
{
|
|
@@ -209,10 +209,10 @@
|
|
|
209
209
|
" <tr style=\"text-align: right;\">\n",
|
|
210
210
|
" <th></th>\n",
|
|
211
211
|
" <th>text_id</th>\n",
|
|
212
|
-
" <th>
|
|
213
|
-
" <th>
|
|
214
|
-
" <th>
|
|
215
|
-
" <th>
|
|
212
|
+
" <th>person</th>\n",
|
|
213
|
+
" <th>location</th>\n",
|
|
214
|
+
" <th>uk_phone_number</th>\n",
|
|
215
|
+
" <th>uk_postcode</th>\n",
|
|
216
216
|
" </tr>\n",
|
|
217
217
|
" </thead>\n",
|
|
218
218
|
" <tbody>\n",
|
|
@@ -237,13 +237,13 @@
|
|
|
237
237
|
"</div>"
|
|
238
238
|
],
|
|
239
239
|
"text/plain": [
|
|
240
|
-
" text_id
|
|
241
|
-
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
242
|
-
"1 2 None \n",
|
|
240
|
+
" text_id person location \\\n",
|
|
241
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
|
|
242
|
+
"1 2 None [Lapland] \n",
|
|
243
243
|
"\n",
|
|
244
|
-
"
|
|
245
|
-
"0
|
|
246
|
-
"1
|
|
244
|
+
" uk_phone_number uk_postcode \n",
|
|
245
|
+
"0 None None \n",
|
|
246
|
+
"1 [+441111111111] [AA11 1AA] "
|
|
247
247
|
]
|
|
248
248
|
},
|
|
249
249
|
"execution_count": 4,
|
|
@@ -273,25 +273,25 @@
|
|
|
273
273
|
"text": [
|
|
274
274
|
"INFO: Texts loaded.\n",
|
|
275
275
|
"INFO: Scrubbing using Presidio...\n",
|
|
276
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
277
|
-
"INFO: 3
|
|
276
|
+
"100%|██████████| 2/2 [00:00<00:00, 30.26it/s]\n",
|
|
277
|
+
"INFO: 3 person scrubbed.\n",
|
|
278
278
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
279
|
-
"INFO: 0
|
|
279
|
+
"INFO: 0 phone_number scrubbed.\n",
|
|
280
280
|
"INFO: Scrubbing custom regex...\n",
|
|
281
|
-
"INFO: 1
|
|
282
|
-
"INFO: 1
|
|
281
|
+
"INFO: 1 custom_regex_1 scrubbed.\n",
|
|
282
|
+
"INFO: 1 custom_regex_2 scrubbed.\n",
|
|
283
283
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
284
|
-
"INFO: 0
|
|
284
|
+
"INFO: 0 email_address scrubbed.\n",
|
|
285
285
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
286
|
-
"INFO: 0
|
|
286
|
+
"INFO: 0 handle scrubbed.\n",
|
|
287
287
|
"INFO: Scrubbing IP addresses using regex...\n",
|
|
288
|
-
"INFO: 0
|
|
288
|
+
"INFO: 0 ip_address scrubbed.\n",
|
|
289
289
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
290
|
-
"INFO: 1
|
|
291
|
-
"INFO: Scrubbing
|
|
292
|
-
"INFO: 1
|
|
290
|
+
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
291
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
292
|
+
"INFO: 1 uk_postcode scrubbed.\n",
|
|
293
293
|
"INFO: Scrubbing titles using regex...\n",
|
|
294
|
-
"INFO: 0
|
|
294
|
+
"INFO: 0 title scrubbed.\n"
|
|
295
295
|
]
|
|
296
296
|
},
|
|
297
297
|
{
|
|
@@ -349,11 +349,11 @@
|
|
|
349
349
|
" <tr style=\"text-align: right;\">\n",
|
|
350
350
|
" <th></th>\n",
|
|
351
351
|
" <th>text_id</th>\n",
|
|
352
|
-
" <th>
|
|
353
|
-
" <th>
|
|
354
|
-
" <th>
|
|
355
|
-
" <th>
|
|
356
|
-
" <th>
|
|
352
|
+
" <th>person</th>\n",
|
|
353
|
+
" <th>custom_regex_1</th>\n",
|
|
354
|
+
" <th>custom_regex_2</th>\n",
|
|
355
|
+
" <th>uk_phone_number</th>\n",
|
|
356
|
+
" <th>uk_postcode</th>\n",
|
|
357
357
|
" </tr>\n",
|
|
358
358
|
" </thead>\n",
|
|
359
359
|
" <tbody>\n",
|
|
@@ -380,13 +380,13 @@
|
|
|
380
380
|
"</div>"
|
|
381
381
|
],
|
|
382
382
|
"text/plain": [
|
|
383
|
-
" text_id
|
|
384
|
-
"0 1 [Hamish McDonald, L. Salah, Elena Suárez]
|
|
385
|
-
"1 2 None
|
|
383
|
+
" text_id person custom_regex_1 \\\n",
|
|
384
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
|
|
385
|
+
"1 2 None [Lapland] \n",
|
|
386
386
|
"\n",
|
|
387
|
-
"
|
|
388
|
-
"0
|
|
389
|
-
"1
|
|
387
|
+
" custom_regex_2 uk_phone_number uk_postcode \n",
|
|
388
|
+
"0 None None None \n",
|
|
389
|
+
"1 [ACHILLES] [+441111111111] [AA11 1AA] "
|
|
390
390
|
]
|
|
391
391
|
},
|
|
392
392
|
"execution_count": 6,
|
|
@@ -417,9 +417,9 @@
|
|
|
417
417
|
"text": [
|
|
418
418
|
"INFO: Texts loaded.\n",
|
|
419
419
|
"INFO: Scrubbing using Presidio...\n",
|
|
420
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
421
|
-
"INFO: 3
|
|
422
|
-
"INFO: 1
|
|
420
|
+
"100%|██████████| 2/2 [00:00<00:00, 28.29it/s]\n",
|
|
421
|
+
"INFO: 3 person scrubbed.\n",
|
|
422
|
+
"INFO: 1 iban_code scrubbed.\n"
|
|
423
423
|
]
|
|
424
424
|
},
|
|
425
425
|
{
|
|
@@ -468,8 +468,8 @@
|
|
|
468
468
|
" <tr style=\"text-align: right;\">\n",
|
|
469
469
|
" <th></th>\n",
|
|
470
470
|
" <th>text_id</th>\n",
|
|
471
|
-
" <th>
|
|
472
|
-
" <th>
|
|
471
|
+
" <th>person</th>\n",
|
|
472
|
+
" <th>iban_code</th>\n",
|
|
473
473
|
" </tr>\n",
|
|
474
474
|
" </thead>\n",
|
|
475
475
|
" <tbody>\n",
|
|
@@ -490,13 +490,13 @@
|
|
|
490
490
|
"</div>"
|
|
491
491
|
],
|
|
492
492
|
"text/plain": [
|
|
493
|
-
" text_id
|
|
493
|
+
" text_id person \\\n",
|
|
494
494
|
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
495
495
|
"1 2 None \n",
|
|
496
496
|
"\n",
|
|
497
|
-
"
|
|
498
|
-
"0
|
|
499
|
-
"1
|
|
497
|
+
" iban_code \n",
|
|
498
|
+
"0 None \n",
|
|
499
|
+
"1 [GB91BKEN10000041610008] "
|
|
500
500
|
]
|
|
501
501
|
},
|
|
502
502
|
"execution_count": 8,
|
|
@@ -677,75 +677,78 @@
|
|
|
677
677
|
"output_type": "stream",
|
|
678
678
|
"text": [
|
|
679
679
|
" 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
|
|
680
|
+
"INFO: Scrubbing column `Pride and Prejudice`...\n",
|
|
680
681
|
"INFO: Scrubbing using Presidio...\n",
|
|
681
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
682
|
-
"INFO: 4
|
|
683
|
-
"INFO: 4
|
|
684
|
-
"INFO: 4
|
|
682
|
+
"100%|██████████| 5/5 [00:00<00:00, 27.93it/s]\n",
|
|
683
|
+
"INFO: 4 person scrubbed.\n",
|
|
684
|
+
"INFO: 4 person scrubbed.\n",
|
|
685
|
+
"INFO: 4 person scrubbed.\n",
|
|
685
686
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
686
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
687
|
-
"INFO:
|
|
687
|
+
"100%|██████████| 5/5 [00:00<00:00, 62.29it/s]\n",
|
|
688
|
+
"INFO: 4 person scrubbed.\n",
|
|
688
689
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
689
|
-
"INFO: 0
|
|
690
|
+
"INFO: 0 phone_number scrubbed.\n",
|
|
690
691
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
691
|
-
"INFO: 0
|
|
692
|
+
"INFO: 0 email_address scrubbed.\n",
|
|
692
693
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
693
|
-
"INFO: 0
|
|
694
|
+
"INFO: 0 handle scrubbed.\n",
|
|
694
695
|
"INFO: Scrubbing IP addresses using regex...\n",
|
|
695
|
-
"INFO: 0
|
|
696
|
+
"INFO: 0 ip_address scrubbed.\n",
|
|
696
697
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
697
|
-
"INFO: 0
|
|
698
|
-
"INFO: Scrubbing
|
|
699
|
-
"INFO: 0
|
|
698
|
+
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
699
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
700
|
+
"INFO: 0 uk_postcode scrubbed.\n",
|
|
700
701
|
"INFO: Scrubbing titles using regex...\n",
|
|
701
|
-
"INFO: 2
|
|
702
|
-
" 33%|███▎ | 1/3 [00:02<00:05, 2.
|
|
702
|
+
"INFO: 2 title scrubbed.\n",
|
|
703
|
+
" 33%|███▎ | 1/3 [00:02<00:05, 2.62s/it]INFO: Texts loaded.\n",
|
|
704
|
+
"INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
|
|
703
705
|
"INFO: Scrubbing using Presidio...\n",
|
|
704
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
705
|
-
"INFO: 2
|
|
706
|
-
"INFO: 2
|
|
706
|
+
"100%|██████████| 5/5 [00:00<00:00, 28.25it/s]\n",
|
|
707
|
+
"INFO: 2 person scrubbed.\n",
|
|
708
|
+
"INFO: 2 person scrubbed.\n",
|
|
707
709
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
708
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
709
|
-
"INFO:
|
|
710
|
+
"100%|██████████| 5/5 [00:00<00:00, 82.44it/s]\n",
|
|
711
|
+
"INFO: 2 person scrubbed.\n",
|
|
710
712
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
711
|
-
"INFO: 0
|
|
713
|
+
"INFO: 0 phone_number scrubbed.\n",
|
|
712
714
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
713
|
-
"INFO: 0
|
|
715
|
+
"INFO: 0 email_address scrubbed.\n",
|
|
714
716
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
715
|
-
"INFO: 0
|
|
717
|
+
"INFO: 0 handle scrubbed.\n",
|
|
716
718
|
"INFO: Scrubbing IP addresses using regex...\n",
|
|
717
|
-
"INFO: 0
|
|
719
|
+
"INFO: 0 ip_address scrubbed.\n",
|
|
718
720
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
719
|
-
"INFO: 0
|
|
720
|
-
"INFO: Scrubbing
|
|
721
|
-
"INFO: 0
|
|
721
|
+
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
722
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
723
|
+
"INFO: 0 uk_postcode scrubbed.\n",
|
|
722
724
|
"INFO: Scrubbing titles using regex...\n",
|
|
723
|
-
"INFO: 0
|
|
724
|
-
" 67%|██████▋ | 2/3 [00:
|
|
725
|
+
"INFO: 0 title scrubbed.\n",
|
|
726
|
+
" 67%|██████▋ | 2/3 [00:05<00:02, 2.50s/it]INFO: Texts loaded.\n",
|
|
727
|
+
"INFO: Scrubbing column `Fake book`...\n",
|
|
725
728
|
"INFO: Scrubbing using Presidio...\n",
|
|
726
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
727
|
-
"INFO:
|
|
728
|
-
"INFO: 2
|
|
729
|
-
"INFO: 3
|
|
730
|
-
"INFO: 3
|
|
729
|
+
"100%|██████████| 5/5 [00:00<00:00, 13.15it/s]\n",
|
|
730
|
+
"INFO: 1 iban_code scrubbed.\n",
|
|
731
|
+
"INFO: 2 person scrubbed.\n",
|
|
732
|
+
"INFO: 3 email_address scrubbed.\n",
|
|
733
|
+
"INFO: 3 email_address scrubbed.\n",
|
|
731
734
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
732
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
733
|
-
"INFO:
|
|
735
|
+
"100%|██████████| 5/5 [00:00<00:00, 54.15it/s]\n",
|
|
736
|
+
"INFO: 2 person scrubbed.\n",
|
|
734
737
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
735
|
-
"INFO: 0
|
|
738
|
+
"INFO: 0 phone_number scrubbed.\n",
|
|
736
739
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
737
|
-
"INFO:
|
|
740
|
+
"INFO: 3 email_address scrubbed.\n",
|
|
738
741
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
739
|
-
"INFO: 0
|
|
742
|
+
"INFO: 0 handle scrubbed.\n",
|
|
740
743
|
"INFO: Scrubbing IP addresses using regex...\n",
|
|
741
|
-
"INFO: 0
|
|
744
|
+
"INFO: 0 ip_address scrubbed.\n",
|
|
742
745
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
743
|
-
"INFO: 0
|
|
744
|
-
"INFO: Scrubbing
|
|
745
|
-
"INFO: 4
|
|
746
|
+
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
747
|
+
"INFO: Scrubbing postcodes using regex...\n",
|
|
748
|
+
"INFO: 4 uk_postcode scrubbed.\n",
|
|
746
749
|
"INFO: Scrubbing titles using regex...\n",
|
|
747
|
-
"INFO: 0
|
|
748
|
-
"100%|██████████| 3/3 [00:07<00:00, 2.
|
|
750
|
+
"INFO: 0 title scrubbed.\n",
|
|
751
|
+
"100%|██████████| 3/3 [00:07<00:00, 2.56s/it]\n"
|
|
749
752
|
]
|
|
750
753
|
},
|
|
751
754
|
{
|
|
@@ -892,12 +895,12 @@
|
|
|
892
895
|
" <th></th>\n",
|
|
893
896
|
" <th>ID</th>\n",
|
|
894
897
|
" <th>column</th>\n",
|
|
895
|
-
" <th>
|
|
896
|
-
" <th>
|
|
897
|
-
" <th>
|
|
898
|
-
" <th>
|
|
899
|
-
" <th>
|
|
900
|
-
" <th>
|
|
898
|
+
" <th>person</th>\n",
|
|
899
|
+
" <th>title</th>\n",
|
|
900
|
+
" <th>email_address</th>\n",
|
|
901
|
+
" <th>iban_code</th>\n",
|
|
902
|
+
" <th>url</th>\n",
|
|
903
|
+
" <th>uk_postcode</th>\n",
|
|
901
904
|
" </tr>\n",
|
|
902
905
|
" </thead>\n",
|
|
903
906
|
" <tbody>\n",
|
|
@@ -1005,49 +1008,38 @@
|
|
|
1005
1008
|
"</div>"
|
|
1006
1009
|
],
|
|
1007
1010
|
"text/plain": [
|
|
1008
|
-
" ID column
|
|
1009
|
-
"0 A Pride and Prejudice [Darcy, Elizabeth] \n",
|
|
1010
|
-
"1 B Pride and Prejudice [Bennet] \n",
|
|
1011
|
-
"2 C Pride and Prejudice [Elizabeth] \n",
|
|
1012
|
-
"3 A The Adventures of Sherlock Holmes [Sherlock Holmes] \n",
|
|
1013
|
-
"4 D The Adventures of Sherlock Holmes [Watson] \n",
|
|
1014
|
-
"5 A Fake book None \n",
|
|
1015
|
-
"6 B Fake book [Mick Jagger, David Bowie] \n",
|
|
1016
|
-
"7 C Fake book None \n",
|
|
1017
|
-
"8 E Fake book None \n",
|
|
1018
|
-
"\n",
|
|
1019
|
-
" scrubbed_titles scrubbed_presidio_email_address \\\n",
|
|
1020
|
-
"0 [Mr] None \n",
|
|
1021
|
-
"1 [Mr] None \n",
|
|
1022
|
-
"2 None None \n",
|
|
1023
|
-
"3 None None \n",
|
|
1024
|
-
"4 None None \n",
|
|
1025
|
-
"5 None [freddie.mercury@queen.com] \n",
|
|
1026
|
-
"6 None None \n",
|
|
1027
|
-
"7 None [serena.williams@tennis.com] \n",
|
|
1028
|
-
"8 None [otis.redding@dockofthebay.org] \n",
|
|
1011
|
+
" ID column person title \\\n",
|
|
1012
|
+
"0 A Pride and Prejudice [Darcy, Elizabeth] [Mr] \n",
|
|
1013
|
+
"1 B Pride and Prejudice [Bennet] [Mr] \n",
|
|
1014
|
+
"2 C Pride and Prejudice [Elizabeth] None \n",
|
|
1015
|
+
"3 A The Adventures of Sherlock Holmes [Sherlock Holmes] None \n",
|
|
1016
|
+
"4 D The Adventures of Sherlock Holmes [Watson] None \n",
|
|
1017
|
+
"5 A Fake book None None \n",
|
|
1018
|
+
"6 B Fake book [Mick Jagger, David Bowie] None \n",
|
|
1019
|
+
"7 C Fake book None None \n",
|
|
1020
|
+
"8 E Fake book None None \n",
|
|
1029
1021
|
"\n",
|
|
1030
|
-
"
|
|
1031
|
-
"0
|
|
1032
|
-
"1
|
|
1033
|
-
"2
|
|
1034
|
-
"3
|
|
1035
|
-
"4
|
|
1036
|
-
"5
|
|
1037
|
-
"6
|
|
1038
|
-
"7
|
|
1039
|
-
"8
|
|
1022
|
+
" email_address iban_code \\\n",
|
|
1023
|
+
"0 None None \n",
|
|
1024
|
+
"1 None None \n",
|
|
1025
|
+
"2 None None \n",
|
|
1026
|
+
"3 None None \n",
|
|
1027
|
+
"4 None None \n",
|
|
1028
|
+
"5 [freddie.mercury@queen.com] [GB91BKEN10000041610008] \n",
|
|
1029
|
+
"6 None None \n",
|
|
1030
|
+
"7 [serena.williams@tennis.com] None \n",
|
|
1031
|
+
"8 [otis.redding@dockofthebay.org] None \n",
|
|
1040
1032
|
"\n",
|
|
1041
|
-
"
|
|
1042
|
-
"0
|
|
1043
|
-
"1
|
|
1044
|
-
"2
|
|
1045
|
-
"3
|
|
1046
|
-
"4
|
|
1047
|
-
"5
|
|
1048
|
-
"6
|
|
1049
|
-
"7
|
|
1050
|
-
"8
|
|
1033
|
+
" url uk_postcode \n",
|
|
1034
|
+
"0 None None \n",
|
|
1035
|
+
"1 None None \n",
|
|
1036
|
+
"2 None None \n",
|
|
1037
|
+
"3 None None \n",
|
|
1038
|
+
"4 None None \n",
|
|
1039
|
+
"5 [freddie.me, queen.com] [SW1A 2AA] \n",
|
|
1040
|
+
"6 None [SW1A 2WH] \n",
|
|
1041
|
+
"7 [tennis.com] [SW19 5AE] \n",
|
|
1042
|
+
"8 [otis.red, dockofthebay.org] [EH8 8DX] "
|
|
1051
1043
|
]
|
|
1052
1044
|
},
|
|
1053
1045
|
"execution_count": 11,
|
test/conftest.py
CHANGED
|
@@ -10,3 +10,13 @@ def scrub_object():
|
|
|
10
10
|
"My number is +441111111111 and I live at AA11 1AA.",
|
|
11
11
|
]
|
|
12
12
|
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.fixture
|
|
16
|
+
def scrub_object_all():
|
|
17
|
+
return IDScrub(
|
|
18
|
+
[
|
|
19
|
+
"We are Hamish McDonald, L. Salah, and Elena Suárez, Professor Patel, @johnsmith, 8.8.8.8, marie-9999@randomemail.co.uk.",
|
|
20
|
+
"My number is +441111111111 and I live at AA11 1AA.",
|
|
21
|
+
]
|
|
22
|
+
)
|
test/test_all.py
CHANGED
|
@@ -30,9 +30,9 @@ def test_get_scrubbed_data(scrub_object):
|
|
|
30
30
|
expected_df = pd.DataFrame(
|
|
31
31
|
{
|
|
32
32
|
"text_id": {0: 1, 1: 2},
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
"
|
|
33
|
+
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
34
|
+
"uk_phone_number": {0: None, 1: ["+441111111111"]},
|
|
35
|
+
"uk_postcode": {0: None, 1: ["AA11 1AA"]},
|
|
36
36
|
}
|
|
37
37
|
)
|
|
38
38
|
|
test/test_chain.py
CHANGED
|
@@ -22,8 +22,8 @@ def test_chain_order(scrub_object):
|
|
|
22
22
|
"My number is [PHONENO] and I live at AA11 1AA.",
|
|
23
23
|
]
|
|
24
24
|
|
|
25
|
-
assert scrub_object.get_scrubbed_data()["
|
|
26
|
-
assert "
|
|
25
|
+
assert scrub_object.get_scrubbed_data()["uk_phone_number"].to_list() == [["+441111111111"]]
|
|
26
|
+
assert "uk_postcode" not in scrub_object.get_scrubbed_data().columns
|
|
27
27
|
|
|
28
28
|
scrubbed = scrub_object.uk_postcodes()
|
|
29
29
|
|
|
@@ -31,8 +31,8 @@ def test_chain_order(scrub_object):
|
|
|
31
31
|
"Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
|
|
32
32
|
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
33
33
|
]
|
|
34
|
-
assert scrub_object.get_scrubbed_data()["
|
|
35
|
-
assert scrub_object.get_scrubbed_data()["
|
|
34
|
+
assert scrub_object.get_scrubbed_data()["uk_phone_number"].to_list() == [["+441111111111"]]
|
|
35
|
+
assert scrub_object.get_scrubbed_data()["uk_postcode"].to_list() == [["AA11 1AA"]]
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
def test_get_scrubbed_data_chain(scrub_object):
|
|
@@ -45,9 +45,9 @@ def test_get_scrubbed_data_chain(scrub_object):
|
|
|
45
45
|
expected_df = pd.DataFrame(
|
|
46
46
|
{
|
|
47
47
|
"text_id": {0: 1, 1: 2},
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
48
|
+
"uk_phone_number": {0: None, 1: ["+441111111111"]},
|
|
49
|
+
"uk_postcode": {0: None, 1: ["AA11 1AA"]},
|
|
50
|
+
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
51
51
|
}
|
|
52
52
|
)
|
|
53
53
|
|