idscrub 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
idscrub/scrub.py CHANGED
@@ -2,8 +2,9 @@ import logging
2
2
  import os
3
3
  import re
4
4
  import warnings
5
+ from collections import defaultdict
5
6
  from collections.abc import Iterable
6
- from functools import partial
7
+ from dataclasses import asdict, dataclass
7
8
 
8
9
  import pandas as pd
9
10
  import phonenumbers
@@ -11,8 +12,6 @@ import spacy
11
12
  from huggingface_hub.utils import HFValidationError
12
13
  from presidio_analyzer import AnalyzerEngine
13
14
  from presidio_analyzer.nlp_engine import SpacyNlpEngine
14
- from presidio_anonymizer import AnonymizerEngine
15
- from presidio_anonymizer.entities import OperatorConfig
16
15
  from spacy.cli import download
17
16
  from spacy.language import Language
18
17
  from tqdm import tqdm
@@ -29,12 +28,44 @@ trf_logging.set_verbosity_error()
29
28
 
30
29
 
31
30
  class IDScrub:
31
+ """
32
+ Class for identifying and scrubbing entities in text.
33
+ """
34
+
35
@dataclass
class IDEnt:
    """
    Structured representation of an identified entity (ident) within text.

    Each instance describes one matched span of one text, together with the
    label, replacement and priority that the finder attached to it.

    Attributes:
        text_id (str | int | float): A unique identifier for the original text.
        text (str): The exact substring extracted from the original text.
        start (int): The starting character offset of the ident within the original text.
        end (int): The ending character offset of the ident within the original text.
        label (str): The ident type (e.g. 'person').
        replacement (str): The text that should replace this ident during scrubbing.
        priority (float): Priority score for overlapping idents.
            Higher scored idents are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.
        source (str): The source model or method that identified the ident.
    """

    # Field order is part of the generated __init__ signature; keep it stable.
    text_id: str | int | float  # identifier of the text the span was found in
    text: str  # matched substring
    start: int  # offset where the match begins
    end: int  # offset where the match ends (the regex finders pass `match.end()`)
    label: str  # ident type
    replacement: str  # text to substitute for the span
    priority: float  # relative score used when idents overlap
    source: str  # finder that produced the ident, e.g. "regex"
61
+
32
62
  def __init__(
33
63
  self,
34
- texts: list[str] = [],
64
+ texts: list[str] = None,
35
65
  text_ids: list | Iterable = None,
36
66
  text_id_name: str = "text_id",
37
- replacement_text: str = None,
67
+ replacement: str = None,
68
+ exclude: list[str] = [],
38
69
  verbose: bool = True,
39
70
  ):
40
71
  """
@@ -46,32 +77,37 @@ class IDScrub:
46
77
  such as the ID column in a DataFrame. If None, an integer index starting at 1 is applied.
47
78
  This is used to identify texts in get_scrubbed_data().
48
79
  text_id_name (str): Name of the ID column in get_scrubbed_data(). Default is `text_id`.
49
- replacement_text (str): A global string to replace every scrubbed
50
- string with.
80
+ replacement (str): A global string to replace every scrubbed string with.
81
+ exclude (list[str]): A list of strings that will not be scrubbed if identified.
51
82
  verbose (bool): Whether to show all log messages or only warnings.
52
83
  """
53
84
 
54
- assert isinstance(texts, list) and all(isinstance(text, str) for text in texts), (
55
- "`texts` can only be a list of strings or a single string in a list."
56
- )
85
+ if not isinstance(texts, list):
86
+ raise TypeError("`texts` must be a list.")
87
+ if not all(isinstance(text, str) for text in texts):
88
+ raise TypeError("`texts` must be a list of strings.")
57
89
 
58
- assert isinstance(replacement_text, str) or isinstance(replacement_text, type(None)), (
59
- "`replacement_text` can only be string."
60
- )
90
+ if replacement is not None and not isinstance(replacement, str):
91
+ raise TypeError("`replacement` must be a string or None.")
61
92
 
62
93
  self.texts = texts
63
94
 
64
- if text_ids:
65
- self.text_ids = text_ids
66
- else:
67
- self.text_ids = range(1, len(self.texts) + 1)
95
+ if text_ids is None:
96
+ text_ids = range(1, len(self.texts) + 1)
97
+
98
+ if not len(self.texts) == len(text_ids):
99
+ raise ValueError("Length of texts is different to the length of text IDs.")
68
100
 
69
- assert len(self.texts) == len(self.text_ids), "Length of texts is different to the length of text IDs."
101
+ self.text_ids = text_ids
70
102
 
103
+ self.replacement = replacement
71
104
  self.text_id_name = text_id_name
72
- self.cleaned_texts = []
73
- self.scrubbed_data = []
74
- self.replacement_text = replacement_text
105
+ self.exclude = exclude
106
+ self.scrubbed_texts = []
107
+ self.idents: list[IDScrub.IDEnt] = []
108
+
109
+ self.hf_ner = None
110
+ self.spacy_docs = None
75
111
 
76
112
  self.logger = logging.getLogger(self.__class__.__name__)
77
113
  self.logger.setLevel(logging.DEBUG if verbose else logging.WARNING)
@@ -84,284 +120,295 @@ class IDScrub:
84
120
 
85
121
  self.logger.info("Texts loaded.")
86
122
 
87
- def get_texts(self) -> list[str]:
88
- """
89
- Get the text that will be processed.
90
- If no cleaning has occured, then use the raw input
91
- texts. If cleaning has occured then update the cleaned texts.
92
-
93
- Args:
94
- None
95
-
96
- Returns:
97
- A Pandas DataFrame with text_id
98
- and scrubbed in a list format.
99
- """
100
- if self.cleaned_texts:
101
- texts = self.cleaned_texts
102
- else:
103
- texts = self.texts
104
-
105
- return texts
106
-
107
- def get_scrubbed_data(self) -> pd.DataFrame:
123
def find_regex(
    self,
    texts: list[str],
    text_ids: list,
    pattern: str,
    replacement: str,
    label: str,
    priority: float,
) -> "list[IDScrub.IDEnt]":
    """
    Find every match of a regex pattern in each text and describe it as an IDEnt.

    The pattern is compiled once with `re.IGNORECASE` and applied to every text.
    Nothing is modified; the matches are only reported.

    Args:
        texts (list[str]): Strings to search. If None, `self.texts` is used.
        text_ids (list): Identifiers that correspond to each string in `texts`.
            If None, `self.text_ids` is used when `texts` is also None;
            otherwise an integer index starting at 1 is applied.
        pattern (str): Regex pattern to apply.
        replacement (str): The replacement text recorded on each ident.
            Overridden by `self.replacement` when that is set.
        label (str): Label recorded on each ident.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: One IDEnt per match, ordered by text and then by match position.
    """

    # The public wrappers default `texts`/`text_ids` to None; resolve those
    # defaults here instead of letting `zip(None, None)` raise a TypeError.
    if texts is None:
        texts = self.texts
        if text_ids is None:
            text_ids = self.text_ids
    elif text_ids is None:
        text_ids = range(1, len(texts) + 1)

    # A replacement configured on the instance wins over the per-call one.
    if self.replacement:
        replacement = self.replacement

    compiled = re.compile(pattern, re.IGNORECASE)

    return [
        self.IDEnt(
            text_id=text_id,
            text=match.group(),
            start=match.start(),
            end=match.end(),
            label=label,
            replacement=replacement,
            priority=priority,
            source="regex",
        )
        for text_id, text in zip(text_ids, texts)
        for match in compiled.finditer(text)
    ]
140
172
 
141
- def log_message(self, label) -> None:
173
def custom_regex(
    self, texts: list[str] = None, text_ids: list = None, patterns: dict = None, source: str = "custom_regex"
) -> "list[IDScrub.IDEnt]":
    """
    Find text matching user-supplied regex patterns.

    Each pattern is compiled once with `re.IGNORECASE` and applied to every text.
    The instance-wide `self.replacement` is not consulted here; the replacement
    recorded on each ident comes from the pattern's own configuration.

    Args:
        texts (list[str]): Strings to search. If None, `self.texts` is used.
        text_ids (list): Identifiers that correspond to each string in `texts`.
            If None, `self.text_ids` is used when `texts` is also None;
            otherwise an integer index starting at 1 is applied.
        patterns (dict): Mapping of label to settings, e.g.
            {"name": {"pattern": r"John", "replacement": "[NAME]", "priority": 0.5}}.
            "pattern" is required; "replacement" defaults to "[REDACTED]" and
            "priority" to 0.5. If None or empty, nothing is searched.
        source (str): The methodological source recorded on each ident.

    Returns:
        list[IDEnt]: One IDEnt per match, ordered by text, then by pattern
        (in `patterns` order), then by match position.

    Raises:
        KeyError: If a pattern's settings have no "pattern" key.
    """

    # `patterns` defaults to None; there is nothing to look for in that case.
    if not patterns:
        return []

    # Resolve the None defaults instead of letting `zip(None, None)` raise.
    if texts is None:
        texts = self.texts
        if text_ids is None:
            text_ids = self.text_ids
    elif text_ids is None:
        text_ids = range(1, len(texts) + 1)

    # Compile every pattern once up front rather than once per text.
    rules = [
        (
            label,
            re.compile(params["pattern"], flags=re.IGNORECASE),
            params.get("replacement", "[REDACTED]"),
            params.get("priority", 0.5),
        )
        for label, params in patterns.items()
    ]

    idents = []

    for text, text_id in zip(texts, text_ids):
        for label, compiled, replacement, priority in rules:
            for match in compiled.finditer(text):
                idents.append(
                    self.IDEnt(
                        text_id=text_id,
                        text=match.group(),
                        start=match.start(),
                        end=match.end(),
                        label=label,
                        replacement=replacement,
                        priority=priority,
                        source=source,
                    )
                )

    return idents
177
214
 
178
- def scrub_regex(self, pattern, replacement_text, label) -> list[str]:
215
def email_addresses(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[EMAIL_ADDRESS]",
    label: str = "email_address",
    priority: float = 0.7,
) -> "list[IDScrub.IDEnt]":
    """
    Identify email addresses such as `johnsmith@mail.com` with a regex.

    Thin wrapper: the arguments are forwarded unchanged, together with a
    fixed email pattern, to `find_regex`.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the email pattern.
    """

    search_kwargs = {
        "texts": texts,
        "text_ids": text_ids,
        "pattern": r"\b\S+@\S+\.\S+\b",
        "label": label,
        "replacement": replacement,
        "priority": priority,
    }
    return self.find_regex(**search_kwargs)
217
244
 
218
- def custom_regex(
245
def urls(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[URL]",
    label: str = "url",
    priority: float = 0.3,
) -> "list[IDScrub.IDEnt]":
    """
    Identify `http`, `https` and `www` URLs with a regex, e.g. `www.google.com`.

    The pattern requires one of those prefixes, so a bare domain such as
    `example.com` is not matched by this method.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the URL pattern.
    """

    # Scheme or "www." prefix, then everything up to whitespace, a bracket or a quote.
    url_pattern = r"\b(?:https?://|www\.)[^\s<>()\"']+"
    return self.find_regex(
        texts=texts,
        text_ids=text_ids,
        pattern=url_pattern,
        label=label,
        replacement=replacement,
        priority=priority,
    )
272
276
 
273
- def handles(self, replacement_text: str = "[HANDLE]", label: str = "handle") -> list[str]:
277
def handles(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[HANDLE]",
    label: str = "handle",
    priority: float = 0.4,
) -> "list[IDScrub.IDEnt]":
    """
    Identify `@` user handles such as `@username` with a regex.

    Thin wrapper: the arguments are forwarded unchanged, together with a
    fixed handle pattern, to `find_regex`.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the handle pattern.
    """

    forwarded = dict(texts=texts, text_ids=text_ids, label=label, replacement=replacement, priority=priority)
    return self.find_regex(pattern=r"@[\w.-]+(?=\b)", **forwarded)
290
306
 
291
307
  def google_phone_numbers(
292
- self, region: str = "GB", replacement_text: str = "[PHONENO]", label: str = "phone_number"
293
- ) -> list[str]:
308
+ self,
309
+ texts: list[str] = None,
310
+ text_ids: list = None,
311
+ region: str = "GB",
312
+ replacement: str = "[PHONENO]",
313
+ label: str = "phone_number",
314
+ priority: float = 0.8,
315
+ ) -> list[IDEnt]:
294
316
  """
295
- Remove phone numbers using Google's `phonenumbers`.
296
- e.g. `+441234567891` scrubbed
317
+ Remove phone numbers using Google's `phonenumbers` e.g. `+441234567891` identified.
297
318
 
298
319
  Args:
320
+ texts (list[str]): Strings to scrub.
321
+ text_ids (list): A list of identifiers that correspond to each string in `texts`.
322
+ If None, current cleaned state of `texts` passed at Class initiation used.
299
323
  region (str): The region to find phone numbers for. See `phonenumbers` regions.
300
- replacement_text (str): The replacement text for the removed text.
324
+ replacement (str): The replacement text for the removed text.
301
325
  label (str): Label for the personal data removed.
326
+ priority (float): Priority score for overlapping entities.
327
+ Higher scored entities are scrubbed where an overlap occurs.
328
+ The scores are relative e.g. 0.2 beats 0.1.
302
329
 
303
330
  Returns:
304
- list[str]: The input list of text with phone numbers replaced.
331
+ list[IDEnt]: A list of IDEnt objects.
305
332
  """
306
333
 
307
- self.logger.info(f"Scrubbing {region} phone numbers using Google's `phonenumbers`...")
334
+ if self.replacement:
335
+ replacement = self.replacement
308
336
 
309
- texts = self.get_texts()
337
+ idents = []
310
338
 
311
- if self.replacement_text:
312
- replacement_text = self.replacement_text
313
-
314
- cleaned_texts = []
315
-
316
- for i, text in zip(self.text_ids, texts):
339
+ for text, text_id in zip(texts, text_ids):
317
340
  matches = list(phonenumbers.PhoneNumberMatcher(text, region))
318
- phone_nos = [match.raw_string for match in matches]
319
-
320
- for phone_no in phone_nos:
321
- self.scrubbed_data.append({self.text_id_name: i, label: phone_no})
322
-
323
- cleaned = text
324
- for match in reversed(matches):
325
- cleaned = cleaned[: match.start] + replacement_text + cleaned[match.end :]
326
-
327
- cleaned_texts.append(cleaned)
328
-
329
- self.cleaned_texts = cleaned_texts
330
-
331
- self.log_message(label)
341
+ for match in matches:
342
+ idents.append(
343
+ self.IDEnt(
344
+ text_id=text_id,
345
+ text=match.raw_string,
346
+ start=match.start,
347
+ end=match.end,
348
+ priority=priority,
349
+ replacement=replacement,
350
+ label="phone_no",
351
+ source="google_phone_numbers",
352
+ )
353
+ )
332
354
 
333
- return cleaned_texts
355
+ return idents
334
356
 
335
- def uk_phone_numbers(self, replacement_text: str = "[PHONENO]", label: str = "uk_phone_number") -> list[str]:
357
def uk_phone_numbers(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[PHONENO]",
    label: str = "uk_phone_number",
    priority: float = 0.8,
) -> "list[IDScrub.IDEnt]":
    """
    Identify phone numbers such as `+441234567891` with a regex.

    The pattern accepts an optional leading `+` followed by at least nine
    characters that start and end with a digit and contain only digits and
    whitespace in between.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the phone number pattern.
    """

    phone_pattern = r"(\+?\d[\d\s]{7,}\d)"
    return self.find_regex(
        texts=texts,
        text_ids=text_ids,
        pattern=phone_pattern,
        label=label,
        replacement=replacement,
        priority=priority,
    )
350
385
 
351
- return self.scrub_regex(pattern, replacement_text, label=label)
352
-
353
- def titles(self, strict: bool = False, replacement_text: str = "[TITLE]", label: str = "title") -> list[str]:
386
+ def titles(
387
+ self,
388
+ texts: list[str] = None,
389
+ text_ids: list = None,
390
+ strict: bool = False,
391
+ replacement: str = "[TITLE]",
392
+ label: str = "title",
393
+ priority: float = 0.4,
394
+ ) -> list[IDEnt]:
354
395
  """
355
396
  Remove titles using regex.
356
397
 
357
398
  Args:
399
+ texts (list[str]): Strings to scrub.
400
+ text_ids (list): A list of identifiers that correspond to each string in `texts`.
401
+ If None, current cleaned state of `text` passed at Class initiation used.
358
402
  strict (bool): Whether to use all of the titles or only essential titles.
359
403
  If strict, you may find scrubbing of common words, such as general.
360
- replacement_text (str): The replacement text for the removed text.
404
+ replacement (str): The replacement text for the removed text.
361
405
  label (str): Label for the personal data removed.
406
+ priority (float): Priority score for overlapping entities.
407
+ Higher scored entities are scrubbed where an overlap occurs.
408
+ The scores are relative e.g. 0.2 beats 0.1.
362
409
 
363
410
  Returns:
364
- list[str]: The input list of text with names after titles replaced.
411
+ list[IDEnt]: A list of IDEnt objects.
365
412
  """
366
413
 
367
414
  titles = [
@@ -413,103 +460,109 @@ class IDScrub:
413
460
  titles += [title + "." for title in titles]
414
461
  titles += [title + ":" for title in titles]
415
462
 
416
- self.logger.info("Scrubbing titles using regex...")
417
463
  pattern = r"\b(?:{})\b".format("|".join(re.escape(t) for t in titles))
464
+ return self.find_regex(
465
+ texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
466
+ )
418
467
 
419
- return self.scrub_regex(pattern, replacement_text, label=label)
420
-
421
- def ip_addresses(self, replacement_text: str = "[IPADDRESS]", label: str = "ip_address") -> list[str]:
468
def ip_addresses(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[IPADDRESS]",
    label: str = "ip_address",
    priority: float = 0.5,
) -> "list[IDScrub.IDEnt]":
    """
    Identify dotted-quad IP addresses such as `192.168.1.1` with a regex.

    The pattern matches four dot-separated groups of one to three digits; it
    does not check that each group is in the 0-255 range.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the IP address pattern.
    """

    search_kwargs = {
        "texts": texts,
        "text_ids": text_ids,
        "pattern": r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})",
        "label": label,
        "replacement": replacement,
        "priority": priority,
    }
    return self.find_regex(**search_kwargs)
435
497
 
436
- return self.scrub_regex(pattern, replacement_text, label=label)
437
-
438
- def uk_postcodes(self, replacement_text: str = "[POSTCODE]", label: str = "uk_postcode") -> list[str]:
498
def uk_postcodes(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[POSTCODE]",
    label: str = "uk_postcode",
    priority: float = 0.5,
) -> "list[IDScrub.IDEnt]":
    """
    Identify UK-style postcodes such as `A11 1AA` with a regex.

    Thin wrapper: the arguments are forwarded unchanged, together with a
    fixed postcode pattern, to `find_regex`.

    Args:
        texts (list[str]): Strings to search; passed straight to `find_regex`.
        text_ids (list): Identifiers that correspond to each string in `texts`;
            passed straight to `find_regex`.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the postcode pattern.
    """

    # General outward/inward form first, then a list of special-case codes.
    postcode_pattern = r"\b(?:(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)[ \t]*[0-9][A-Z]{2}|GIR[ \t]*0A{2}|SAN[ \t]*TA1|ASCN[ \t]*1ZZ|STHL[ \t]*1ZZ|TDCU[ \t]*1ZZ|BBND[ \t]*1ZZ|[BFS]IQ{2}[ \t]*1ZZ|GX11[ \t]*1AA|PCRN[ \t]*1ZZ|TKCA[ \t]*1ZZ|AI-?[0-9]{4}|BFPO[ \t-]?[0-9]{2,4}|MSR[ \t-]?1(?:1[12]|[23][135])0|VG[ \t-]?11[1-6]0|KY[1-3][ \t-]?[0-2][0-9]{3})\b"
    return self.find_regex(
        texts=texts,
        text_ids=text_ids,
        pattern=postcode_pattern,
        label=label,
        replacement=replacement,
        priority=priority,
    )
453
527
 
454
- return self.scrub_regex(pattern, replacement_text, label=label)
455
-
456
- def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
528
def uk_addresses(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    replacement: str = "[ADDRESS]",
    label: str = "uk_address",
    priority: float = 0.8,
) -> "list[IDScrub.IDEnt]":
    """
    Identify street addresses with a regex,
    e.g. `10 Downing Street` and `10, Downing Street`.

    Args:
        texts (list[str]): Strings to search. If None, `self.texts` is used.
        text_ids (list): Identifiers that correspond to each string in `texts`.
            If None, `self.text_ids` is used when `texts` is also None;
            otherwise an integer index starting at 1 is applied.
        replacement (str): The replacement text recorded for each match.
        label (str): Label recorded for each match.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.

    Returns:
        list[IDEnt]: The result of `find_regex` for the address pattern.
    """

    # Explicit arguments take precedence; the instance's own texts are only a
    # fallback. Previously they silently replaced whatever the caller passed.
    if texts is None:
        texts = self.texts
        if text_ids is None:
            text_ids = self.text_ids
    elif text_ids is None:
        text_ids = range(1, len(texts) + 1)

    # Optional "flat N," prefix, house number (optionally followed by a comma),
    # street name, then a street-type word.
    pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?,?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
    return self.find_regex(
        texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
    )
513
566
 
514
567
  def get_spacy_model(self, model_name: str = "en_core_web_trf") -> Language:
515
568
  """
@@ -548,86 +601,69 @@ class IDScrub:
548
601
 
549
602
def spacy_entities(
    self,
    texts: list[str] = None,
    text_ids: list = None,
    model_name: str = "en_core_web_trf",
    entity_types: list[str] = ["PERSON", "ORG", "NORP"],
    replacement_map: dict = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
    priority: float = 1.0,
    n_process: int = 1,
    batch_size: int = 1000,
) -> list[IDEnt]:
    """
    Identify SpaCy entities in texts using a given SpaCy model.

    Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
    Note: only "en_core_web_trf" has been evaluated.

    Args:
        texts (list[str]): Strings to search. If None, `self.texts` is used.
        text_ids (list): A list of identifiers that correspond to each string in `texts`.
            If None, `self.text_ids` is used.
        model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
        entity_types (list[str]): Which SpaCy entity labels to keep (based on SpaCy entity keys).
        replacement_map (dict): Replacement text per entity type. Key is entity type, value is replacement.
            Entity types missing from a non-empty map get "[REDACTED]".
            An empty map yields "[<ENTITY_TYPE>]".
            Ignored entirely when `self.replacement` is set.
        priority (float): Priority score for overlapping entities.
            Higher scored entities are scrubbed where an overlap occurs.
            The scores are relative e.g. 0.2 beats 0.1.
        n_process (int): Number of parallel processes.
        batch_size (int): The number of texts in each batch.

    Returns:
        list[IDEnt]: A list of IDEnt objects, one per matching entity.

    Raises:
        ValueError: If `texts` or `text_ids` is neither passed nor set on the instance.
    """

    # The signature defaults are None, so fall back to instance state rather
    # than failing with an opaque TypeError when iterating None.
    if texts is None:
        texts = getattr(self, "texts", None)
    if text_ids is None:
        text_ids = getattr(self, "text_ids", None)
    if texts is None or text_ids is None:
        raise ValueError("texts and text_ids must be provided or set on self.")

    nlp = self.get_spacy_model(model_name)
    # Whitespace-only strings are collapsed to "" before being sent to the model.
    stripped_texts = [s.strip() if s.isspace() else s for s in texts]
    docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)

    idents = []

    for doc, text_id in zip(docs, text_ids):
        for ent in doc.ents:
            if ent.label_ not in entity_types:
                continue

            # An instance-wide replacement takes precedence over the per-type map.
            if self.replacement:
                replacement = self.replacement
            elif replacement_map:
                replacement = replacement_map.get(ent.label_, "[REDACTED]")
            else:
                replacement = f"[{ent.label_}]"

            idents.append(
                self.IDEnt(
                    text_id=text_id,
                    text=ent.text,
                    start=ent.start_char,
                    end=ent.end_char,
                    priority=priority,
                    replacement=replacement,
                    label=ent.label_.lower(),
                    source="spacy",
                )
            )

    return idents
631
667
 
632
668
  def get_hf_model(
633
669
  self,
@@ -666,41 +702,46 @@ class IDScrub:
666
702
 
667
703
  def huggingface_entities(
668
704
  self,
705
+ texts: list[str] = None,
706
+ text_ids: list = None,
707
+ entity_type="PER",
708
+ replacement: str = "[PERSON]",
709
+ label: str = "person",
710
+ priority: float = 1.0,
669
711
  hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
670
712
  download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
671
- entity="PER",
672
- replacement_text: str = "[PERSON]",
673
- label: str = "person",
674
- batch_size: int = 8,
675
- ) -> list[str]:
713
+ ) -> list[IDEnt]:
676
714
  """
677
- Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
715
+ Remove idents using a Hugging Face model. Default is a PERSON entity identifier.
678
716
  Note: No Hugging Face models have been evaluated for performance.
679
717
 
680
718
  Args:
719
+ texts (list[str]): Strings to scrub.
720
+ text_ids (list): A list of identifiers that correspond to each string in `texts`.
721
+ entity_type (str): Which entity to scrub (based on particular model keys).
722
+ If None, current cleaned state of `texts` passed at Class initiation used.
681
723
  hf_model_path (str): Path to the Hugging Face model.
682
724
  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
683
725
  download_directory (str): Directory in which to save the model.
684
726
  Default is current working directory.
685
- replacement_text (str): The replacement text for the removed text.
727
+ replacement (str): The replacement text for the removed text.
686
728
  label (str): Label for the personal data removed.
729
+ priority (float): Priority score for overlapping entities.
730
+ Higher scored entities are scrubbed where an overlap occurs.
731
+ The scores are relative e.g. 0.2 beats 0.1.
687
732
  batch_size (int): Number of texts passed to the model in each batch.
688
733
  Memory (instance size) dependent.
689
734
 
690
735
  Returns:
691
- list[str]: The input list of text with PERSON entities replaced.
736
+ list[str]: The input list of text with PERSON idents replaced.
692
737
 
693
738
  """
694
739
 
695
- self.logger.info(f"Scrubbing names using Hugging Face model ({hf_model_path})...")
740
+ if self.replacement:
741
+ replacement = self.replacement
696
742
 
697
743
  tokenizer = self.get_hf_model(hf_model_path=hf_model_path, download_directory=download_directory)
698
744
 
699
- texts = self.get_texts()
700
-
701
- if self.replacement_text:
702
- replacement_text = self.replacement_text
703
-
704
745
  try:
705
746
  names_model = AutoModelForTokenClassification.from_pretrained(hf_model_path)
706
747
  except OSError:
@@ -708,74 +749,72 @@ class IDScrub:
708
749
  f"Hugging Face model `{hf_model_path}` does has not been downloaded correctly. Please delete `huggingface/` and retry."
709
750
  )
710
751
 
711
- ner_pipeline = pipeline("ner", model=names_model, tokenizer=tokenizer, aggregation_strategy="simple")
712
- stripped_texts = [s.strip() if s.isspace() else s for s in texts]
713
- batched_entities = ner_pipeline(stripped_texts, batch_size=batch_size)
714
-
715
- cleaned_texts = []
716
-
717
- for i, (ids, stripped_text, entities) in enumerate(zip(self.text_ids, stripped_texts, batched_entities)):
718
- if stripped_text == "":
719
- cleaned_texts.append(texts[i])
720
- continue
721
-
722
- person_entities = [
723
- ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
724
- ]
725
- self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
726
-
727
- cleaned = stripped_text
728
- for ent in sorted(person_entities, key=lambda x: x["start"], reverse=True):
729
- cleaned = cleaned[: ent["start"]] + replacement_text + cleaned[ent["end"] :]
730
-
731
- cleaned_texts.append(cleaned)
732
-
733
- self.cleaned_texts = cleaned_texts
734
-
735
- self.log_message(label)
752
+ ner = pipeline(task="ner", model=names_model, tokenizer=tokenizer, aggregation_strategy="simple")
753
+
754
+ idents = []
755
+
756
+ results = ner(texts)
757
+
758
+ for ents, text_id in zip(results, text_ids):
759
+ for ent in ents:
760
+ if ent["entity_group"] != entity_type:
761
+ continue
762
+ idents.append(
763
+ self.IDEnt(
764
+ text_id=text_id,
765
+ text=ent["word"],
766
+ start=ent["start"],
767
+ end=ent["end"],
768
+ priority=priority,
769
+ replacement=replacement,
770
+ label=label,
771
+ source="huggingface",
772
+ )
773
+ )
736
774
 
737
- return cleaned_texts
775
+ return idents
738
776
 
739
777
  def presidio_entities(
740
778
  self,
779
+ texts: list[str] = None,
780
+ text_ids: list = None,
741
781
  model_name: str = "en_core_web_trf",
742
- entities: list[str] = [
782
+ entity_types: list[str] = [
743
783
  "PERSON",
784
+ "EMAIL_ADDRESS",
744
785
  "UK_NINO",
745
786
  "UK_NHS",
746
787
  "CREDIT_CARD",
747
788
  "CRYPTO",
748
789
  "MEDICAL_LICENSE",
749
- "URL",
790
+ "SWIFT_CODE",
750
791
  "IBAN_CODE",
792
+ "LOCATION",
793
+ "NRP",
751
794
  ],
752
- replacement_map: str = None,
753
- label_prefix: str = None,
754
- ) -> list[str]:
795
+ replacement_map: dict = {},
796
+ priority: float = 1.0,
797
+ ) -> list[IDEnt]:
755
798
  """
756
- Scrub specified entities from texts using Presidio.
799
+ Scrub specified idents from texts using Presidio.
757
800
 
758
801
  See https://microsoft.github.io/presidio/supported_entities/ for further detail.
759
802
 
760
803
  Args:
804
+ texts (list[str]): Strings to scrub.
805
+ text_ids (list): A list of identifiers that correspond to each string in `texts`.
806
+ If None, current cleaned state of `texts` passed at Class initiation used.
761
807
  model_name (str): spaCy model to use
762
- entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
763
- replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
764
- label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
765
- Useful if you wish to identify this having being scrubbed by Presidio.
808
+ entity_types (list[str]): entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
809
+ replacement_map (str): The replacement texts for the removed text. Key is entity type, value is replacement.
810
+ priority (float): Priority score for overlapping entities.
811
+ Higher scored entities are scrubbed where an overlap occurs.
812
+ The scores are relative e.g. 0.2 beats 0.1.
766
813
 
767
814
  Returns:
768
- list[str]: The input list of text with entities replaced.
815
+ list[str]: The input list of text with idents replaced.
769
816
  """
770
817
 
771
- self.logger.info(
772
- f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
773
- )
774
-
775
- texts = self.get_texts()
776
-
777
- cleaned_texts = []
778
-
779
818
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
780
819
  def __init__(self, loaded_spacy_model):
781
820
  super().__init__()
@@ -785,199 +824,320 @@ class IDScrub:
785
824
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)
786
825
 
787
826
  analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
788
- anonymizer = AnonymizerEngine()
789
827
 
790
- cleaned_texts = []
791
- all_labels = []
828
+ idents = []
792
829
 
793
- stripped_texts = [s.strip() if s.isspace() else s for s in texts]
830
+ for text, text_id in zip(texts, text_ids):
831
+ results = analyzer.analyze(text=text, language="en", entities=entity_types)
832
+ for res in results:
833
+ if res.entity_type not in entity_types:
834
+ continue
835
+
836
+ if self.replacement:
837
+ replacement = self.replacement
838
+ elif replacement_map:
839
+ replacement = replacement_map.get(res.entity_type, "[REDACTED]")
840
+ else:
841
+ replacement = f"[{res.entity_type}]"
842
+
843
+ idents.append(
844
+ self.IDEnt(
845
+ text_id=text_id,
846
+ text=text[res.start : res.end],
847
+ start=res.start,
848
+ end=res.end,
849
+ priority=priority,
850
+ replacement=replacement,
851
+ label=res.entity_type.lower(),
852
+ source="presidio",
853
+ )
854
+ )
794
855
 
795
- for i, (ids, stripped_text) in tqdm(enumerate(zip(self.text_ids, stripped_texts)), total=len(texts)):
796
- if stripped_text == "":
797
- cleaned_texts.append(texts[i])
798
- continue
856
+ return idents
799
857
 
800
- results = analyzer.analyze(text=stripped_text, language="en")
801
- results = [r for r in results if r.entity_type in entities]
858
def group_idents(self, idents: list[IDEnt]) -> dict[int | str | float, list[IDEnt]]:
    """
    Bucket IDEnt objects by the text they belong to.

    Every distinct `text_id` seen in `idents` becomes a key. The value is the
    list of IDEnt objects carrying that `text_id`, in their input order.

    Args:
        idents (list[IDEnt]): A list of IDEnt objects.

    Returns:
        dict[int | str | float, list[IDEnt]]: A `defaultdict(list)` mapping each text_id
            to its IDEnt objects.
    """

    by_text_id = defaultdict(list)

    for entry in idents:
        bucket = by_text_id[entry.text_id]
        bucket.append(entry)

    return by_text_id
833
878
 
834
- self.cleaned_texts = cleaned_texts
879
def resolve_overlaps(self, idents: list[IDEnt]) -> list[IDEnt]:
    """
    Select the highest-priority non-overlapping idents for each text.

    Idents are grouped by `text_id`. Within each group they are ranked by
    descending priority, then by ascending start position, and accepted greedily:
    an IDEnt is kept only if it does not overlap any IDEnt already kept for
    that text. The kept idents of each group are then returned in ascending
    start order.

    An IDEnt is considered overlapping if:
        IDEnt.start <= other.end and IDEnt.end >= other.start

    Note that this comparison is inclusive, so two spans that merely touch
    (one ends where the next starts) are treated as overlapping.

    Args:
        idents (list[IDEnt]): A list of IDEnt objects.

    Returns:
        list[IDEnt]: Non-overlapping idents. Groups appear in the order their
            `text_id` was first seen; within a group idents are sorted by start position.
    """

    resolved = []

    # A distinct loop name avoids shadowing the `idents` parameter. No early
    # return here: returning from inside the loop would throw away every
    # group resolved so far.
    for group in self.group_idents(idents).values():
        ranked = sorted(group, key=lambda ident: (-ident.priority, ident.start))

        kept = []

        for candidate in ranked:
            clashes = any(
                candidate.start <= chosen.end and candidate.end >= chosen.start for chosen in kept
            )

            if not clashes:
                kept.append(candidate)

        # `kept` is in priority order at this point; restore document order
        # so the result matches the documented contract.
        kept.sort(key=lambda ident: ident.start)
        resolved.extend(kept)

    return resolved
925
+
926
def scrub_text(self, texts: str = None, text_ids: list = None, idents: list[IDEnt] = None):
    """
    Replace every identified span in each text with its replacement string.

    For each text, the idents sharing its `text_id` are applied from the
    highest start offset to the lowest, so earlier offsets stay valid while
    later spans are swapped out. Each IDEnt replaces the characters from
    `IDEnt.start` to `IDEnt.end` with `IDEnt.replacement`.

    Args:
        texts (list[str]): The original input texts. Falls back to `self.texts` if None.
        text_ids (list): A list of identifiers that correspond to each string in `texts`.
            Falls back to `self.text_ids` if None.
        idents (list[IDEnt]): IDEnt objects to apply. Must be non-overlapping.
            See `resolve_overlaps`. Falls back to `self.idents` if None.

    Return:
        list[str]: One scrubbed string per input text, in input order.

    Raises:
        ValueError: If any argument is missing both as a parameter and on the
            instance, or if `texts` and `text_ids` differ in length.
    """

    texts = getattr(self, "texts", None) if texts is None else texts
    text_ids = getattr(self, "text_ids", None) if text_ids is None else text_ids
    idents = getattr(self, "idents", None) if idents is None else idents

    if any(value is None for value in (texts, text_ids, idents)):
        raise ValueError("texts, text_ids, and idents must be provided or set on self.")

    if len(texts) != len(text_ids):
        raise ValueError("texts and text_ids must be the same length.")

    by_text_id = self.group_idents(idents)
    scrubbed_texts = []

    for position, text_id in enumerate(text_ids):
        current = texts[position]
        spans = by_text_id.get(text_id, [])

        # Right-to-left so replacements of differing length never shift
        # the offsets of spans still to be applied.
        for span in sorted(spans, key=lambda ident: ident.start, reverse=True):
            current = current[: span.start] + span.replacement + current[span.end :]

        scrubbed_texts.append(current)

    return scrubbed_texts
929
971
 
930
- IDScrub.scrub(scrub_methods = ["email_addresses"])
972
def scrub(
    self,
    pipeline: list[dict] = [
        {"method": "presidio_entities"},
        {"method": "spacy_entities"},
        {"method": "email_addresses"},
        {"method": "handles"},
        {"method": "ip_addresses"},
        {"method": "uk_addresses"},
        {"method": "uk_phone_numbers"},
        {"method": "google_phone_numbers"},
        {"method": "uk_postcodes"},
        {"method": "urls"},
        {"method": "titles"},
    ],
):
    """
    Scrubs text using given methods.
    Uses default values for the given scrub method unless overridden in the step.

    Every step's method is called with `self.texts` and `self.text_ids` and
    its results are collected in `self.idents_all`. Idents whose text is in
    `self.exclude` are then dropped, overlaps are resolved, the surviving
    idents are stored in `self.idents`, and the scrubbed texts are stored in
    `self.scrubbed_texts`.

    Args:
        pipeline (list[dict]): Scrub methods and their method parameters to apply.
            Methods are specified with "method" key.
            Parameters are specified with argument name as key and argument value as value.

            Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}])

            See associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.

            Methods used by the default pipeline:

            "presidio_entities", "spacy_entities", "email_addresses", "handles",
            "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers",
            "uk_postcodes", "urls", "titles"

            Each method takes a `priority` argument. Higher priority scored entities
            are scrubbed where an overlap occurs. The scores are relative.

            A step naming an attribute that does not exist, or that is not callable,
            is logged as a warning and skipped.

    Returns:
        list[str]: The input texts scrubbed of personal data.

    Raises:
        TypeError: If `pipeline` is not a list.
    """

    if not isinstance(pipeline, list):
        raise TypeError("Argument `pipeline` must be a list of dicts.")

    self.idents_all = []
    self.idents = []

    for step in pipeline:
        scrub_method = step["method"]
        args = {k: v for k, v in step.items() if k != "method"}

        if args:
            self.logger.info(f"Scrubbing using {scrub_method} with parameters {args}...")
        else:
            self.logger.info(f"Scrubbing using {scrub_method} with default parameters...")

        method = getattr(self, scrub_method, None)
        if not callable(method):
            # Skip the step. Falling through would call an unbound name on
            # the first step, or re-run the previous step's method.
            self.logger.warning("Not a scrub method.")
            continue

        self.idents_all.extend(method(texts=self.texts, text_ids=self.text_ids, **args))

    idents_exclude = [ident for ident in self.idents_all if ident.text not in self.exclude]
    idents_resolved = self.resolve_overlaps(idents=idents_exclude)
    self.idents.extend(idents_resolved)
    self.scrubbed_texts = self.scrub_text(texts=self.texts, text_ids=self.text_ids, idents=self.idents)

    return self.scrubbed_texts
1043
+
1044
def get_all_identified_data(self) -> pd.DataFrame:
    """
    Return every identified entity, including those later dropped as overlaps.

    Reads `self.idents_all`. Each row is one identified entity and each
    column is one IDEnt attribute.

    Args:
        None
    Return:
        pd.DataFrame: All identified data and their attributes.
    """
    records = list(map(asdict, self.idents_all))
    return pd.DataFrame(records)
1057
+
1058
def get_scrubbed_data(self) -> pd.DataFrame:
    """
    Summarise the scrubbed idents as one row per text ID and one column per label.

    Reads `self.idents`. Each cell holds the list of IDEnt text values found
    for that text ID under that label. Cells with no value are set to None.
    The ID column is named after `self.text_id_name`.

    Args:
        None
    Return:
        pd.DataFrame: All data scrubbed from text.
    """
    found = {}

    for entry in self.idents:
        per_label = found.setdefault(entry.text_id, {})
        per_label.setdefault(entry.label, []).append(entry.text)

    table = pd.DataFrame.from_dict(found, orient="index")
    table = table.reset_index().rename(columns={"index": self.text_id_name})

    return table.where(pd.notna(table), None)
948
1079
 
949
1080
  @staticmethod
950
1081
  def dataframe(
951
1082
  df: pd.DataFrame = None,
952
1083
  id_col: str = None,
953
1084
  exclude_cols: list = None,
954
- scrub_methods: list[str] = ["all"],
1085
+ pipeline: list[dict] = [
1086
+ {"method": "presidio_entities"},
1087
+ {"method": "spacy_entities"},
1088
+ {"method": "email_addresses"},
1089
+ {"method": "handles"},
1090
+ {"method": "ip_addresses"},
1091
+ {"method": "uk_addresses"},
1092
+ {"method": "uk_phone_numbers"},
1093
+ {"method": "google_phone_numbers"},
1094
+ {"method": "uk_postcodes"},
1095
+ {"method": "urls"},
1096
+ {"method": "titles"},
1097
+ ],
955
1098
  ) -> tuple[pd.DataFrame, pd.DataFrame]:
956
1099
  """
957
1100
  Scrubs all personal data from a Pandas Dataframe.
958
1101
 
959
1102
  Args:
960
1103
  df (pd.DataFrame): A Pandas dataframe to scrub.
961
- id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
1104
+ id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `text_id` is applied.
962
1105
  exclude_cols (list): Columns to exclude from scrubbing. if None all columns are scrubbed.
963
- scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
964
- These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
1106
+ pipeline (list[dict]): Scrub methods and their method parameters to apply.
1107
+ Methods are specified with "method" key.
1108
+
1109
+ Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"])
1110
+
1111
+ See associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.
1112
+
1113
+ Methods available:
1114
+
1115
+ "spacy_entities", "huggingface_entities", "email_addresses", "handles",
1116
+ "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes"
1117
+ "titles", "presidio_entities"
1118
+
1119
+ Each method takes a `priority` argument. Higher priority scored entities
1120
+ are scrubbed where an overlap occurs. The scores are relative.
965
1121
 
966
1122
  Returns:
967
1123
  tuple[pd.DataFrame, pd.DataFrame]: The input dataframe with all personal data removed and a dataframe with the personal data that has been removed.
968
1124
 
969
1125
  """
970
1126
 
971
- assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
1127
+ if not isinstance(df, pd.DataFrame):
1128
+ raise TypeError("`df` must be a Pandas DataFrame.")
972
1129
 
973
- if id_col:
974
- ids = df[id_col].to_list()
975
- if not id_col:
976
- id_col = "id"
1130
+ if id_col is None:
977
1131
  ids = range(1, len(df) + 1)
1132
+ id_col = "id"
1133
+ else:
1134
+ if id_col not in df.columns:
1135
+ raise ValueError(f"`id_col` '{id_col}' is not a column in df.")
978
1136
 
979
- assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
980
- assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
1137
+ ids = df[id_col].tolist()
1138
+
1139
+ if not len(df) == len(ids):
1140
+ raise ValueError("Length of dataframe is different to the length of IDs.")
981
1141
 
982
1142
  if exclude_cols is None:
983
1143
  cols_to_scrub = df.columns.to_list()
@@ -994,16 +1154,17 @@ class IDScrub:
994
1154
  original_dtype = scrubbed_df[col].dtype
995
1155
  scrubbed_df[col] = scrubbed_df[col].astype(str)
996
1156
 
997
- scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_id_name=id_col, text_ids=ids)
1157
+ scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_ids=ids)
998
1158
  scrub.logger.info(f"Scrubbing column `{col}`...")
999
1159
 
1000
- scrubbed_texts = scrub.scrub(scrub_methods)
1160
+ scrubbed_texts = scrub.scrub(pipeline=pipeline)
1001
1161
  scrubbed_df[col] = scrubbed_texts
1002
1162
 
1003
1163
  scrubbed_data = scrub.get_scrubbed_data()
1004
1164
 
1005
1165
  if scrubbed_data is not None:
1006
1166
  scrubbed_data.insert(1, "column", col)
1167
+ scrubbed_data.rename(columns={"text_id": id_col}, inplace=True)
1007
1168
  all_scrubbed_data.append(scrubbed_data)
1008
1169
 
1009
1170
  try:
@@ -1013,8 +1174,14 @@ class IDScrub:
1013
1174
  pass
1014
1175
 
1015
1176
  all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
1177
+ all_scrubbed_data["column"] = pd.Categorical(
1178
+ all_scrubbed_data["column"], categories=cols_to_scrub, ordered=True
1179
+ )
1180
+ all_scrubbed_data = all_scrubbed_data.sort_values(by=["column", id_col]).reset_index(drop=True)
1181
+ all_scrubbed_data["column"] = all_scrubbed_data["column"].astype(str)
1016
1182
  all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)
1017
1183
 
1018
- assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
1184
+ if not df.shape == scrubbed_df.shape:
1185
+ raise ValueError("Original and scrubbed dataframe not the same shape. Check input DataFrame.")
1019
1186
 
1020
1187
  return scrubbed_df, all_scrubbed_data