idscrub 1.1.2-py3-none-any.whl → 2.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
idscrub/scrub.py CHANGED
@@ -2,8 +2,9 @@ import logging
  import os
  import re
  import warnings
+ from collections import defaultdict
  from collections.abc import Iterable
- from functools import partial
+ from dataclasses import asdict, dataclass

  import pandas as pd
  import phonenumbers
@@ -11,8 +12,6 @@ import spacy
  from huggingface_hub.utils import HFValidationError
  from presidio_analyzer import AnalyzerEngine
  from presidio_analyzer.nlp_engine import SpacyNlpEngine
- from presidio_anonymizer import AnonymizerEngine
- from presidio_anonymizer.entities import OperatorConfig
  from spacy.cli import download
  from spacy.language import Language
  from tqdm import tqdm
@@ -29,12 +28,44 @@ trf_logging.set_verbosity_error()


  class IDScrub:
+     """
+     Class for identifying and scrubbing entities in text.
+     """
+
+     @dataclass
+     class IDEnt:
+         """
+         Structured representation of an identified entity (ident) within text.
+
+         Attributes:
+             text_id (str | int | float): A unique identifier for the original text.
+             text (str): The exact substring extracted from the original text.
+             start (int): The starting character offset of the ident within the original text.
+             end (int): The ending character offset of the ident within the original text.
+             label (str): The ident type (e.g. 'person').
+             replacement (str): The text that should replace this ident during scrubbing.
+             priority (float): Priority score for overlapping idents.
+                 Higher scored idents are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.
+             source (str): The source model or method that identified the ident.
+         """
+
+         text_id: str | int | float
+         text: str
+         start: int
+         end: int
+         label: str
+         replacement: str
+         priority: float
+         source: str
+
      def __init__(
          self,
-         texts: list[str] = [],
+         texts: list[str] = None,
          text_ids: list | Iterable = None,
          text_id_name: str = "text_id",
-         replacement_text: str = None,
+         replacement: str = None,
+         exclude: list[str] = [],
          verbose: bool = True,
      ):
          """
@@ -46,32 +77,37 @@ class IDScrub:
                  such as the ID column in a DataFrame. If None, an integer index starting at 1 is applied.
                  This is used to identify texts in get_scrubbed_data().
              text_id_name (str): Name of the ID column in get_scrubbed_data(). Default is `text_id`.
-             replacement_text (str): A global string to replace every scrubbed
-                 string with.
+             replacement (str): A global string to replace every scrubbed string with.
+             exclude (list[str]): A list of strings that will not be scrubbed if identified.
              verbose (bool): Whether to show all log messages or only warnings.
          """

-         assert isinstance(texts, list) and all(isinstance(text, str) for text in texts), (
-             "`texts` can only be a list of strings or a single string in a list."
-         )
+         if texts is None:
+             texts = []
+         if not isinstance(texts, list):
+             raise TypeError("`texts` must be a list.")
+         if not all(isinstance(text, str) for text in texts):
+             raise TypeError("`texts` must be a list of strings.")

-         assert isinstance(replacement_text, str) or isinstance(replacement_text, type(None)), (
-             "`replacement_text` can only be string."
-         )
+         if replacement is not None and not isinstance(replacement, str):
+             raise TypeError("`replacement` must be a string or None.")

          self.texts = texts

-         if text_ids:
-             self.text_ids = text_ids
-         else:
-             self.text_ids = range(1, len(self.texts) + 1)
+         if text_ids is None:
+             text_ids = range(1, len(self.texts) + 1)

-         assert len(self.texts) == len(self.text_ids), "Length of texts is different to the length of text IDs."
+         if not len(self.texts) == len(text_ids):
+             raise ValueError("Length of texts is different to the length of text IDs.")

+         self.text_ids = text_ids
+
+         self.replacement = replacement
          self.text_id_name = text_id_name
-         self.cleaned_texts = []
-         self.scrubbed_data = []
-         self.replacement_text = replacement_text
+         self.exclude = exclude
+         self.scrubbed_texts = []
+         self.idents: list[IDScrub.IDEnt] = []
+
+         self.hf_ner = None
+         self.spacy_docs = None

          self.logger = logging.getLogger(self.__class__.__name__)
          self.logger.setLevel(logging.DEBUG if verbose else logging.WARNING)
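Note on the new constructor: 2.0.1 stores structured state (idents, scrubbed_texts, exclude) instead of 1.x's cleaned_texts. A minimal usage sketch, assuming IDScrub is importable from idscrub.scrub (as the file path suggests) and with illustrative inputs:

    from idscrub.scrub import IDScrub

    # `exclude` lists exact strings that must never be scrubbed, even if identified.
    scrub = IDScrub(
        texts=["Email john@example.com please", "Call +44 20 7946 0958"],
        text_ids=["a", "b"],
        exclude=["john@example.com"],
    )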
@@ -84,284 +120,295 @@ class IDScrub:

          self.logger.info("Texts loaded.")

-     def get_texts(self) -> list[str]:
-         """
-         Get the text that will be processed.
-         If no cleaning has occured, then use the raw input
-         texts. If cleaning has occured then update the cleaned texts.
-
-         Args:
-             None
-
-         Returns:
-             A Pandas DataFrame with text_id
-             and scrubbed in a list format.
-         """
-         if self.cleaned_texts:
-             texts = self.cleaned_texts
-         else:
-             texts = self.texts
-
-         return texts
-
-     def get_scrubbed_data(self) -> pd.DataFrame:
+     def find_regex(
+         self,
+         texts: list[str],
+         text_ids: list,
+         pattern: str,
+         replacement: str,
+         label: str,
+         priority: float,
+     ) -> list[IDEnt]:
          """
-         Turn text ids and scrubbed text into a DataFrame.
+         General method to identify entities in text using a regex pattern.

          Args:
-             None
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+             pattern (str): Regex pattern to apply.
+             replacement (str): The replacement text for the removed text.
+             label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             A Pandas DataFrame with text_id
-             and scrubbed in a list format.
+             list[IDEnt]: A list of IDEnt objects.
          """
-         df = pd.DataFrame(self.scrubbed_data)
-
-         if self.text_id_name not in df.columns:
-             return None

-         # Group by the id and aggregate non-null values into lists
-         if df[self.text_id_name].dtype == object or df[self.text_id_name].dtype == str:
-             grouped = (
-                 df.groupby(self.text_id_name, sort=False)
-                 .agg(lambda x: [i for i in x if pd.notna(i)])
-                 .reset_index()
-                 .map(lambda x: None if isinstance(x, list) and len(x) == 0 else x)
-             )
-         else:
-             grouped = (
-                 df.groupby(self.text_id_name)
-                 .agg(lambda x: [i for i in x if pd.notna(i)])
-                 .reset_index()
-                 .map(lambda x: None if isinstance(x, list) and len(x) == 0 else x)
-             )
+         if self.replacement:
+             replacement = self.replacement
+
+         compiled = re.compile(pattern, re.IGNORECASE)
+         idents = []
+
+         for text_id, text in zip(text_ids, texts):
+             for match in compiled.finditer(text):
+                 idents.append(
+                     self.IDEnt(
+                         text_id=text_id,
+                         text=match.group(),
+                         start=match.start(),
+                         end=match.end(),
+                         label=label,
+                         replacement=replacement,
+                         priority=priority,
+                         source="regex",
+                     )
+                 )

-         return grouped
+         return idents

-     def log_message(self, label) -> None:
+     def custom_regex(
+         self, texts: list[str] = None, text_ids: list = None, patterns: dict = None, source: str = "custom_regex"
+     ) -> list[IDEnt]:
          """
-         Log message with count of PII-type scrubbed.
+         Identify text matching custom regex patterns.

          Args:
-             label (str): Label for the personal data removed.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+             patterns (dict): {"name": {"pattern": r"John", "replacement": "[NAME]", "priority": 0.5}}
+             source (str): The methodological source of the scrubbed ident.
          Returns:
-             int: The count of PII-type scrubbed.
-         """
-
-         if any(label in key for key in self.scrubbed_data):
-             scrubbed_data = self.get_scrubbed_data()
-             count = scrubbed_data[label].dropna().apply(len).sum()
-         else:
-             count = 0
-
-         self.logger.info(f"{count} {label} scrubbed.")
-
-         return count
-
-     def scrub_and_collect(self, match, text, replacement_text, i, label) -> str:
-         """
-         Scrub pattern match and collect scrubbed name.
-
-         Args:
-             match (str): The regex match passed from `re.sub()`.
-             i (int): the enumerate id of the string.
-             label (str): Label for the personal data removed.
+             list[IDEnt]: A list of IDEnt objects.

-         Returns:
-             str: The replacement text.

          """

-         self.scrubbed_data.append({self.text_id_name: i, label: match.group()})
+         idents = []
+
+         for text, text_id in zip(texts, text_ids):
+             for label, params in patterns.items():
+                 pattern = params["pattern"]
+                 replacement = params.get("replacement", "[REDACTED]")
+                 priority = params.get("priority", 0.5)
+
+                 compiled = re.compile(pattern, flags=re.IGNORECASE)
+
+                 for match in compiled.finditer(text):
+                     idents.append(
+                         self.IDEnt(
+                             text_id=text_id,
+                             text=match.group(),
+                             start=match.start(),
+                             end=match.end(),
+                             label=label,
+                             replacement=replacement,
+                             priority=priority,
+                             source=source,
+                         )
+                     )

-         return replacement_text
+         return idents

-     def scrub_regex(self, pattern, replacement_text, label) -> list[str]:
+     def email_addresses(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[EMAIL_ADDRESS]",
+         label: str = "email_address",
+         priority: float = 0.7,
+     ) -> list[IDEnt]:
          """
-         General method to clean text using a regex pattern.
+         Identify email addresses using regex, e.g. `johnsmith@mail.com`.

          Args:
-             pattern (str): Regex pattern to apply.
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: Cleaned texts.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         texts = self.get_texts()
-
-         compiled_pattern = re.compile(pattern, flags=re.IGNORECASE)
-
-         if self.replacement_text:
-             replacement_text = self.replacement_text
-
-         cleaned_texts = [
-             compiled_pattern.sub(
-                 partial(
-                     self.scrub_and_collect,
-                     text=text,
-                     replacement_text=replacement_text,
-                     i=i,
-                     label=label,
-                 ),
-                 text,
-             )
-             for i, text in zip(self.text_ids, texts)
-         ]
-
-         self.cleaned_texts = cleaned_texts
-
-         self.log_message(label)
-
-         return cleaned_texts
+         pattern = r"\b\S+@\S+\.\S+\b"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-     def custom_regex(
+     def urls(
          self,
-         custom_regex_patterns: list[str] = None,
-         custom_replacement_texts: list[str] = None,
-         labels: list[str] = None,
-     ) -> list[str]:
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[URL]",
+         label: str = "url",
+         priority: float = 0.3,
+     ) -> list[IDEnt]:
          """
-         Remove text matching a custom regex pattern.
-
-         Args:
-             custom_regex_patterns list[str]: Regex(s) pattern to apply.
-             custom_replacement_texts list[str]: The replacement texts for the removed text.
-                 Defaults to '[REDACTED]' for all.
-             labels list[str]: Labels for patterns removed.
-
-         Returns:
-             list[str]: Cleaned texts.
-
-         """
-         self.logger.info("Scrubbing custom regex...")
-
-         if custom_replacement_texts:
-             assert len(custom_regex_patterns) == len(custom_replacement_texts), (
-                 "There must be a replacement text for each pattern."
-             )
-         else:
-             custom_replacement_texts = ["[REDACTED]"] * len(custom_regex_patterns)
-
-         for i, (pattern, replacement_text) in enumerate(zip(custom_regex_patterns, custom_replacement_texts)):
-             if labels:
-                 assert len(custom_regex_patterns) == len(labels), "There must be a label for each pattern."
-                 self.scrub_regex(pattern, replacement_text, label=f"{labels[i]}")
-             else:
-                 self.scrub_regex(pattern, replacement_text, label=f"custom_regex_{i + 1}")
+         Identify `http`, `https` and `www` URLs using regex, e.g. `www.google.com`.

-         return self.cleaned_texts
-
-     def email_addresses(self, replacement_text: str = "[EMAIL_ADDRESS]", label: str = "email_address") -> list[str]:
-         """
-         Remove email addresses using regex.
-         e.g. `johnsmith@gmail.com` scrubbed
+         `example.com` will not be scrubbed by this method.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with email addresses replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing email addresses using regex...")
-         pattern = r"\b\S+@\S+\.\S+\b"
-
-         return self.scrub_regex(pattern, replacement_text, label=label)
+         pattern = r"\b(?:https?://|www\.)[^\s<>()\"']+"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-     def handles(self, replacement_text: str = "[HANDLE]", label: str = "handle") -> list[str]:
+     def handles(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[HANDLE]",
+         label: str = "handle",
+         priority: float = 0.4,
+     ) -> list[IDEnt]:
          """
-         Remove `@` user handles using regex
-         e.g. `@username` scrubbed
+         Identify `@` user handles using regex, e.g. `@username`.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with handles replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing @user handles using regex...")
          pattern = r"@[\w.-]+(?=\b)"
-
-         return self.scrub_regex(pattern, replacement_text, label=label)
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

      def google_phone_numbers(
-         self, region: str = "GB", replacement_text: str = "[PHONENO]", label: str = "phone_number"
-     ) -> list[str]:
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         region: str = "GB",
+         replacement: str = "[PHONENO]",
+         label: str = "phone_number",
+         priority: float = 0.8,
+     ) -> list[IDEnt]:
          """
-         Remove phone numbers using Google's `phonenumbers`.
-         e.g. `+441234567891` scrubbed
+         Identify phone numbers using Google's `phonenumbers`, e.g. `+441234567891`.

          Args:
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
              region (str): The region to find phone numbers for. See `phonenumbers` regions.
-             replacement_text (str): The replacement text for the removed text.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with phone numbers replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info(f"Scrubbing {region} phone numbers using Google's `phonenumbers`...")
-
-         texts = self.get_texts()
+         if self.replacement:
+             replacement = self.replacement

-         if self.replacement_text:
-             replacement_text = self.replacement_text
+         idents = []

-         cleaned_texts = []
-
-         for i, text in zip(self.text_ids, texts):
+         for text, text_id in zip(texts, text_ids):
              matches = list(phonenumbers.PhoneNumberMatcher(text, region))
-             phone_nos = [match.raw_string for match in matches]
-
-             for phone_no in phone_nos:
-                 self.scrubbed_data.append({self.text_id_name: i, label: phone_no})
-
-             cleaned = text
-             for match in reversed(matches):
-                 cleaned = cleaned[: match.start] + replacement_text + cleaned[match.end :]
-
-             cleaned_texts.append(cleaned)
-
-         self.cleaned_texts = cleaned_texts
-
-         self.log_message(label)
+             for match in matches:
+                 idents.append(
+                     self.IDEnt(
+                         text_id=text_id,
+                         text=match.raw_string,
+                         start=match.start,
+                         end=match.end,
+                         priority=priority,
+                         replacement=replacement,
+                         label=label,
+                         source="google_phone_numbers",
+                     )
+                 )

-         return cleaned_texts
+         return idents

-     def uk_phone_numbers(self, replacement_text: str = "[PHONENO]", label: str = "uk_phone_number") -> list[str]:
+     def uk_phone_numbers(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[PHONENO]",
+         label: str = "uk_phone_number",
+         priority: float = 0.8,
+     ) -> list[IDEnt]:
          """
-         Remove phone numbers using regex.
-         e.g. `+441234567891` scrubbed
+         Identify phone numbers using regex, e.g. `+441234567891`.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with phone numbers replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing phone numbers using regex...")
          pattern = r"(\+?\d[\d\s]{7,}\d)"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-         return self.scrub_regex(pattern, replacement_text, label=label)
-
-     def titles(self, strict: bool = False, replacement_text: str = "[TITLE]", label: str = "title") -> list[str]:
+     def titles(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         strict: bool = False,
+         replacement: str = "[TITLE]",
+         label: str = "title",
+         priority: float = 0.4,
+     ) -> list[IDEnt]:
          """
          Remove titles using regex.

          Args:
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
              strict (bool): Whether to use all of the titles or only essential titles.
                  If strict, you may find scrubbing of common words, such as general.
-             replacement_text (str): The replacement text for the removed text.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with names after titles replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

          titles = [
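The finder methods introduced above return IDEnt records rather than rewriting text; nothing is replaced until scrub_text() runs. Continuing the earlier sketch, the patterns schema that the custom_regex docstring describes looks like this (pattern, replacement and priority values are illustrative):

    patterns = {
        "name": {"pattern": r"\bJohn\b", "replacement": "[NAME]", "priority": 0.5},
        "case_ref": {"pattern": r"\b[A-Z]{2}\d{6}\b", "replacement": "[CASE_REF]", "priority": 0.6},
    }
    # Returns a list of IDEnt records, one per regex match.
    idents = scrub.custom_regex(texts=scrub.texts, text_ids=scrub.text_ids, patterns=patterns)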
@@ -413,103 +460,109 @@ class IDScrub:
          titles += [title + "." for title in titles]
          titles += [title + ":" for title in titles]

-         self.logger.info("Scrubbing titles using regex...")
          pattern = r"\b(?:{})\b".format("|".join(re.escape(t) for t in titles))
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-         return self.scrub_regex(pattern, replacement_text, label=label)
-
-     def ip_addresses(self, replacement_text: str = "[IPADDRESS]", label: str = "ip_address") -> list[str]:
+     def ip_addresses(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[IPADDRESS]",
+         label: str = "ip_address",
+         priority: float = 0.5,
+     ) -> list[IDEnt]:
          """
-         Removes IP addresses.
-         e.g. `192.168.1.1` scrubbed
+         Identifies IP addresses, e.g. `192.168.1.1`.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
+             label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with IP addresses replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing IP addresses using regex...")
          pattern = r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-         return self.scrub_regex(pattern, replacement_text, label=label)
-
-     def uk_postcodes(self, replacement_text: str = "[POSTCODE]", label: str = "uk_postcode") -> list[str]:
+     def uk_postcodes(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[POSTCODE]",
+         label: str = "uk_postcode",
+         priority: float = 0.5,
+     ) -> list[IDEnt]:
          """
-         Removes postcodes.
-         e.g. `A11 1AA` scrubbed
+         Identifies UK postcodes, e.g. `A11 1AA`.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with postcodes replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing postcodes using regex...")
          pattern = r"\b(?:(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)[ \t]*[0-9][A-Z]{2}|GIR[ \t]*0A{2}|SAN[ \t]*TA1|ASCN[ \t]*1ZZ|STHL[ \t]*1ZZ|TDCU[ \t]*1ZZ|BBND[ \t]*1ZZ|[BFS]IQ{2}[ \t]*1ZZ|GX11[ \t]*1AA|PCRN[ \t]*1ZZ|TKCA[ \t]*1ZZ|AI-?[0-9]{4}|BFPO[ \t-]?[0-9]{2,4}|MSR[ \t-]?1(?:1[12]|[23][135])0|VG[ \t-]?11[1-6]0|KY[1-3][ \t-]?[0-2][0-9]{3})\b"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

-         return self.scrub_regex(pattern, replacement_text, label=label)
-
-     def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
+     def uk_addresses(
+         self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         replacement: str = "[ADDRESS]",
+         label: str = "uk_address",
+         priority: float = 0.8,
+     ) -> list[IDEnt]:
          """
          Removes addresses.
-         e.g. `10 Downing Street` scrubbed
+         e.g. `10 Downing Street` and `10, Downing Street` identified.

          Args:
-             replacement_text (str): The replacement text for the removed text.
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

-         Returns:
-             list[str]: The input list of text with postcodes replaced.
-         """
-
-         self.logger.info("Scrubbing addresses using regex...")
-         pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
-
-         return self.scrub_regex(pattern, replacement_text, label)
-
-     def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
-         """
-         Removes claimant names from employment tribunal texts.
-         e.g. `Claimant: Jim Smith` scrubbed
-
-         Args:
-             None
          Returns:
-             list[str]: The input list of text with claimants replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info("Scrubbing claimants using regex...")
-
-         texts = self.get_texts()
-
-         claimant_name = None
-
-         cleaned_texts = []
-
-         for i, text in zip(self.text_ids, texts):
-
-             def replace_claimant(match):
-                 nonlocal claimant_name
-                 claimant_name = match.group(2).strip()
-                 return f"{match.group(1)}[CLAIMANT] "
-
-             cleaned = re.sub(r"[\r\n]", " ", text)
-
-             cleaned = re.sub(r"(Claimant\s*:\s*)(.*?)(?=\bRespondents?\s*:)", replace_claimant, cleaned)
-
-             if claimant_name:
-                 cleaned = re.sub(re.escape(claimant_name), replacement_text, cleaned)
-                 self.scrubbed_data.append({self.text_id_name: i, label: claimant_name})
-
-             cleaned_texts.append(cleaned)
-
-         self.cleaned_texts = cleaned_texts
+         if texts is None:
+             texts = self.texts
+         if text_ids is None:
+             text_ids = self.text_ids

-         return cleaned_texts
+         pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?,?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
+         return self.find_regex(
+             texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+         )

      def get_spacy_model(self, model_name: str = "en_core_web_trf") -> Language:
          """
@@ -548,86 +601,69 @@ class IDScrub:

      def spacy_entities(
          self,
+         texts: list[str] = None,
+         text_ids: list = None,
          model_name: str = "en_core_web_trf",
-         entities: list[str] = ["PERSON", "ORG", "NORP"],
-         replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
-         label_prefix: str = None,
+         entity_types: list[str] = ["PERSON", "ORG", "NORP"],
+         replacement_map: dict = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
+         priority: float = 1.0,
          n_process: int = 1,
          batch_size: int = 1000,
-     ) -> list[str]:
+     ) -> list[IDEnt]:
          """
-         Remove SpaCy entities using a given SpaCy model.
+         Identify SpaCy idents using a given SpaCy model.
          Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
          Note: only "en_core_web_trf" has been evaluated.

          Args:
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
              model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
-             entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
-             replacement_map (str): The replacement texts for the removed text. Index will match `entities`.
-             label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
+             entity_types (list[str]): Which SpaCy idents to scrub (based on SpaCy entity keys).
+             replacement_map (dict): The replacement texts for the removed text. Key is entity type, value is replacement.
              n_process (int): Number of parallel processes.
              batch_size (int): The number of texts in each batch.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with PERSON entities scrubbed.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info(
-             f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
-         )
-
-         texts = self.get_texts()
+         nlp = self.get_spacy_model(model_name)
+         stripped_texts = ["" if s.strip() == "" else s for s in texts]
+         docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)

-         cleaned_texts = []
-         labels = []
+         idents = []

-         nlp = self.get_spacy_model(model_name)
-         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
-         documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
-
-         for i, (ids, doc, stripped_text) in tqdm(
-             enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
-         ):
-             if not stripped_text:
-                 cleaned_texts.append(texts[i])
-                 continue
-
-             all_found_entities = []
-
-             for entity_type in entities:
-                 found = [
-                     ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
-                 ]
-
-                 for ent in found:
-                     label = ent.label_.lower()
-                     if label_prefix:
-                         label = f"{label_prefix}_{label}"
-                     labels.append(label)
-                     self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
-
-                 if self.replacement_text:
-                     all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
+         for doc, text_id in zip(docs, text_ids):
+             for ent in doc.ents:
+                 if ent.label_ not in entity_types:
+                     continue
+                 if self.replacement:
+                     replacement = self.replacement
                  elif replacement_map:
-                     all_found_entities.extend(
-                         (ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
-                     )
+                     replacement = replacement_map.get(ent.label_, "[REDACTED]")
                  else:
-                     all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
-
-             cleaned = stripped_text
-
-             for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
-                 cleaned = cleaned[:start] + repl + cleaned[end:]
-
-             cleaned_texts.append(cleaned)
-
-         self.cleaned_texts = cleaned_texts
-
-         for label in set(labels):
-             self.log_message(label)
+                     replacement = f"[{ent.label_}]"
+
+                 idents.append(
+                     self.IDEnt(
+                         text_id=text_id,
+                         text=ent.text,
+                         start=ent.start_char,
+                         end=ent.end_char,
+                         priority=priority,
+                         replacement=replacement,
+                         label=ent.label_.lower(),
+                         source="spacy",
+                     )
+                 )

-         return cleaned_texts
+         return idents

      def get_hf_model(
          self,
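A sketch of calling the spaCy finder directly, restricted to PERSON and given the top priority so it outranks the 0.3-0.8 regex finders on overlap (arguments follow the signature above, continuing the earlier sketch):

    person_idents = scrub.spacy_entities(
        texts=scrub.texts,
        text_ids=scrub.text_ids,
        entity_types=["PERSON"],
        replacement_map={"PERSON": "[PERSON]"},
        priority=1.0,
    )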
@@ -666,41 +702,46 @@ class IDScrub:

      def huggingface_entities(
          self,
+         texts: list[str] = None,
+         text_ids: list = None,
+         entity_type="PER",
+         replacement: str = "[PERSON]",
+         label: str = "person",
+         priority: float = 1.0,
          hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
          download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
-         entity="PER",
-         replacement_text: str = "[PERSON]",
-         label: str = "person",
-         batch_size: int = 8,
-     ) -> list[str]:
+     ) -> list[IDEnt]:
          """
-         Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
+         Identify idents using a Hugging Face model. Default is a PERSON entity identifier.
          Note: No Hugging Face models have been evaluated for performance.

          Args:
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
+             entity_type (str): Which entity to scrub (based on particular model keys).
              hf_model_path (str): Path to the Hugging Face model.
                  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
              download_directory (str): Directory in which to save the model.
                  Default is current working directory.
-             replacement_text (str): The replacement text for the removed text.
+             replacement (str): The replacement text for the removed text.
              label (str): Label for the personal data removed.
-             batch_size (int): Number of texts passed to the model in each batch.
-                 Memory (instance size) dependent.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with PERSON entities replaced.
+             list[IDEnt]: A list of IDEnt objects.

          """

-         self.logger.info(f"Scrubbing names using Hugging Face model ({hf_model_path})...")
+         if self.replacement:
+             replacement = self.replacement

          tokenizer = self.get_hf_model(hf_model_path=hf_model_path, download_directory=download_directory)

-         texts = self.get_texts()
-
-         if self.replacement_text:
-             replacement_text = self.replacement_text
-
          try:
              names_model = AutoModelForTokenClassification.from_pretrained(hf_model_path)
          except OSError:
@@ -708,74 +749,72 @@ class IDScrub:
                  f"Hugging Face model `{hf_model_path}` has not been downloaded correctly. Please delete `huggingface/` and retry."
              )

-         ner_pipeline = pipeline("ner", model=names_model, tokenizer=tokenizer, aggregation_strategy="simple")
-         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
-         batched_entities = ner_pipeline(stripped_texts, batch_size=batch_size)
-
-         cleaned_texts = []
-
-         for i, (ids, stripped_text, entities) in enumerate(zip(self.text_ids, stripped_texts, batched_entities)):
-             if stripped_text == "":
-                 cleaned_texts.append(texts[i])
-                 continue
-
-             person_entities = [
-                 ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
-             ]
-             self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
-
-             cleaned = stripped_text
-             for ent in sorted(person_entities, key=lambda x: x["start"], reverse=True):
-                 cleaned = cleaned[: ent["start"]] + replacement_text + cleaned[ent["end"] :]
-
-             cleaned_texts.append(cleaned)
-
-         self.cleaned_texts = cleaned_texts
-
-         self.log_message(label)
+         ner = pipeline(task="ner", model=names_model, tokenizer=tokenizer, aggregation_strategy="simple")
+
+         idents = []
+
+         results = ner(texts)
+
+         for ents, text_id in zip(results, text_ids):
+             for ent in ents:
+                 if ent["entity_group"] != entity_type:
+                     continue
+                 idents.append(
+                     self.IDEnt(
+                         text_id=text_id,
+                         text=ent["word"],
+                         start=ent["start"],
+                         end=ent["end"],
+                         priority=priority,
+                         replacement=replacement,
+                         label=label,
+                         source="huggingface",
+                     )
+                 )

-         return cleaned_texts
+         return idents

      def presidio_entities(
          self,
+         texts: list[str] = None,
+         text_ids: list = None,
          model_name: str = "en_core_web_trf",
-         entities: list[str] = [
+         entity_types: list[str] = [
              "PERSON",
+             "EMAIL_ADDRESS",
              "UK_NINO",
              "UK_NHS",
              "CREDIT_CARD",
              "CRYPTO",
              "MEDICAL_LICENSE",
-             "URL",
+             "SWIFT_CODE",
              "IBAN_CODE",
+             "LOCATION",
+             "NRP",
          ],
-         replacement_map: str = None,
-         label_prefix: str = None,
-     ) -> list[str]:
+         replacement_map: dict = {},
+         priority: float = 1.0,
+     ) -> list[IDEnt]:
          """
-         Scrub specified entities from texts using Presidio.
+         Identify specified idents in texts using Presidio.

          See https://microsoft.github.io/presidio/supported_entities/ for further detail.

          Args:
+             texts (list[str]): Strings to scrub.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                 If None, the `texts` passed at class initiation are used.
              model_name (str): spaCy model to use
-             entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
-             replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
-             label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
-                 Useful if you wish to identify this having being scrubbed by Presidio.
+             entity_types (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
+             replacement_map (dict): The replacement texts for the removed text. Key is entity type, value is replacement.
+             priority (float): Priority score for overlapping entities.
+                 Higher scored entities are scrubbed where an overlap occurs.
+                 The scores are relative e.g. 0.2 beats 0.1.

          Returns:
-             list[str]: The input list of text with entities replaced.
+             list[IDEnt]: A list of IDEnt objects.
          """

-         self.logger.info(
-             f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
-         )
-
-         texts = self.get_texts()
-
-         cleaned_texts = []
-
          class LoadedSpacyNlpEngine(SpacyNlpEngine):
              def __init__(self, loaded_spacy_model):
                  super().__init__()
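The Hugging Face finder follows the same shape as the others; a sketch using the default (unevaluated) CoNLL-03 model, continuing the earlier sketch:

    hf_idents = scrub.huggingface_entities(texts=scrub.texts, text_ids=scrub.text_ids)
    # Each hit is an IDEnt with source="huggingface" and label="person".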
@@ -785,199 +824,322 @@ class IDScrub:
          loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

          analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
-         anonymizer = AnonymizerEngine()

-         cleaned_texts = []
-         all_labels = []
+         stripped_texts = ["" if s.strip() == "" else s for s in texts]

-         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
+         idents = []

-         for i, (ids, stripped_text) in tqdm(enumerate(zip(self.text_ids, stripped_texts)), total=len(texts)):
-             if stripped_text == "":
-                 cleaned_texts.append(texts[i])
-                 continue
-
-             results = analyzer.analyze(text=stripped_text, language="en")
-             results = [r for r in results if r.entity_type in entities]
-
-             if label_prefix:
-                 labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
-             else:
-                 labels = [f"{res.entity_type.lower()}" for res in results]
-
-             for label in labels:
-                 all_labels.append(label)
-
-             self.scrubbed_data.extend(
-                 {self.text_id_name: ids, label: stripped_text[res.start : res.end]}
-                 for res, label in zip(results, labels)
-             )
-
-             if self.replacement_text:
-                 operators = {
-                     res.entity_type: OperatorConfig("replace", {"new_value": self.replacement_text}) for res in results
-                 }
-             elif replacement_map:
-                 operators = {
-                     res.entity_type: OperatorConfig("replace", {"new_value": replacement_map.get(res.entity_type)})
-                     for res in results
-                 }
-             else:
-                 operators = {
-                     res.entity_type: OperatorConfig("replace", {"new_value": f"[{res.entity_type}]"}) for res in results
-                 }
-
-             anonymized = anonymizer.anonymize(text=stripped_text, analyzer_results=results, operators=operators)
-
-             cleaned_texts.append(anonymized.text)
-
-         self.cleaned_texts = cleaned_texts
-
-         for label in set(all_labels):
-             self.log_message(label)
-
-         return cleaned_texts
+         for text, text_id in zip(stripped_texts, text_ids):
+             results = analyzer.analyze(text=text, language="en", entities=entity_types)
+             for res in results:
+                 if res.entity_type not in entity_types:
+                     continue
+
+                 if self.replacement:
+                     replacement = self.replacement
+                 elif replacement_map:
+                     replacement = replacement_map.get(res.entity_type, "[REDACTED]")
+                 else:
+                     replacement = f"[{res.entity_type}]"
+
+                 idents.append(
+                     self.IDEnt(
+                         text_id=text_id,
+                         text=text[res.start : res.end],
+                         start=res.start,
+                         end=res.end,
+                         priority=priority,
+                         replacement=replacement,
+                         label=res.entity_type.lower(),
+                         source="presidio",
+                     )
+                 )
+
+         return idents

-     def all_regex(self) -> list[str]:
+     def group_idents(self, idents: list[IDEnt]) -> dict[int | str | float, list[IDEnt]]:
          """
-         Use all regex methods to remove personal information from text.
+         Group a list of IDEnt objects by `text_id`.
+
+         Each unique `text_id` becomes a dictionary key,
+         and its value is a list of all IDEnt objects associated with that ID.

          Args:
-             None
+             idents (list[IDEnt]): A list of IDEnt objects.

          Returns:
-             list[str]: The input list of text with various personal information replaced.
-
+             dict[int | str | float, list[IDEnt]]: A dictionary mapping each text_id to a list of IDEnt objects.
          """

-         self.email_addresses()
-         self.handles()
-         self.ip_addresses()
-         self.uk_phone_numbers()
-         self.uk_addresses()
-         self.uk_postcodes()
-         self.titles()
+         idents_grouped = defaultdict(list)

-         return self.cleaned_texts
+         for ident in idents:
+             idents_grouped[ident.text_id].append(ident)

-     def all(
-         self,
-         custom_regex_patterns: list = None,
-         custom_replacement_texts: list[str] = None,
-         model_name: str = "en_core_web_trf",
-         spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
-         presidio_entities: list[str] = [
-             "PERSON",
-             "EMAIL_ADDRESS",
-             "UK_NINO",
-             "UK_NHS",
-             "CREDIT_CARD",
-             "CRYPTO",
-             "MEDICAL_LICENSE",
-             "URL",
-             "SWIFT_CODE",
-             "IBAN_CODE",
-             "LOCATION",
-             "NRP",
-         ],
-         n_process: int = 1,
-         batch_size: int = 1000,
-     ) -> list[str]:
+         return idents_grouped
+
+     def resolve_overlaps(self, idents: list[IDEnt]) -> list[IDEnt]:
          """
-         Use all regex and NER (Spacy) methods to remove personal information from text.
+         Select the highest-scoring non-overlapping idents.
+
+         Resolves conflicts between idents that overlap in their
+         character ranges. Entities are first sorted by descending priority and then by
+         start position to ensure a stable priority order.
+
+         Each IDEnt is accepted only if it does not overlap with any IDEnt
+         already selected. The resulting set of idents is returned in ascending
+         document order.
+
+         An IDEnt is considered overlapping if:
+             IDEnt.start <= other.end and IDEnt.end >= other.start

          Args:
-             custom_regex_patterns list[str]: Regex(s) pattern to apply.
-             custom_replacement_texts list[str]: The replacement texts for the removed text. Defaults to '[REDACTED]' for all.
-             model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
-             n_process (str): Number of parallel processes.
-             batch_size (int): The number of texts in each batch.
+             idents (list[IDEnt]): A list of IDEnt objects.

          Returns:
-             list[str]: The input list of text with various personal information replaced.
+             list[IDEnt]: A list of non-overlapping idents, sorted by their start position.
          """

-         if custom_regex_patterns:
-             self.custom_regex(
-                 custom_regex_patterns=custom_regex_patterns,
-                 custom_replacement_texts=custom_replacement_texts,
-             )
+         idents_grouped = self.group_idents(idents)

-         self.presidio_entities(model_name=model_name, entities=presidio_entities)
-         self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
-         self.google_phone_numbers()
-         self.all_regex()
+         resolved = []

-         return self.cleaned_texts
+         for text_id, idents in idents_grouped.items():
+             if not idents:
+                 continue
+
+             idents_by_score = sorted(idents, key=lambda ident: (-ident.priority, ident.start))
+
+             kept_idents = []
+
+             for current_ident in idents_by_score:
+                 has_overlap = any(
+                     current_ident.start <= existing_ident.end and current_ident.end >= existing_ident.start
+                     for existing_ident in kept_idents
+                 )
+
+                 if not has_overlap:
+                     kept_idents.append(current_ident)
+
+             resolved.extend(sorted(kept_idents, key=lambda ident: ident.start))
+
+         return resolved
+
+     def scrub_text(self, texts: list[str] = None, text_ids: list = None, idents: list[IDEnt] = None):
+         """
+         Apply a set of non-overlapping replacement idents to each text.
+
+         Each IDEnt specifies a character range to replace (`IDEnt.start` to `IDEnt.end`)
+         and a `replacement` string that will be inserted in place of that range.
+
+         Args:
+             texts (list[str]): The original input texts.
+             text_ids (list): A list of identifiers that correspond to each string in `texts`.
+             idents (list[IDEnt]): A list of IDEnt objects. Must be non-overlapping.
+                 See `resolve_overlaps`.
+
+         Returns:
+             list[str]: The scrubbed texts with all replacements applied.
+         """
+
+         if texts is None:
+             texts = getattr(self, "texts", None)
+         if text_ids is None:
+             text_ids = getattr(self, "text_ids", None)
+         if idents is None:
+             idents = getattr(self, "idents", None)
+
+         if texts is None or text_ids is None or idents is None:
+             raise ValueError("texts, text_ids, and idents must be provided or set on self.")
+
+         if len(texts) != len(text_ids):
+             raise ValueError("texts and text_ids must be the same length.")
+
+         scrubbed_texts = list(texts)
+         idents_grouped = self.group_idents(idents)
+
+         for i, text_id in enumerate(text_ids):
+             text = texts[i]
+
+             group = idents_grouped.get(text_id, [])
+             sorted_group = sorted(group, key=lambda ident: ident.start, reverse=True)
+
+             for ident in sorted_group:
+                 text = text[: ident.start] + ident.replacement + text[ident.end :]
+
+             scrubbed_texts[i] = text
+
+         return scrubbed_texts

-     def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
+     def scrub(
+         self,
+         pipeline: list[dict] = [
+             {"method": "presidio_entities"},
+             {"method": "spacy_entities"},
+             {"method": "email_addresses"},
+             {"method": "handles"},
+             {"method": "ip_addresses"},
+             {"method": "uk_addresses"},
+             {"method": "uk_phone_numbers"},
+             {"method": "google_phone_numbers"},
+             {"method": "uk_postcodes"},
+             {"method": "urls"},
+             {"method": "titles"},
+         ],
+     ):
          """
-         Scrubs text using given methods (in order).
+         Scrubs text using the given methods.
          Uses default values for the given scrub method.

-         Methods available (see associated method docstring for further information):
-
-         "all", "spacy_persons", "huggingface_persons", "email_addresses", "handles",
-         "ip_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes"
-         "titles", "presidio"
-
-         Example:
-
-             "email_addresses" = scrub.email_addresses()
-
-         Therefore we can call:
-
-             IDScrub.scrub(scrub_methods = ["email_addresses"])
-
          Args:
-             scrub_method (str): string name of scrub method.
+             pipeline (list[dict]): Scrub methods and their method parameters to apply.
+                 Methods are specified with the "method" key.
+                 Parameters are specified with the argument name as key and the argument value as value.
+
+                 Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}])
+
+                 See the associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.
+
+                 Methods available:
+
+                 "spacy_entities", "huggingface_entities", "email_addresses", "handles",
+                 "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers",
+                 "uk_postcodes", "titles", "presidio_entities"
+
+                 Each method takes a `priority` argument. Higher priority scored entities
+                 are scrubbed where an overlap occurs. The scores are relative.

          Returns:
-             list[str]: The input list of text with personal information replaced.
+             list[str]: The input texts scrubbed of personal data.

          """

-         for scrub_method in scrub_methods:
+         if not isinstance(pipeline, list):
+             raise TypeError("Argument `pipeline` must be a list of dicts.")
+
+         self.idents_all = []
+         self.idents = []
+
+         for step in pipeline:
+             scrub_method = step["method"]
+             args = {k: v for k, v in step.items() if k != "method"}
+
+             if args:
+                 self.logger.info(f"Scrubbing using {scrub_method} with parameters {args}...")
+             else:
+                 self.logger.info(f"Scrubbing using {scrub_method} with default parameters...")
+
              try:
                  method = getattr(self, scrub_method)
-                 method()
              except AttributeError:
                  self.logger.warning("Not a scrub method.")
+                 continue

-         return self.cleaned_texts
+             self.idents_all.extend(method(texts=self.texts, text_ids=self.text_ids, **args))
+
+         idents_exclude = [ident for ident in self.idents_all if ident.text not in self.exclude]
+         idents_resolved = self.resolve_overlaps(idents=idents_exclude)
+         self.idents.extend(idents_resolved)
+         self.scrubbed_texts = self.scrub_text(texts=self.texts, text_ids=self.text_ids, idents=self.idents)
+
+         return self.scrubbed_texts
+
+     def get_all_identified_data(self) -> pd.DataFrame:
+         """
+         Get all of the identified data before overlaps have been resolved.
+
+         Each row is an identified entity. Columns are the IDEnt attributes.
+
+         Args:
+             None
+
+         Returns:
+             pd.DataFrame: All identified data and their attributes.
+         """
+         all_idents = pd.DataFrame([asdict(ident) for ident in self.idents_all])
+         return all_idents
+
+     def get_scrubbed_data(self) -> pd.DataFrame:
+         """
+         Create a DataFrame summarising scrubbed text idents grouped by text ID and label.
+
+         Each row corresponds to a unique `text_id`, and each column represents an IDEnt label.
+         The cell values are lists of the IDEnt text values associated with that label for the given text ID.
+
+         Args:
+             None
+
+         Returns:
+             pd.DataFrame: All data scrubbed from text.
+         """
+         data = defaultdict(lambda: defaultdict(list))
+
+         for ident in self.idents:
+             data[ident.text_id][ident.label].append(ident.text)
+
+         df = pd.DataFrame.from_dict(data, orient="index")
+         df = df.reset_index().rename(columns={"index": self.text_id_name})
+         df = df.where(pd.notna(df), None)
+
+         return df

      @staticmethod
      def dataframe(
          df: pd.DataFrame = None,
          id_col: str = None,
          exclude_cols: list = None,
-         scrub_methods: list[str] = ["all"],
+         pipeline: list[dict] = [
+             {"method": "presidio_entities"},
+             {"method": "spacy_entities"},
+             {"method": "email_addresses"},
+             {"method": "handles"},
+             {"method": "ip_addresses"},
+             {"method": "uk_addresses"},
+             {"method": "uk_phone_numbers"},
+             {"method": "google_phone_numbers"},
+             {"method": "uk_postcodes"},
+             {"method": "urls"},
+             {"method": "titles"},
+         ],
      ) -> tuple[pd.DataFrame, pd.DataFrame]:
          """
          Scrubs all personal data from a Pandas Dataframe.

          Args:
              df (pd.DataFrame): A Pandas dataframe to scrub.
-             id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
+             id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `text_id` is applied.
              exclude_cols (list): Columns to exclude from scrubbing. If None all columns are scrubbed.
-             scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
-                 These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
+             pipeline (list[dict]): Scrub methods and their method parameters to apply.
+                 Methods are specified with the "method" key.
+
+                 Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}])
+
+                 See the associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.
+
+                 Methods available:
+
+                 "spacy_entities", "huggingface_entities", "email_addresses", "handles",
+                 "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers",
+                 "uk_postcodes", "titles", "presidio_entities"
+
+                 Each method takes a `priority` argument. Higher priority scored entities
+                 are scrubbed where an overlap occurs. The scores are relative.

          Returns:
              tuple[pd.DataFrame, pd.DataFrame]: The input dataframe with all personal data removed and a dataframe with the personal data that has been removed.

          """

-         assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
+         if not isinstance(df, pd.DataFrame):
+             raise TypeError("`df` must be a Pandas DataFrame.")

-         if id_col:
-             ids = df[id_col].to_list()
-         if not id_col:
-             id_col = "id"
+         if id_col is None:
              ids = range(1, len(df) + 1)
+             id_col = "text_id"
+         else:
+             if id_col not in df.columns:
+                 raise ValueError(f"`id_col` '{id_col}' is not a column in df.")
+
+             ids = df[id_col].tolist()

-         assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
-         assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
+         if not len(df) == len(ids):
+             raise ValueError("Length of dataframe is different to the length of IDs.")

          if exclude_cols is None:
              cols_to_scrub = df.columns.to_list()
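A worked sketch of the overlap rule in resolve_overlaps above, with illustrative idents over the same span (positional arguments follow the IDEnt field order):

    a = IDScrub.IDEnt("t1", "Dr John", 0, 7, "person", "[PERSON]", 1.0, "spacy")
    b = IDScrub.IDEnt("t1", "Dr", 0, 2, "title", "[TITLE]", 0.4, "regex")
    scrub.resolve_overlaps([a, b])  # -> [a]; b overlaps a and has the lower priority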
@@ -994,16 +1156,17 @@ class IDScrub:
              original_dtype = scrubbed_df[col].dtype
              scrubbed_df[col] = scrubbed_df[col].astype(str)

-             scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_id_name=id_col, text_ids=ids)
+             scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_ids=ids)
              scrub.logger.info(f"Scrubbing column `{col}`...")

-             scrubbed_texts = scrub.scrub(scrub_methods)
+             scrubbed_texts = scrub.scrub(pipeline=pipeline)
              scrubbed_df[col] = scrubbed_texts

              scrubbed_data = scrub.get_scrubbed_data()

              if scrubbed_data is not None:
                  scrubbed_data.insert(1, "column", col)
+                 scrubbed_data.rename(columns={"text_id": id_col}, inplace=True)
                  all_scrubbed_data.append(scrubbed_data)

          try:
@@ -1013,8 +1176,14 @@ class IDScrub:
              pass

          all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
+         all_scrubbed_data["column"] = pd.Categorical(
+             all_scrubbed_data["column"], categories=cols_to_scrub, ordered=True
+         )
+         all_scrubbed_data = all_scrubbed_data.sort_values(by=["column", id_col]).reset_index(drop=True)
+         all_scrubbed_data["column"] = all_scrubbed_data["column"].astype(str)
          all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)

-         assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
+         if not df.shape == scrubbed_df.shape:
+             raise ValueError("Original and scrubbed dataframe not the same shape. Check input DataFrame.")

          return scrubbed_df, all_scrubbed_data
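Putting the 2.x API together, a hedged end-to-end sketch continuing the earlier examples (`my_df` is a hypothetical DataFrame):

    scrubbed = scrub.scrub(
        pipeline=[
            {"method": "email_addresses"},
            {"method": "spacy_entities", "entity_types": ["PERSON"]},
        ]
    )
    report = scrub.get_scrubbed_data()  # one row per text_id, lists per label
    audit = scrub.get_all_identified_data()  # every candidate, before overlap resolution

    # DataFrame variant: returns (scrubbed_df, removed_data).
    clean_df, removed = IDScrub.dataframe(df=my_df, pipeline=[{"method": "urls"}])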