idscrub 1.0.1__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {idscrub-1.0.1 → idscrub-1.1.1}/.pre-commit-config.yaml +1 -1
  2. {idscrub-1.0.1 → idscrub-1.1.1}/PKG-INFO +20 -14
  3. {idscrub-1.0.1 → idscrub-1.1.1}/README.md +18 -12
  4. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/scrub.py +85 -39
  5. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/PKG-INFO +20 -14
  6. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/SOURCES.txt +1 -1
  7. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/requires.txt +1 -1
  8. {idscrub-1.0.1 → idscrub-1.1.1}/notebooks/basic_usage.ipynb +138 -74
  9. {idscrub-1.0.1 → idscrub-1.1.1}/pyproject.toml +1 -1
  10. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_chain.py +2 -2
  11. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_huggingface.py +3 -3
  12. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_label.py +1 -1
  13. idscrub-1.0.1/test/test_persidio.py → idscrub-1.1.1/test/test_presidio.py +7 -7
  14. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_regex.py +46 -0
  15. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_scrub.py +3 -3
  16. idscrub-1.1.1/test/test_spacy.py +53 -0
  17. {idscrub-1.0.1 → idscrub-1.1.1}/uv.lock +356 -320
  18. idscrub-1.0.1/test/test_spacy.py +0 -26
  19. {idscrub-1.0.1 → idscrub-1.1.1}/.github/pull_request_template.md +0 -0
  20. {idscrub-1.0.1 → idscrub-1.1.1}/.github/workflows/cd.yml +0 -0
  21. {idscrub-1.0.1 → idscrub-1.1.1}/.github/workflows/ci.yml +0 -0
  22. {idscrub-1.0.1 → idscrub-1.1.1}/.gitignore +0 -0
  23. {idscrub-1.0.1 → idscrub-1.1.1}/CODEOWNERS +0 -0
  24. {idscrub-1.0.1 → idscrub-1.1.1}/LICENSE +0 -0
  25. {idscrub-1.0.1 → idscrub-1.1.1}/Makefile +0 -0
  26. {idscrub-1.0.1 → idscrub-1.1.1}/SECURITY_CHECKLIST.md +0 -0
  27. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/__init__.py +0 -0
  28. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/locations.py +0 -0
  29. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/dependency_links.txt +0 -0
  30. {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/top_level.txt +0 -0
  31. {idscrub-1.0.1 → idscrub-1.1.1}/setup.cfg +0 -0
  32. {idscrub-1.0.1 → idscrub-1.1.1}/test/conftest.py +0 -0
  33. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_all.py +0 -0
  34. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_dataframe.py +0 -0
  35. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_id.py +0 -0
  36. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_log.py +0 -0
  37. {idscrub-1.0.1 → idscrub-1.1.1}/test/test_phonenumbers.py +0 -0
@@ -12,7 +12,7 @@ repos:
12
12
 
13
13
  # Mandatory internal hooks
14
14
  - repo: https://github.com/uktrade/github-standards
15
- rev: v1.2.1 # update periodically with pre-commit autoupdate
15
+ rev: v1.3.1 # update periodically with pre-commit autoupdate
16
16
  hooks:
17
17
  - id: run-security-scan
18
18
  verbose: false
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 1.0.1
3
+ Version: 1.1.1
4
4
  Author: Department for Business and Trade
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -8,7 +8,7 @@ License-File: LICENSE
8
8
  Requires-Dist: ipykernel>=7.1.0
9
9
  Requires-Dist: ipywidgets
10
10
  Requires-Dist: numpy>=2.3.4
11
- Requires-Dist: pandas>=2.3.3
11
+ Requires-Dist: pandas<3.0
12
12
  Requires-Dist: phonenumbers>=9.0.18
13
13
  Requires-Dist: pip>=25.3
14
14
  Requires-Dist: spacy-transformers>=1.3.9
@@ -19,12 +19,17 @@ Provides-Extra: trf
19
19
  Requires-Dist: en_core_web_trf; extra == "trf"
20
20
  Dynamic: license-file
21
21
 
22
+ ![Development](https://img.shields.io/badge/status-development-orange)
23
+
22
24
  # idscrub 🧽✨
23
25
 
24
26
  * Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
25
27
  * This information may need to be removed prior to further analysis in many cases.
26
28
  * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
27
29
 
30
+ > [!IMPORTANT]
31
+ > * This package is undergoing frequent internal development. Major updates will be made public periodically.
32
+
28
33
  ## Installation
29
34
 
30
35
  `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
@@ -45,7 +50,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
45
50
  from idscrub import IDScrub
46
51
 
47
52
  scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
48
- scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
53
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
49
54
 
50
55
  print(scrubbed_texts)
51
56
 
@@ -57,17 +62,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
57
62
 
58
63
  | Argument | Scrubs |
59
64
  |-------------------------|------------------------------------------------------------------------|
60
- | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
61
- | `spacy_persons` | Person names detected by spaCy's `en_core_web_trf` (or other user-selected spaCy models) |
62
- | `huggingface_persons` | Person names detected by user-selected HuggingFace models |
63
- | `email_addresses` | Email addresses |
64
- | `titles` | Titles (e.g., Mr., Mrs., Dr.) |
65
- | `handles` | Social media handles (e.g., @username) |
66
- | `ip_addresses` | IP addresses |
67
- | `uk_postcodes` | UK postal codes |
68
- | `uk_phone_numbers` | UK phone numbers |
69
- | `google_phone_numbers` | Phone numbers detected by Google’s [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
70
- | `presidio` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g., names, URLs, NHS numbers, IBAN codes) |
65
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
66
+ | `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
67
+ | `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
68
+ | `huggingface_entities` | Entities detected by user-selected HuggingFace models |
69
+ | `email_addresses` | Email addresses (e.g. john@email.com) |
70
+ | `titles` | Titles (e.g. Mr., Mrs., Dr.) |
71
+ | `handles` | Social media handles (e.g. @username) |
72
+ | `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
73
+ | `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
74
+ | `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
75
+ | `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
76
+ | `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
71
77
 
72
78
  ## Considerations before use
73
79
 
@@ -1,9 +1,14 @@
1
+ ![Development](https://img.shields.io/badge/status-development-orange)
2
+
1
3
  # idscrub 🧽✨
2
4
 
3
5
  * Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
4
6
  * This information may need to be removed prior to further analysis in many cases.
5
7
  * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
6
8
 
9
+ > [!IMPORTANT]
10
+ > * This package is undergoing frequent internal development. Major updates will be made public periodically.
11
+
7
12
  ## Installation
8
13
 
9
14
  `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
@@ -24,7 +29,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
24
29
  from idscrub import IDScrub
25
30
 
26
31
  scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
27
- scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
32
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
28
33
 
29
34
  print(scrubbed_texts)
30
35
 
@@ -36,17 +41,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
36
41
 
37
42
  | Argument | Scrubs |
38
43
  |-------------------------|------------------------------------------------------------------------|
39
- | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
40
- | `spacy_persons` | Person names detected by spaCy's `en_core_web_trf` (or other user-selected spaCy models) |
41
- | `huggingface_persons` | Person names detected by user-selected HuggingFace models |
42
- | `email_addresses` | Email addresses |
43
- | `titles` | Titles (e.g., Mr., Mrs., Dr.) |
44
- | `handles` | Social media handles (e.g., @username) |
45
- | `ip_addresses` | IP addresses |
46
- | `uk_postcodes` | UK postal codes |
47
- | `uk_phone_numbers` | UK phone numbers |
48
- | `google_phone_numbers` | Phone numbers detected by Google’s [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
49
- | `presidio` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g., names, URLs, NHS numbers, IBAN codes) |
44
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
45
+ | `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
46
+ | `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
47
+ | `huggingface_entities` | Entities detected by user-selected HuggingFace models |
48
+ | `email_addresses` | Email addresses (e.g. john@email.com) |
49
+ | `titles` | Titles (e.g. Mr., Mrs., Dr.) |
50
+ | `handles` | Social media handles (e.g. @username) |
51
+ | `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
52
+ | `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
53
+ | `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
54
+ | `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
55
+ | `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
50
56
 
51
57
  ## Considerations before use
52
58
 
@@ -453,6 +453,24 @@ class IDScrub:
453
453
 
454
454
  return self.scrub_regex(pattern, replacement_text, label=label)
455
455
 
456
+ def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
457
+ """
458
+ Removes addresses.
459
+ e.g. `10 Downing Street` scrubbed
460
+
461
+ Args:
462
+ replacement_text (str): The replacement text for the removed text.
463
+ label (str): Label for the personal data removed.
464
+
465
+ Returns:
466
+ list[str]: The input list of text with addresses replaced.
467
+ """
468
+
469
+ self.logger.info("Scrubbing addresses using regex...")
470
+ pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
471
+
472
+ return self.scrub_regex(pattern, replacement_text, label)
473
+
456
474
  def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
457
475
  """
458
476
  Removes claimant names from employment tribunal texts.
@@ -528,64 +546,86 @@ class IDScrub:
528
546
 
529
547
  return model
530
548
 
531
- def spacy_persons(
549
+ def spacy_entities(
532
550
  self,
533
551
  model_name: str = "en_core_web_trf",
552
+ entities: list[str] = ["PERSON", "ORG", "NORP"],
553
+ replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
554
+ label_prefix: str = None,
534
555
  n_process: int = 1,
535
556
  batch_size: int = 1000,
536
- replacement_text: str = "[PERSON]",
537
- label: str = "person",
538
557
  ) -> list[str]:
539
558
  """
540
- Remove PERSON entities using a Spacy model.
559
+ Remove SpaCy entities using a given SpaCy model.
560
+ Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
541
561
  Note: only "en_core_web_trf" has been evaluated.
542
562
 
543
563
  Args:
544
564
  model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
565
+ entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
566
+ replacement_map (dict): Mapping of entity type to replacement text (e.g. {'PERSON': '[PERSON]'}).
567
+ label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
545
568
  n_process (int): Number of parallel processes.
546
569
  batch_size (int): The number of texts in each batch.
547
- replacement_text (str): The replacement text for the removed text.
548
- label (str): Label for the personal data removed.
549
570
 
550
571
  Returns:
551
572
  list[str]: The input list of text with the selected entities scrubbed.
552
573
  """
553
- self.logger.info(f"Scrubbing names using SpaCy model `{model_name}`...")
554
574
 
555
- texts = self.get_texts()
575
+ self.logger.info(
576
+ f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
577
+ )
556
578
 
557
- if self.replacement_text:
558
- replacement_text = self.replacement_text
579
+ texts = self.get_texts()
559
580
 
560
581
  cleaned_texts = []
582
+ labels = []
561
583
 
562
584
  nlp = self.get_spacy_model(model_name)
563
585
  stripped_texts = [s.strip() if s.isspace() else s for s in texts]
564
586
  documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
565
587
 
566
588
  for i, (ids, doc, stripped_text) in tqdm(
567
- enumerate((zip(self.text_ids, documents, stripped_texts))), total=len(texts)
589
+ enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
568
590
  ):
569
- if stripped_text == "":
591
+ if not stripped_text:
570
592
  cleaned_texts.append(texts[i])
571
593
  continue
572
594
 
573
- # Collect person entities
574
- person_entities = [
575
- ent for ent in doc.ents if ent.label_ == "PERSON" and ent.text not in {"PERSON", "HANDLE"}
576
- ]
577
- self.scrubbed_data.extend({self.text_id_name: ids, label: ent.text} for ent in person_entities)
595
+ all_found_entities = []
596
+
597
+ for entity_type in entities:
598
+ found = [
599
+ ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
600
+ ]
601
+
602
+ for ent in found:
603
+ label = ent.label_.lower()
604
+ if label_prefix:
605
+ label = f"{label_prefix}_{label}"
606
+ labels.append(label)
607
+ self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
608
+
609
+ if self.replacement_text:
610
+ all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
611
+ elif replacement_map:
612
+ all_found_entities.extend(
613
+ (ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
614
+ )
615
+ else:
616
+ all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
578
617
 
579
- # Remove person entities
580
618
  cleaned = stripped_text
581
- for ent in sorted(person_entities, key=lambda x: [x.start_char], reverse=True):
582
- cleaned = cleaned[: ent.start_char] + replacement_text + cleaned[ent.end_char :]
619
+
620
+ for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
621
+ cleaned = cleaned[:start] + repl + cleaned[end:]
583
622
 
584
623
  cleaned_texts.append(cleaned)
585
624
 
586
625
  self.cleaned_texts = cleaned_texts
587
626
 
588
- self.log_message(label)
627
+ for label in set(labels):
628
+ self.log_message(label)
589
629
 
590
630
  return cleaned_texts
591
631
 
@@ -600,7 +640,7 @@ class IDScrub:
600
640
  Note: No Hugging Face models have been evaluated for performance.
601
641
 
602
642
  Args:
603
- hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
643
+ hf_model_path (str): Path to the Hugging Face model.
604
644
  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been evaluated.
605
645
  download_directory (str): Directory in which to save the model.
606
646
  Default is current working directory.
@@ -624,20 +664,21 @@ class IDScrub:
624
664
 
625
665
  return tokenizer
626
666
 
627
- def huggingface_persons(
667
+ def huggingface_entities(
628
668
  self,
629
669
  hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
630
670
  download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
671
+ entity="PER",
631
672
  replacement_text: str = "[PERSON]",
632
673
  label: str = "person",
633
674
  batch_size: int = 8,
634
675
  ) -> list[str]:
635
676
  """
636
- Remove PERSON entities using a Hugging Face model.
677
+ Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
637
678
  Note: No Hugging Face models have been evaluated for performance.
638
679
 
639
680
  Args:
640
- hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
681
+ hf_model_path (str): Path to the Hugging Face model.
641
682
  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
642
683
  download_directory (str): Directory in which to save the model.
643
684
  Default is current working directory.
@@ -679,7 +720,7 @@ class IDScrub:
679
720
  continue
680
721
 
681
722
  person_entities = [
682
- ent for ent in entities if ent["entity_group"] == "PER" and ent["word"] not in {"HANDLE", "PERSON"}
723
+ ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
683
724
  ]
684
725
  self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
685
726
 
@@ -695,10 +736,10 @@ class IDScrub:
695
736
 
696
737
  return cleaned_texts
697
738
 
698
- def presidio(
739
+ def presidio_entities(
699
740
  self,
700
741
  model_name: str = "en_core_web_trf",
701
- entities_to_scrub: list[str] = [
742
+ entities: list[str] = [
702
743
  "PERSON",
703
744
  "UK_NINO",
704
745
  "UK_NHS",
@@ -718,15 +759,18 @@ class IDScrub:
718
759
 
719
760
  Args:
720
761
  model_name (str): spaCy model to use
721
- entities_to_scrub (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
762
+ entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
722
763
  replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
723
764
  label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
765
+ Useful if you wish to identify this as having been scrubbed by Presidio.
724
766
 
725
767
  Returns:
726
768
  list[str]: The input list of text with entities replaced.
727
769
  """
728
770
 
729
- self.logger.info("Scrubbing using Presidio...")
771
+ self.logger.info(
772
+ f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
773
+ )
730
774
 
731
775
  texts = self.get_texts()
732
776
 
@@ -744,7 +788,7 @@ class IDScrub:
744
788
  anonymizer = AnonymizerEngine()
745
789
 
746
790
  cleaned_texts = []
747
- unique_labels = []
791
+ all_labels = []
748
792
 
749
793
  stripped_texts = [s.strip() if s.isspace() else s for s in texts]
750
794
 
@@ -754,14 +798,15 @@ class IDScrub:
754
798
  continue
755
799
 
756
800
  results = analyzer.analyze(text=stripped_text, language="en")
757
- results = [r for r in results if r.entity_type in entities_to_scrub]
801
+ results = [r for r in results if r.entity_type in entities]
758
802
 
759
803
  if label_prefix:
760
804
  labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
761
805
  else:
762
806
  labels = [f"{res.entity_type.lower()}" for res in results]
763
807
 
764
- unique_labels.append(list(set(labels)))
808
+ for label in labels:
809
+ all_labels.append(label)
765
810
 
766
811
  self.scrubbed_data.extend(
767
812
  {self.text_id_name: ids, label: stripped_text[res.start : res.end]}
@@ -788,9 +833,8 @@ class IDScrub:
788
833
 
789
834
  self.cleaned_texts = cleaned_texts
790
835
 
791
- for label in unique_labels:
792
- if label:
793
- self.log_message(label[0])
836
+ for label in set(all_labels):
837
+ self.log_message(label)
794
838
 
795
839
  return cleaned_texts
796
840
 
@@ -810,6 +854,7 @@ class IDScrub:
810
854
  self.handles()
811
855
  self.ip_addresses()
812
856
  self.uk_phone_numbers()
857
+ self.uk_addresses()
813
858
  self.uk_postcodes()
814
859
  self.titles()
815
860
 
@@ -820,7 +865,8 @@ class IDScrub:
820
865
  custom_regex_patterns: list = None,
821
866
  custom_replacement_texts: list[str] = None,
822
867
  model_name: str = "en_core_web_trf",
823
- presidio_entities_to_scrub: list[str] = [
868
+ spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
869
+ presidio_entities: list[str] = [
824
870
  "PERSON",
825
871
  "EMAIL_ADDRESS",
826
872
  "UK_NINO",
@@ -857,8 +903,8 @@ class IDScrub:
857
903
  custom_replacement_texts=custom_replacement_texts,
858
904
  )
859
905
 
860
- self.presidio(model_name=model_name, entities_to_scrub=presidio_entities_to_scrub)
861
- self.spacy_persons(model_name=model_name, n_process=n_process, batch_size=batch_size)
906
+ self.presidio_entities(model_name=model_name, entities=presidio_entities)
907
+ self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
862
908
  self.google_phone_numbers()
863
909
  self.all_regex()
864
910
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 1.0.1
3
+ Version: 1.1.1
4
4
  Author: Department for Business and Trade
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -8,7 +8,7 @@ License-File: LICENSE
8
8
  Requires-Dist: ipykernel>=7.1.0
9
9
  Requires-Dist: ipywidgets
10
10
  Requires-Dist: numpy>=2.3.4
11
- Requires-Dist: pandas>=2.3.3
11
+ Requires-Dist: pandas<3.0
12
12
  Requires-Dist: phonenumbers>=9.0.18
13
13
  Requires-Dist: pip>=25.3
14
14
  Requires-Dist: spacy-transformers>=1.3.9
@@ -19,12 +19,17 @@ Provides-Extra: trf
19
19
  Requires-Dist: en_core_web_trf; extra == "trf"
20
20
  Dynamic: license-file
21
21
 
22
+ ![Development](https://img.shields.io/badge/status-development-orange)
23
+
22
24
  # idscrub 🧽✨
23
25
 
24
26
  * Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
25
27
  * This information may need to be removed prior to further analysis in many cases.
26
28
  * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
27
29
 
30
+ > [!IMPORTANT]
31
+ > * This package is undergoing frequent internal development. Major updates will be made public periodically.
32
+
28
33
  ## Installation
29
34
 
30
35
  `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
@@ -45,7 +50,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
45
50
  from idscrub import IDScrub
46
51
 
47
52
  scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
48
- scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
53
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
49
54
 
50
55
  print(scrubbed_texts)
51
56
 
@@ -57,17 +62,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
57
62
 
58
63
  | Argument | Scrubs |
59
64
  |-------------------------|------------------------------------------------------------------------|
60
- | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
61
- | `spacy_persons` | Person names detected by spaCy's `en_core_web_trf` (or other user-selected spaCy models) |
62
- | `huggingface_persons` | Person names detected by user-selected HuggingFace models |
63
- | `email_addresses` | Email addresses |
64
- | `titles` | Titles (e.g., Mr., Mrs., Dr.) |
65
- | `handles` | Social media handles (e.g., @username) |
66
- | `ip_addresses` | IP addresses |
67
- | `uk_postcodes` | UK postal codes |
68
- | `uk_phone_numbers` | UK phone numbers |
69
- | `google_phone_numbers` | Phone numbers detected by Google’s [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
70
- | `presidio` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g., names, URLs, NHS numbers, IBAN codes) |
65
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
66
+ | `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
67
+ | `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
68
+ | `huggingface_entities` | Entities detected by user-selected HuggingFace models |
69
+ | `email_addresses` | Email addresses (e.g. john@email.com) |
70
+ | `titles` | Titles (e.g. Mr., Mrs., Dr.) |
71
+ | `handles` | Social media handles (e.g. @username) |
72
+ | `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
73
+ | `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
74
+ | `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
75
+ | `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
76
+ | `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
71
77
 
72
78
  ## Considerations before use
73
79
 
@@ -27,8 +27,8 @@ test/test_huggingface.py
27
27
  test/test_id.py
28
28
  test/test_label.py
29
29
  test/test_log.py
30
- test/test_persidio.py
31
30
  test/test_phonenumbers.py
31
+ test/test_presidio.py
32
32
  test/test_regex.py
33
33
  test/test_scrub.py
34
34
  test/test_spacy.py
@@ -1,7 +1,7 @@
1
1
  ipykernel>=7.1.0
2
2
  ipywidgets
3
3
  numpy>=2.3.4
4
- pandas>=2.3.3
4
+ pandas<3.0
5
5
  phonenumbers>=9.0.18
6
6
  pip>=25.3
7
7
  spacy-transformers>=1.3.9