idscrub 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. idscrub-0.2.0/.github/pull_request_template.md +27 -0
  2. {idscrub-0.1.1 → idscrub-0.2.0}/.pre-commit-config.yaml +1 -1
  3. idscrub-0.2.0/CODEOWNERS +1 -0
  4. {idscrub-0.1.1 → idscrub-0.2.0}/PKG-INFO +9 -9
  5. {idscrub-0.1.1 → idscrub-0.2.0}/README.md +8 -8
  6. idscrub-0.2.0/SECURITY.md +47 -0
  7. idscrub-0.2.0/SECURITY_CHECKLIST.md +117 -0
  8. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub/scrub.py +50 -5
  9. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub.egg-info/PKG-INFO +9 -9
  10. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub.egg-info/SOURCES.txt +5 -0
  11. {idscrub-0.1.1 → idscrub-0.2.0}/notebooks/basic_usage.ipynb +173 -91
  12. idscrub-0.2.0/test/test_scrub.py +48 -0
  13. {idscrub-0.1.1 → idscrub-0.2.0}/.github/workflows/cd.yml +0 -0
  14. {idscrub-0.1.1 → idscrub-0.2.0}/.github/workflows/ci.yml +0 -0
  15. {idscrub-0.1.1 → idscrub-0.2.0}/.gitignore +0 -0
  16. {idscrub-0.1.1 → idscrub-0.2.0}/LICENSE +0 -0
  17. {idscrub-0.1.1 → idscrub-0.2.0}/Makefile +0 -0
  18. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub/__init__.py +0 -0
  19. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub/locations.py +0 -0
  20. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub.egg-info/dependency_links.txt +0 -0
  21. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub.egg-info/requires.txt +0 -0
  22. {idscrub-0.1.1 → idscrub-0.2.0}/idscrub.egg-info/top_level.txt +0 -0
  23. {idscrub-0.1.1 → idscrub-0.2.0}/pyproject.toml +0 -0
  24. {idscrub-0.1.1 → idscrub-0.2.0}/setup.cfg +0 -0
  25. {idscrub-0.1.1 → idscrub-0.2.0}/test/conftest.py +0 -0
  26. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_all.py +0 -0
  27. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_chain.py +0 -0
  28. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_dataframe.py +0 -0
  29. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_huggingface.py +0 -0
  30. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_id.py +0 -0
  31. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_log.py +0 -0
  32. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_persidio.py +0 -0
  33. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_phonenumbers.py +0 -0
  34. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_regex.py +0 -0
  35. {idscrub-0.1.1 → idscrub-0.2.0}/test/test_spacy.py +0 -0
  36. {idscrub-0.1.1 → idscrub-0.2.0}/uv.lock +0 -0
@@ -0,0 +1,27 @@
1
+ ## What
2
+
3
+ <!---
4
+ What is this PR doing, e.g. implementations, algorithms, etc.?
5
+ * Set the scene - you probably have a lot of context in your head that the reader doesn't have.
6
+ * Explain like I'm 5 - try to make as few assumptions as possible about the reader
7
+ * Use pictures, screenshots, or a diagram if you can, for example https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-diagrams#creating-mermaid-diagrams
8
+ --->
9
+
10
+ ## Why
11
+
12
+ <!---
13
+ Why is this change happening, e.g. goals, use cases, stories, etc.?
14
+ * Explain what the problem was that this PR addresses.
15
+ * Explain why this solution was chosen, and any alternatives considered.
16
+ * Mention any assumptions, deliberately ignored edge-cases, or changes that are left for later.
17
+ --->
18
+
19
+ ## How this has been tested
20
+
21
+ - [ ] I have tested locally
22
+ - [ ] I have added a new unit test (if appropriate)
23
+ - [ ] Testing not required
24
+
25
+ ## Reviewer Checklist
26
+
27
+ - [ ] I have reviewed the PR and ensured no secret values are present
@@ -12,7 +12,7 @@ repos:
12
12
 
13
13
  # Mandatory internal hooks
14
14
  - repo: https://github.com/uktrade/github-standards
15
- rev: v1.1.2 # update periodically with pre-commit autoupdate
15
+ rev: v1.2.1 # update periodically with pre-commit autoupdate
16
16
  hooks:
17
17
  - id: run-security-scan
18
18
  verbose: false
@@ -0,0 +1 @@
1
+ * @uktrade/ag-data-science
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Author: Department for Business and Trade
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -20,7 +20,7 @@ Dynamic: license-file
20
20
 
21
21
  # idscrub 🧽✨
22
22
 
23
- ## Project Info
23
+ ## Project Information
24
24
 
25
25
  * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
26
26
 
@@ -84,15 +84,15 @@ Dynamic: license-file
84
84
 
85
85
  ## Installation
86
86
 
87
- `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
87
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
88
88
 
89
89
  ```console
90
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
90
+ pip install idscrub
91
91
  ```
92
- or without spaCy installed (it will be installed automatically if name cleaning methods are called):
92
+ or with the spaCy transformer model (`en_core_web_trf`) already installed:
93
93
 
94
94
  ```console
95
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
95
+ pip install idscrub[trf]
96
96
  ```
97
97
 
98
98
  ## How to use the code
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
102
102
  ```python
103
103
  from idscrub import IDScrub
104
104
 
105
- scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
106
- scrubbed_texts = scrub.all()
105
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
106
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
107
107
 
108
108
  print(scrubbed_texts)
109
109
 
110
- # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
110
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
111
111
  ```
112
112
 
113
113
  ## AI Declaration
@@ -1,6 +1,6 @@
1
1
  # idscrub 🧽✨
2
2
 
3
- ## Project Info
3
+ ## Project Information
4
4
 
5
5
  * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
6
6
 
@@ -64,15 +64,15 @@
64
64
 
65
65
  ## Installation
66
66
 
67
- `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
67
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
68
68
 
69
69
  ```console
70
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
70
+ pip install idscrub
71
71
  ```
72
- or without spaCy installed (it will be installed automatically if name cleaning methods are called):
72
+ or with the spaCy transformer model (`en_core_web_trf`) already installed:
73
73
 
74
74
  ```console
75
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
75
+ pip install idscrub[trf]
76
76
  ```
77
77
 
78
78
  ## How to use the code
@@ -82,12 +82,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
82
82
  ```python
83
83
  from idscrub import IDScrub
84
84
 
85
- scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
86
- scrubbed_texts = scrub.all()
85
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
86
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
87
87
 
88
88
  print(scrubbed_texts)
89
89
 
90
- # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
90
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
91
91
  ```
92
92
 
93
93
  ## AI Declaration
@@ -0,0 +1,47 @@
1
+ # Security Policy
2
+
3
+ The Department for Business and Trade (DBT) supports businesses to invest, grow and export, creating jobs and opportunities across the country. Read more about what we do on [GOV.UK](https://www.gov.uk/government/organisations/department-for-business-and-trade/about).
4
+
5
+ ## Reporting a Vulnerability
6
+
7
+ If you believe you have found a security vulnerability, please submit your report to us using [this form](https://hackerone.com/2680e4cd-0436-42a5-bd2a-37fd86367276/embedded_submissions/new).
8
+ In your report please include details of:
9
+
10
+ - The website, IP or page where the vulnerability can be observed.
11
+ - A brief description of the type of vulnerability, for example, "XSS vulnerability".
12
+ - Steps to reproduce. These should be a benign, non-destructive, proof of concept. This helps to ensure that the report can be triaged quickly and accurately. It also reduces the likelihood of duplicate reports, or malicious exploitation of some vulnerabilities, such as sub-domain takeovers.
13
+
14
+ Vulnerability reporting guidelines
15
+
16
+ - Please do not share the vulnerability information beyond the owner and us, without express consent from the owner
17
+ - Vulnerabilities reported to the HackerOne platform can be submitted without the need to create a HackerOne account. However, if you wish to be updated you should create an account
18
+ - To submit your report, you will need to agree to the HackerOne Terms and Conditions and acknowledge that you have read their Privacy Policy and Disclosure Guidelines
19
+ - Once you have submitted the report, it will be assessed by NCC Group within five working days, and forwarded to the affected owners as soon as possible.
20
+
21
+ The DBT Cyber Team will attempt to make contact with the affected owner. However, the affected owner holds responsibility for resolving the issue.
22
+
23
+ ## Repository security
24
+
25
+ A new organisation github policy has been created that will enforce a set of security checks we expect a repository in the uktrade account to have. This policy is enabled by default for any new repositories, however existing repositories need to have it applied to them before they can be made public. The expectation is that once enough repositories have been switched from the legacy security policy to the new security policy, this new security policy is enforced across all repositories within the uktrade account
26
+
27
+ ### Custom properties
28
+
29
+ The uktrade account makes use of custom github properties to enforce branch protection rules and run organisation level github actions. New properties can be added by logging into github using a uktrade account and using the [custom props page](https://github.com/organizations/uktrade/settings/custom-properties)
30
+
31
+ ### Code scanning
32
+
33
+ All uktrade repositories with the new security policy applied have CodeQL scanning enabled. Individual repositories can apply their own advanced scanning rules if required
34
+
35
+ ### Push protection
36
+
37
+ To block known secrets being committed into github, all repositories with the new security policy applied will have push protection enabled and enforced.
38
+
39
+ ### Branch protection
40
+
41
+ An organisation ruleset has been created to apply a minimum set of branch protection rules to each public repository in the uktrade account. These rules are to be seen as the minimum, and repository admins might decide to add additional rules to their own repositories. Organisation admins and repository admins have been added to the bypass list for this branch protection ruleset. The protection rules that will be applied to each repository are:
42
+
43
+ - A PR is required for merges into the default branch (usually main)
44
+ - At least 1 approver is required before a PR can be merged
45
+ - Any conversations on the PR must be marked as resolved
46
+
47
+ As these rules are applied as an organisation ruleset, it is not possible for repository admins to add their own rules that reduce this level of protection. As an example, a repository admin could add a ruleset that drops the required number of approvers to 0 but that would have no effect as the organisation ruleset would take precedence. They could add a ruleset that sets the number of approvers to 3, and as this is not reducing the organisation ruleset protection this would take precedence
@@ -0,0 +1,117 @@
1
+ # Security Checklist
2
+
3
+ This checklist is designed to make it easier to improve the security posture of a GitHub repository.
4
+
5
+ - It is mandatory for public repositories.
6
+ - This checklist must be copied over to the root of the repository.
7
+ - The repository steward is responsible for populating the checklist, or at least approving the related pull request.
8
+ - Any feedback should be shared with the GitHub Security working group.
9
+
10
+ ## Checklist
11
+
12
+ - [x] [Setup the pre-commit hook framework](#setup-the-pre-commit-hook-framework)
13
+ - [x] [Setup custom properties on the repository](#setup-custom-properties-on-the-repository)
14
+ - [x] [Apply the correct github security policy](#apply-the-correct-github-security-policy)
15
+ - [x] [Ensure CODEOWNERS file exists](#ensure-codeowners-file-exists)
16
+ - [x] [Copy the SECURITY_CHECKLIST.md file](#copy-the-security_checklistmd-file)
17
+ - [x] [Review the GitHub CI/CD overview](#review-the-github-cicd-overview)
18
+ - [x] [Review the GitHub Safety Tips](#review-github-safety-tips)
19
+ - [x] [Add Steward to Repository access](#add-at-least-one-steward-to-repository-access)
20
+ - [x] [Review and limit maintainers with admin rights to the strict minimum](#review-and-limit-maintainers-with-admin-rights-to-the-strict-minimum)
21
+ - [x] [Review the Pull Request template](#review-pull-request-template)
22
+ - [x] [Review the SECURITY.md policy](#review-securitymd-policy)
23
+
24
+ ## Setup the pre-commit hook framework
25
+
26
+ Several uktrade repositories already make use of the pre-commit framework for flagging code quality issues before pushing. Even in the repositories that have the pre-commit framework installed, it is still possible for an individual engineer to either avoid configuring the commit hooks or skip them entirely using the `--no-verify` cli argument.
27
+
28
+ As part of the go live process, each engineer making changes to the repository being reopened will be required to install the organisation wide pre-commit hooks locally. When a PR is opened, an organisation level github action will run to confirm the pre-commit hooks ran on the engineer's machine and will block any PRs that have not run these hooks.
29
+
30
+ Instructions have been added to the [dbt hooks repository](https://github.com/uktrade/github-standards/blob/main/README.md#usage) to provide guidance on adding these organisation wide pre-commit hooks to an individual repository
31
+
32
+ ## Setup custom properties on the repository
33
+
34
+ A set of github tags has been created at an organisation level; these must be applied to a repository to allow organisation level github actions to run on each pull request.
35
+
36
+ ### Mandatory custom properties
37
+
38
+ - `reusable_workflow_opt_in`: This one has to be applied and set to `true` to allow this repository to apply the correct organisation branch protection ruleset and run the necessary github workflows on each PR
39
+ - `ddat_portfolio`: The portfolio inside DDAT this repository belongs to. If your portfolio is missing, this can be added by raising an SRE ticket.
40
+
41
+ ### Optional custom properties
42
+
43
+ - `is_docker`: If this repository builds a docker image, this tag should be added to run docker related github workflows
44
+ - `language`: All languages used by this repository should be selected, and github workflows will run with dedicated checks on that language.
45
+
46
+ ## Apply the correct github security policy
47
+
48
+ To add the new security policy, follow these instructions:
49
+
50
+ 1. As an organisation administrator, navigate to the [security config page](https://github.com/organizations/uktrade/settings/security_products).
51
+ 1. Scroll down to the **Apply configurations** sections, and enter the name of the repository to be made public in the filter input field
52
+ 1. Use the checkbox next to the results list to select all repositories being made public, then use the **Apply configuration** button to select the **Default DBT security** configuration
53
+ 1. A confirmation modal will appear displaying a summary of the action being made. Click the apply button
54
+ 1. In the repository that has had the new policy applied, navigate to the **Advanced Security** page in the repository settings. At the top of the page there should be a banner message **Modifications to some settings have been blocked by organization administrators.**
55
+
56
+ ### Optional: Setup CodeQL to allow PRs from repository forks
57
+
58
+ For most repositories, the default CodeQL configuration applied by the **Default DBT security** policy will be sufficient. However, this default configuration does not currently support scanning PRs raised from a fork of a repository. If your repository needs to accept PRs from a fork, you must follow these steps to switch to the advanced CodeQL setup:
59
+
60
+ 1. Open the GitHub settings page, and navigate to the Advanced Security section using the left hand menu
61
+ 1. Scroll down to the Code Scanning section, under the Tools sub-section there will be an item for CodeQL analysis
62
+ 1. Click the ... button next to Default setup text, then choose the Switch to advanced option from the menu
63
+ 1. On the popup, click the Disable CodeQL button. Although you are disabling CodeQL, there is still a branch protection rule in place that blocks a PR unless a CodeQL scan is detected. Disabling here will not allow PRs to be merged
64
+ 1. The GitHub online editor will open to create a new file called codeql.yml in your repo, and the contents of this file will be prefilled with the languages CodeQL has detected in your repo. You can modify the contents of this file if needed, however you must leave the workflow name as `CodeQL Advanced`
65
+ 1. Once happy with the workflow file contents, click the green Commit changes button to trigger a PR to merge this into the main branch
66
+ 1. Approve and merge the PR with this workflow file. Once merged, the CodeQL scan will perform an initial scan that can take a while but you can track the progress by viewing the Actions tab for your repository
67
+
68
+ ## Ensure CODEOWNERS file exists
69
+
70
+ The organisation rulesets require a CODEOWNERS file to be present in the repository. If you don't already have one of these, github has produced [documentation explaining](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) what they are and why they are used.
71
+
72
+ ## Copy the SECURITY_CHECKLIST.md file
73
+
74
+ To allow tracking of repositories that have successfully completed the reopening process, this file must be copied to the root of your repository and each of the items in the Checklist marked as completed
75
+
76
+ ## Review the GitHub CI/CD overview
77
+
78
+ Internal contributors to the repository should review the CI/CD overview below
79
+ ![CI/CD overview](assets/CI-CD%20pipeline.svg)
80
+
81
+ ## Review GitHub Safety Tips
82
+
83
+ Internal contributors to the repository should review the [GitHub Safety Tips](https://uktrade.atlassian.net/wiki/x/n4AEKQE)
84
+
85
+ ## Add at least one steward to repository access
86
+
87
+ To ensure correct governance of a repository, at least one steward must be added. This will usually be the most senior engineer on the team. To add a steward to a repository:
88
+
89
+ 1. Open the `Collaborators and teams` settings page. The url for this is `https://github.com/uktrade/REPO_NAME/github-standards/settings/access`
90
+ 1. Use the `Add people` button to open the people finder autocomplete box.
91
+ 1. Find and click the user who is going to be a steward
92
+ 1. On the Choose a role page, select the `Steward` role.
93
+ 1. Repeat for any additional users who are going to be a steward
94
+
95
+ ## Review and limit maintainers with admin rights to the strict minimum
96
+
97
+ You should review who has been assigned the github `admin` role. The `write` role is sufficient to allow team members to commit changes and raise pull requests
98
+
99
+ ## Review Pull Request template
100
+
101
+ If your repository does not already contain a pull_request_template.md file, by default you will inherit the template from this repository. If you are already using your own template, you should add this section to remind reviewers they should be ensuring no secret values are visible
102
+
103
+ ```
104
+ ## Reviewer Checklist
105
+
106
+ - [ ] I have reviewed the PR and ensured no secret values are present
107
+ ```
108
+
109
+ ## Review SECURITY.md policy
110
+
111
+ This repository contains the SECURITY.md file, which is inherited by all repositories in the uktrade organisation account. This file should be read and understood by the repository steward, and discussed with the team to ensure all engineers understand the tooling that has been put in place
112
+
113
+ ## More information
114
+
115
+ For more information about GitHub security standards, please refer [to this link](https://dbis.sharepoint.com/:w:/r/sites/DDaTDirectorate/Shared%20Documents/Work%20-%20GitHub%20Security/Github%20Security%20Framework/Guidelines%20and%20Policies/GitHub%20Security%20Standards%20v0.5.docx?d=wb29cd9b99ca042deb5c0cd8d670966d9&csf=1&web=1&e=6ITbnL)
116
+
117
+ For more details about the security features please refer to the [GitHub Standards](https://github.com/uktrade/github-standards) repo.
@@ -19,7 +19,7 @@ from tqdm import tqdm
19
19
  from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
20
20
  from transformers.utils import logging as trf_logging
21
21
 
22
- from idscrub.locations import DOWNLOAD_DIR, PROJECT_DIR
22
+ from idscrub.locations import DOWNLOAD_DIR
23
23
 
24
24
  # Suppress Torch FutureWarning
25
25
  # TODO: Find better way
@@ -879,10 +879,46 @@ class IDScrub:
879
879
 
880
880
  return scrub_methods.get(scrub_method, lambda: "Unknown method.")()
881
881
 
882
+ def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
883
+ """
884
+ Scrubs text using given methods (in order).
885
+ Uses default values for the given scrub method.
886
+
887
+ Methods available (see associated method docstring for further information):
888
+
889
+ "all", "spacy_persons", "huggingface_persons", "email_addresses", "handles",
890
+ "ip_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes",
891
+ "titles", "presidio"
892
+
893
+ Example:
894
+
895
+ "email_addresses" = scrub.email_addresses()
896
+
897
+ Therefore we can call:
898
+
899
+ IDScrub.scrub(scrub_methods = ["email_addresses"])
900
+
901
+ Args:
902
+ scrub_methods (list[str]): String names of the scrub methods to apply, in order.
903
+
904
+ Returns:
905
+ list[str]: The input list of text with personal information replaced.
906
+
907
+ """
908
+
909
+ for i, scrub_method in enumerate(scrub_methods):
910
+ if i == len(scrub_methods) - 1:
911
+ self.call_scrub_method(scrub_method)
912
+ else:
913
+ self.call_scrub_method(scrub_method)
914
+
915
+ return self.cleaned_texts
916
+
882
917
  @staticmethod
883
918
  def dataframe(
884
919
  df: pd.DataFrame = None,
885
920
  id_col: str = None,
921
+ exclude_cols: list[str] = None,
886
922
  scrub_methods: list[str] = ["all"],
887
923
  ) -> tuple[pd.DataFrame, pd.DataFrame]:
888
924
  """
@@ -891,6 +927,7 @@ class IDScrub:
891
927
  Args:
892
928
  df (pd.DataFrame): A Pandas dataframe to scrub.
893
929
  id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
930
+ exclude_cols (list): Columns to exclude from scrubbing. If None, all columns are scrubbed.
894
931
  scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
895
932
  These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
896
933
 
@@ -899,6 +936,8 @@ class IDScrub:
899
936
 
900
937
  """
901
938
 
939
+ assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
940
+
902
941
  if id_col:
903
942
  ids = df[id_col].to_list()
904
943
  if not id_col:
@@ -908,14 +947,18 @@ class IDScrub:
908
947
  assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
909
948
  assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
910
949
 
950
+ if exclude_cols is None:
951
+ cols_to_scrub = df.columns.to_list()
952
+ else:
953
+ cols_to_scrub = [col for col in df.columns if col not in exclude_cols]
954
+
955
+ cols_to_scrub.remove(id_col)
956
+
911
957
  scrubbed_df = df.copy()
912
958
 
913
959
  all_scrubbed_data = []
914
960
 
915
- for col in tqdm(scrubbed_df.columns):
916
- if col == id_col:
917
- continue
918
-
961
+ for col in tqdm(cols_to_scrub):
919
962
  original_dtype = scrubbed_df[col].dtype
920
963
  scrubbed_df[col] = scrubbed_df[col].astype(str)
921
964
 
@@ -944,4 +987,6 @@ class IDScrub:
944
987
  all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
945
988
  all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)
946
989
 
990
+ assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
991
+
947
992
  return scrubbed_df, all_scrubbed_data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Author: Department for Business and Trade
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -20,7 +20,7 @@ Dynamic: license-file
20
20
 
21
21
  # idscrub 🧽✨
22
22
 
23
- ## Project Info
23
+ ## Project Information
24
24
 
25
25
  * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
26
26
 
@@ -84,15 +84,15 @@ Dynamic: license-file
84
84
 
85
85
  ## Installation
86
86
 
87
- `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
87
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
88
88
 
89
89
  ```console
90
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
90
+ pip install idscrub
91
91
  ```
92
- or without spaCy installed (it will be installed automatically if name cleaning methods are called):
92
+ or with the spaCy transformer model (`en_core_web_trf`) already installed:
93
93
 
94
94
  ```console
95
- pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
95
+ pip install idscrub[trf]
96
96
  ```
97
97
 
98
98
  ## How to use the code
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
102
102
  ```python
103
103
  from idscrub import IDScrub
104
104
 
105
- scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
106
- scrubbed_texts = scrub.all()
105
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
106
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
107
107
 
108
108
  print(scrubbed_texts)
109
109
 
110
- # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
110
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
111
111
  ```
112
112
 
113
113
  ## AI Declaration
@@ -1,10 +1,14 @@
1
1
  .gitignore
2
2
  .pre-commit-config.yaml
3
+ CODEOWNERS
3
4
  LICENSE
4
5
  Makefile
5
6
  README.md
7
+ SECURITY.md
8
+ SECURITY_CHECKLIST.md
6
9
  pyproject.toml
7
10
  uv.lock
11
+ .github/pull_request_template.md
8
12
  .github/workflows/cd.yml
9
13
  .github/workflows/ci.yml
10
14
  idscrub/__init__.py
@@ -26,4 +30,5 @@ test/test_log.py
26
30
  test/test_persidio.py
27
31
  test/test_phonenumbers.py
28
32
  test/test_regex.py
33
+ test/test_scrub.py
29
34
  test/test_spacy.py
@@ -9,22 +9,139 @@
9
9
  },
10
10
  {
11
11
  "cell_type": "code",
12
- "execution_count": 1,
12
+ "execution_count": 12,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "INFO: Texts loaded.\n",
20
+ "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
21
+ "100%|██████████| 2/2 [00:00<00:00, 44.29it/s]\n",
22
+ "INFO: 3 spacy person scrubbed.\n",
23
+ "INFO: Scrubbing phone numbers using regex...\n",
24
+ "INFO: 1 uk phone numbers scrubbed.\n",
25
+ "INFO: Scrubbing UK postcodes using regex...\n",
26
+ "INFO: 1 uk postcodes scrubbed.\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], Lapland.']\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "from idscrub import IDScrub\n",
39
+ "\n",
40
+ "scrub = IDScrub(\n",
41
+ " [\n",
42
+ " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
43
+ " \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
44
+ " ]\n",
45
+ ")\n",
46
+ "\n",
47
+ "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_persons\", \"uk_phone_numbers\", \"uk_postcodes\"])\n",
48
+ "\n",
49
+ "print(scrubbed_texts)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 13,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "data": {
59
+ "text/html": [
60
+ "<div>\n",
61
+ "<style scoped>\n",
62
+ " .dataframe tbody tr th:only-of-type {\n",
63
+ " vertical-align: middle;\n",
64
+ " }\n",
65
+ "\n",
66
+ " .dataframe tbody tr th {\n",
67
+ " vertical-align: top;\n",
68
+ " }\n",
69
+ "\n",
70
+ " .dataframe thead th {\n",
71
+ " text-align: right;\n",
72
+ " }\n",
73
+ "</style>\n",
74
+ "<table border=\"1\" class=\"dataframe\">\n",
75
+ " <thead>\n",
76
+ " <tr style=\"text-align: right;\">\n",
77
+ " <th></th>\n",
78
+ " <th>text_id</th>\n",
79
+ " <th>scrubbed_spacy_person</th>\n",
80
+ " <th>scrubbed_uk_phone_numbers</th>\n",
81
+ " <th>scrubbed_uk_postcodes</th>\n",
82
+ " </tr>\n",
83
+ " </thead>\n",
84
+ " <tbody>\n",
85
+ " <tr>\n",
86
+ " <th>0</th>\n",
87
+ " <td>1</td>\n",
88
+ " <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
89
+ " <td>None</td>\n",
90
+ " <td>None</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>1</th>\n",
94
+ " <td>2</td>\n",
95
+ " <td>None</td>\n",
96
+ " <td>[+441111111111]</td>\n",
97
+ " <td>[AA11 1AA]</td>\n",
98
+ " </tr>\n",
99
+ " </tbody>\n",
100
+ "</table>\n",
101
+ "</div>"
102
+ ],
103
+ "text/plain": [
104
+ " text_id scrubbed_spacy_person \\\n",
105
+ "0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
106
+ "1 2 None \n",
107
+ "\n",
108
+ " scrubbed_uk_phone_numbers scrubbed_uk_postcodes \n",
109
+ "0 None None \n",
110
+ "1 [+441111111111] [AA11 1AA] "
111
+ ]
112
+ },
113
+ "execution_count": 13,
114
+ "metadata": {},
115
+ "output_type": "execute_result"
116
+ }
117
+ ],
118
+ "source": [
119
+ "scrub.get_scrubbed_data()"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "markdown",
124
+ "metadata": {},
125
+ "source": [
126
+ "Or scrub `all`:"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 14,
13
132
  "metadata": {},
14
133
  "outputs": [
15
134
  {
16
135
  "name": "stderr",
17
136
  "output_type": "stream",
18
137
  "text": [
19
- "/Users/euansoutter/Documents/code/idscrub/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
- " from .autonotebook import tqdm as notebook_tqdm\n",
21
138
  "INFO: Texts loaded.\n",
22
139
  "INFO: Scrubbing using Presidio...\n",
23
- "100%|██████████| 2/2 [00:00<00:00, 9.48it/s]\n",
140
+ "100%|██████████| 2/2 [00:00<00:00, 25.19it/s]\n",
24
141
  "INFO: 3 presidio person scrubbed.\n",
25
142
  "INFO: 1 presidio location scrubbed.\n",
26
143
  "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
27
- "100%|██████████| 2/2 [00:00<00:00, 55.76it/s]\n",
144
+ "100%|██████████| 2/2 [00:00<00:00, 48.66it/s]\n",
28
145
  "INFO: 0 spacy person scrubbed.\n",
29
146
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
30
147
  "INFO: 0 gb phone numbers scrubbed.\n",
@@ -59,14 +176,15 @@
59
176
  " \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
60
177
  " ]\n",
61
178
  ")\n",
62
- "scrubbed_texts = scrub.all()\n",
179
+ "\n",
180
+ "scrubbed_texts = scrub.scrub(scrub_methods=[\"all\"])\n",
63
181
  "\n",
64
182
  "print(scrubbed_texts)"
65
183
  ]
66
184
  },
67
185
  {
68
186
  "cell_type": "code",
69
- "execution_count": 2,
187
+ "execution_count": 15,
70
188
  "metadata": {},
71
189
  "outputs": [
72
190
  {
@@ -128,7 +246,7 @@
128
246
  "1 [Lapland] [+441111111111] [AA11 1AA] "
129
247
  ]
130
248
  },
131
- "execution_count": 2,
249
+ "execution_count": 15,
132
250
  "metadata": {},
133
251
  "output_type": "execute_result"
134
252
  }
@@ -146,7 +264,7 @@
146
264
  },
147
265
  {
148
266
  "cell_type": "code",
149
- "execution_count": 3,
267
+ "execution_count": 16,
150
268
  "metadata": {},
151
269
  "outputs": [
152
270
  {
@@ -155,7 +273,7 @@
155
273
  "text": [
156
274
  "INFO: Texts loaded.\n",
157
275
  "INFO: Scrubbing using Presidio...\n",
158
- "100%|██████████| 2/2 [00:00<00:00, 25.84it/s]\n",
276
+ "100%|██████████| 2/2 [00:00<00:00, 23.03it/s]\n",
159
277
  "INFO: 3 presidio person scrubbed.\n",
160
278
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
161
279
  "INFO: 0 gb phone numbers scrubbed.\n",
@@ -206,7 +324,7 @@
206
324
  },
207
325
  {
208
326
  "cell_type": "code",
209
- "execution_count": 4,
327
+ "execution_count": 17,
210
328
  "metadata": {},
211
329
  "outputs": [
212
330
  {
@@ -271,7 +389,7 @@
271
389
  "1 [ACHILLES] [+441111111111] [AA11 1AA] "
272
390
  ]
273
391
  },
274
- "execution_count": 4,
392
+ "execution_count": 17,
275
393
  "metadata": {},
276
394
  "output_type": "execute_result"
277
395
  }
@@ -290,7 +408,7 @@
290
408
  },
291
409
  {
292
410
  "cell_type": "code",
293
- "execution_count": 5,
411
+ "execution_count": 18,
294
412
  "metadata": {},
295
413
  "outputs": [
296
414
  {
@@ -299,7 +417,7 @@
299
417
  "text": [
300
418
  "INFO: Texts loaded.\n",
301
419
  "INFO: Scrubbing using Presidio...\n",
302
- "100%|██████████| 2/2 [00:00<00:00, 26.18it/s]\n",
420
+ "100%|██████████| 2/2 [00:00<00:00, 23.38it/s]\n",
303
421
  "INFO: 3 presidio person scrubbed.\n",
304
422
  "INFO: 1 presidio iban code scrubbed.\n"
305
423
  ]
@@ -325,7 +443,7 @@
325
443
  },
326
444
  {
327
445
  "cell_type": "code",
328
- "execution_count": 6,
446
+ "execution_count": 19,
329
447
  "metadata": {},
330
448
  "outputs": [
331
449
  {
@@ -381,7 +499,7 @@
381
499
  "1 [GB91BKEN10000041610008] "
382
500
  ]
383
501
  },
384
- "execution_count": 6,
502
+ "execution_count": 19,
385
503
  "metadata": {},
386
504
  "output_type": "execute_result"
387
505
  }
@@ -399,7 +517,7 @@
399
517
  },
400
518
  {
401
519
  "cell_type": "code",
402
- "execution_count": 7,
520
+ "execution_count": 20,
403
521
  "metadata": {},
404
522
  "outputs": [
405
523
  {
@@ -505,7 +623,7 @@
505
623
  "4 They did not expected a reply from otis.reddin... "
506
624
  ]
507
625
  },
508
- "execution_count": 7,
626
+ "execution_count": 20,
509
627
  "metadata": {},
510
628
  "output_type": "execute_result"
511
629
  }
@@ -551,21 +669,21 @@
551
669
  },
552
670
  {
553
671
  "cell_type": "code",
554
- "execution_count": 8,
672
+ "execution_count": 21,
555
673
  "metadata": {},
556
674
  "outputs": [
557
675
  {
558
676
  "name": "stderr",
559
677
  "output_type": "stream",
560
678
  "text": [
561
- " 0%| | 0/5 [00:00<?, ?it/s]INFO: Texts loaded.\n",
679
+ " 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
562
680
  "INFO: Scrubbing using Presidio...\n",
563
- "100%|██████████| 5/5 [00:00<00:00, 24.83it/s]\n",
681
+ "100%|██████████| 5/5 [00:00<00:00, 18.99it/s]\n",
564
682
  "INFO: 4 presidio person scrubbed.\n",
565
683
  "INFO: 4 presidio person scrubbed.\n",
566
684
  "INFO: 4 presidio person scrubbed.\n",
567
685
  "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
568
- "100%|██████████| 5/5 [00:00<00:00, 71.71it/s]\n",
686
+ "100%|██████████| 5/5 [00:00<00:00, 67.00it/s]\n",
569
687
  "INFO: 0 spacy person scrubbed.\n",
570
688
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
571
689
  "INFO: 0 gb phone numbers scrubbed.\n",
@@ -581,34 +699,13 @@
581
699
  "INFO: 0 uk postcodes scrubbed.\n",
582
700
  "INFO: Scrubbing titles using regex...\n",
583
701
  "INFO: 2 titles scrubbed.\n",
584
- " 40%|████ | 2/5 [00:02<00:03, 1.25s/it]INFO: Texts loaded.\n",
702
+ " 33%|███▎ | 1/3 [00:03<00:06, 3.24s/it]INFO: Texts loaded.\n",
585
703
  "INFO: Scrubbing using Presidio...\n",
586
- "100%|██████████| 5/5 [00:00<00:00, 29.98it/s]\n",
704
+ "100%|██████████| 5/5 [00:00<00:00, 21.83it/s]\n",
587
705
  "INFO: 2 presidio person scrubbed.\n",
588
706
  "INFO: 2 presidio person scrubbed.\n",
589
707
  "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
590
- "100%|██████████| 5/5 [00:00<00:00, 96.09it/s]\n",
591
- "INFO: 0 spacy person scrubbed.\n",
592
- "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
593
- "INFO: 0 gb phone numbers scrubbed.\n",
594
- "INFO: Scrubbing email addresses using regex...\n",
595
- "INFO: 0 email addresses scrubbed.\n",
596
- "INFO: Scrubbing @user handles using regex...\n",
597
- "INFO: 0 handles scrubbed.\n",
598
- "INFO: Scrubbing IP addresses using regex...\n",
599
- "INFO: 0 ip addresses scrubbed.\n",
600
- "INFO: Scrubbing phone numbers using regex...\n",
601
- "INFO: 0 uk phone numbers scrubbed.\n",
602
- "INFO: Scrubbing UK postcodes using regex...\n",
603
- "INFO: 0 uk postcodes scrubbed.\n",
604
- "INFO: Scrubbing titles using regex...\n",
605
- "INFO: 0 titles scrubbed.\n",
606
- " 60%|██████ | 3/5 [00:04<00:03, 1.66s/it]INFO: Texts loaded.\n",
607
- "INFO: Scrubbing using Presidio...\n",
608
- "100%|██████████| 5/5 [00:00<00:00, 26.73it/s]\n",
609
- "INFO: 1 presidio person scrubbed.\n",
610
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
611
- "100%|██████████| 5/5 [00:00<00:00, 89.71it/s]\n",
708
+ "100%|██████████| 5/5 [00:00<00:00, 84.69it/s]\n",
612
709
  "INFO: 0 spacy person scrubbed.\n",
613
710
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
614
711
  "INFO: 0 gb phone numbers scrubbed.\n",
@@ -624,15 +721,15 @@
624
721
  "INFO: 0 uk postcodes scrubbed.\n",
625
722
  "INFO: Scrubbing titles using regex...\n",
626
723
  "INFO: 0 titles scrubbed.\n",
627
- " 80%|████████ | 4/5 [00:07<00:01, 1.91s/it]INFO: Texts loaded.\n",
724
+ " 67%|██████▋ | 2/3 [00:06<00:03, 3.24s/it]INFO: Texts loaded.\n",
628
725
  "INFO: Scrubbing using Presidio...\n",
629
- "100%|██████████| 5/5 [00:00<00:00, 21.44it/s]\n",
630
- "INFO: 3 presidio email address scrubbed.\n",
726
+ "100%|██████████| 5/5 [00:00<00:00, 29.32it/s]\n",
727
+ "INFO: 5 presidio url scrubbed.\n",
631
728
  "INFO: 2 presidio person scrubbed.\n",
632
729
  "INFO: 3 presidio email address scrubbed.\n",
633
730
  "INFO: 3 presidio email address scrubbed.\n",
634
731
  "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
635
- "100%|██████████| 5/5 [00:00<00:00, 57.46it/s]\n",
732
+ "100%|██████████| 5/5 [00:00<00:00, 66.37it/s]\n",
636
733
  "INFO: 0 spacy person scrubbed.\n",
637
734
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
638
735
  "INFO: 0 gb phone numbers scrubbed.\n",
@@ -648,7 +745,7 @@
648
745
  "INFO: 4 uk postcodes scrubbed.\n",
649
746
  "INFO: Scrubbing titles using regex...\n",
650
747
  "INFO: 0 titles scrubbed.\n",
651
- "100%|██████████| 5/5 [00:09<00:00, 1.91s/it]\n"
748
+ "100%|██████████| 3/3 [00:08<00:00, 2.94s/it]\n"
652
749
  ]
653
750
  },
654
751
  {
@@ -685,7 +782,7 @@
685
782
  " <td>A</td>\n",
686
783
  " <td>[TITLE]. [PERSON] walked off; and [PERSON] rem...</td>\n",
687
784
  " <td>To [PERSON] she is always the woman.</td>\n",
688
- " <td>My dear [PERSON], do not waste your time upon ...</td>\n",
785
+ " <td>My dear Victor, do not waste your time upon th...</td>\n",
689
786
  " <td>The letter to [EMAIL_ADDRESS] was stamped with...</td>\n",
690
787
  " </tr>\n",
691
788
  " <tr>\n",
@@ -740,7 +837,7 @@
740
837
  "4 When you have eliminated the impossible, whate... \n",
741
838
  "\n",
742
839
  " Frankenstein \\\n",
743
- "0 My dear [PERSON], do not waste your time upon ... \n",
840
+ "0 My dear Victor, do not waste your time upon th... \n",
744
841
  "1 Learn from me, if not by my precepts, at least... \n",
745
842
  "2 I had worked hard for nearly two years, for th... \n",
746
843
  "3 Nothing is more painful to the human mind than... \n",
@@ -754,7 +851,7 @@
754
851
  "4 They did not expected a reply from [EMAIL_ADDR... "
755
852
  ]
756
853
  },
757
- "execution_count": 8,
854
+ "execution_count": 21,
758
855
  "metadata": {},
759
856
  "output_type": "execute_result"
760
857
  }
@@ -762,14 +859,14 @@
762
859
  "source": [
763
860
  "from idscrub import IDScrub\n",
764
861
  "\n",
765
- "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", scrub_methods=[\"all\"])\n",
862
+ "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], scrub_methods=[\"all\"])\n",
766
863
  "\n",
767
864
  "scrubbed_df"
768
865
  ]
769
866
  },
770
867
  {
771
868
  "cell_type": "code",
772
- "execution_count": 9,
869
+ "execution_count": 22,
773
870
  "metadata": {},
774
871
  "outputs": [
775
872
  {
@@ -862,17 +959,6 @@
862
959
  " <tr>\n",
863
960
  " <th>5</th>\n",
864
961
  " <td>A</td>\n",
865
- " <td>Frankenstein</td>\n",
866
- " <td>[Victor]</td>\n",
867
- " <td>None</td>\n",
868
- " <td>None</td>\n",
869
- " <td>None</td>\n",
870
- " <td>None</td>\n",
871
- " <td>None</td>\n",
872
- " </tr>\n",
873
- " <tr>\n",
874
- " <th>6</th>\n",
875
- " <td>A</td>\n",
876
962
  " <td>Fake book</td>\n",
877
963
  " <td>None</td>\n",
878
964
  " <td>None</td>\n",
@@ -882,7 +968,7 @@
882
968
  " <td>[SW1A 2AA]</td>\n",
883
969
  " </tr>\n",
884
970
  " <tr>\n",
885
- " <th>7</th>\n",
971
+ " <th>6</th>\n",
886
972
  " <td>B</td>\n",
887
973
  " <td>Fake book</td>\n",
888
974
  " <td>[Mick Jagger, David Bowie]</td>\n",
@@ -893,7 +979,7 @@
893
979
  " <td>[SW1A 2WH]</td>\n",
894
980
  " </tr>\n",
895
981
  " <tr>\n",
896
- " <th>8</th>\n",
982
+ " <th>7</th>\n",
897
983
  " <td>C</td>\n",
898
984
  " <td>Fake book</td>\n",
899
985
  " <td>None</td>\n",
@@ -904,7 +990,7 @@
904
990
  " <td>[SW19 5AE]</td>\n",
905
991
  " </tr>\n",
906
992
  " <tr>\n",
907
- " <th>9</th>\n",
993
+ " <th>8</th>\n",
908
994
  " <td>E</td>\n",
909
995
  " <td>Fake book</td>\n",
910
996
  " <td>None</td>\n",
@@ -925,11 +1011,10 @@
925
1011
  "2 C Pride and Prejudice [Elizabeth] \n",
926
1012
  "3 A The Adventures of Sherlock Holmes [Sherlock Holmes] \n",
927
1013
  "4 D The Adventures of Sherlock Holmes [Watson] \n",
928
- "5 A Frankenstein [Victor] \n",
929
- "6 A Fake book None \n",
930
- "7 B Fake book [Mick Jagger, David Bowie] \n",
931
- "8 C Fake book None \n",
932
- "9 E Fake book None \n",
1014
+ "5 A Fake book None \n",
1015
+ "6 B Fake book [Mick Jagger, David Bowie] \n",
1016
+ "7 C Fake book None \n",
1017
+ "8 E Fake book None \n",
933
1018
  "\n",
934
1019
  " scrubbed_titles scrubbed_presidio_email_address \\\n",
935
1020
  "0 [Mr] None \n",
@@ -937,11 +1022,10 @@
937
1022
  "2 None None \n",
938
1023
  "3 None None \n",
939
1024
  "4 None None \n",
940
- "5 None None \n",
941
- "6 None [freddie.mercury@queen.com] \n",
942
- "7 None None \n",
943
- "8 None [serena.williams@tennis.com] \n",
944
- "9 None [otis.redding@dockofthebay.org] \n",
1025
+ "5 None [freddie.mercury@queen.com] \n",
1026
+ "6 None None \n",
1027
+ "7 None [serena.williams@tennis.com] \n",
1028
+ "8 None [otis.redding@dockofthebay.org] \n",
945
1029
  "\n",
946
1030
  " scrubbed_presidio_iban_code scrubbed_presidio_url \\\n",
947
1031
  "0 None None \n",
@@ -949,11 +1033,10 @@
949
1033
  "2 None None \n",
950
1034
  "3 None None \n",
951
1035
  "4 None None \n",
952
- "5 None None \n",
953
- "6 [GB91BKEN10000041610008] [freddie.me, queen.com] \n",
954
- "7 None None \n",
955
- "8 None [tennis.com] \n",
956
- "9 None [otis.red, dockofthebay.org] \n",
1036
+ "5 [GB91BKEN10000041610008] [freddie.me, queen.com] \n",
1037
+ "6 None None \n",
1038
+ "7 None [tennis.com] \n",
1039
+ "8 None [otis.red, dockofthebay.org] \n",
957
1040
  "\n",
958
1041
  " scrubbed_uk_postcodes \n",
959
1042
  "0 None \n",
@@ -961,14 +1044,13 @@
961
1044
  "2 None \n",
962
1045
  "3 None \n",
963
1046
  "4 None \n",
964
- "5 None \n",
965
- "6 [SW1A 2AA] \n",
966
- "7 [SW1A 2WH] \n",
967
- "8 [SW19 5AE] \n",
968
- "9 [EH8 8DX] "
1047
+ "5 [SW1A 2AA] \n",
1048
+ "6 [SW1A 2WH] \n",
1049
+ "7 [SW19 5AE] \n",
1050
+ "8 [EH8 8DX] "
969
1051
  ]
970
1052
  },
971
- "execution_count": 9,
1053
+ "execution_count": 22,
972
1054
  "metadata": {},
973
1055
  "output_type": "execute_result"
974
1056
  }
@@ -0,0 +1,48 @@
1
+ import pandas as pd
2
+ from idscrub import IDScrub
3
+ from pandas.testing import assert_frame_equal
4
+
5
+
6
+ # Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
7
+ def test_scrub(scrub_object):
8
+ scrubbed = scrub_object.scrub(scrub_methods=["spacy_persons", "uk_phone_numbers", "uk_postcodes"])
9
+ assert scrubbed == [
10
+ "Our names are [PERSON], [PERSON], and [PERSON].",
11
+ "My number is [PHONENO] and I live at [POSTCODE].",
12
+ ]
13
+
14
+
15
+ def test_scrub_text_id():
16
+ scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
17
+
18
+ scrub.scrub(scrub_methods=["spacy_persons"])
19
+
20
+ df = scrub.get_scrubbed_data()
21
+
22
+ assert df["text_id"].max() == 10
23
+ assert len(df["text_id"]) == 10
24
+
25
+
26
+ def test_scrub_get_scrubbed_data(scrub_object):
27
+ scrub_object.scrub(scrub_methods=["uk_postcodes"])
28
+ df = scrub_object.get_scrubbed_data()
29
+
30
+ expected_df = pd.DataFrame(
31
+ {
32
+ "text_id": {0: 2},
33
+ "scrubbed_uk_postcodes": {0: ["AA11 1AA"]},
34
+ }
35
+ )
36
+
37
+ assert_frame_equal(df, expected_df)
38
+
39
+
40
+ def test_scrub_order(scrub_object):
41
+ scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_persons"])
42
+
43
+ assert scrub_object.get_scrubbed_data().columns.to_list() == [
44
+ "text_id",
45
+ "scrubbed_uk_postcodes",
46
+ "scrubbed_uk_phone_numbers",
47
+ "scrubbed_spacy_person",
48
+ ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes