idscrub 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub-0.2.0/.github/pull_request_template.md +27 -0
- idscrub-0.2.0/.github/workflows/cd.yml +35 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/.pre-commit-config.yaml +1 -1
- idscrub-0.2.0/CODEOWNERS +1 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/PKG-INFO +9 -9
- {idscrub-0.1.0 → idscrub-0.2.0}/README.md +8 -8
- idscrub-0.2.0/SECURITY.md +47 -0
- idscrub-0.2.0/SECURITY_CHECKLIST.md +117 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub/scrub.py +50 -5
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub.egg-info/PKG-INFO +9 -9
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub.egg-info/SOURCES.txt +6 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub.egg-info/top_level.txt +0 -1
- {idscrub-0.1.0 → idscrub-0.2.0}/notebooks/basic_usage.ipynb +173 -91
- {idscrub-0.1.0 → idscrub-0.2.0}/pyproject.toml +1 -2
- idscrub-0.2.0/test/test_scrub.py +48 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/uv.lock +0 -236
- {idscrub-0.1.0 → idscrub-0.2.0}/.github/workflows/ci.yml +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/.gitignore +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/LICENSE +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/Makefile +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub/__init__.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub/locations.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub.egg-info/dependency_links.txt +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/idscrub.egg-info/requires.txt +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/setup.cfg +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/conftest.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_all.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_chain.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_dataframe.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_huggingface.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_id.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_log.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_persidio.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_phonenumbers.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_regex.py +0 -0
- {idscrub-0.1.0 → idscrub-0.2.0}/test/test_spacy.py +0 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
## What
|
|
2
|
+
|
|
3
|
+
<!---
|
|
4
|
+
What is this PR doing, e.g. implementations, algorithms, etc.?
|
|
5
|
+
* Set the scene - you probably have a lot of context in your head that the reader doesn't have.
|
|
6
|
+
* Explain like I'm 5 - try to make as few assumptions as possible about the reader
|
|
7
|
+
* Use pictures, screenshots, or a diagram if you can, for example https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-diagrams#creating-mermaid-diagrams
|
|
8
|
+
--->
|
|
9
|
+
|
|
10
|
+
## Why
|
|
11
|
+
|
|
12
|
+
<!---
|
|
13
|
+
Why is this change happening, e.g. goals, use cases, stories, etc.?
|
|
14
|
+
* Explain what the problem was that this PR addresses.
|
|
15
|
+
* Explain why this solution was chosen, and any alternatives considered.
|
|
16
|
+
* Mention any assumptions, deliberately ignored edge-cases, or changes that are left for later.
|
|
17
|
+
--->
|
|
18
|
+
|
|
19
|
+
## How this has been tested
|
|
20
|
+
|
|
21
|
+
- [ ] I have tested locally
|
|
22
|
+
- [ ] I have added a new unit test (if appropriate)
|
|
23
|
+
- [ ] Testing not required
|
|
24
|
+
|
|
25
|
+
## Reviewer Checklist
|
|
26
|
+
|
|
27
|
+
- [ ] I have reviewed the PR and ensured no secret values are present
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: "Publish"
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
# Publish on any tag starting with a `v`, e.g., v0.1.0
|
|
7
|
+
- v*
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
run:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
environment:
|
|
13
|
+
name: pypi
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write
|
|
16
|
+
contents: read
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout
|
|
19
|
+
uses: actions/checkout@v5
|
|
20
|
+
- name: Install uv
|
|
21
|
+
uses: astral-sh/setup-uv@v7
|
|
22
|
+
|
|
23
|
+
- name: Install Python 3.12
|
|
24
|
+
run: uv python install 3.12
|
|
25
|
+
- name: Build
|
|
26
|
+
run: uv build
|
|
27
|
+
|
|
28
|
+
# Check that basic features work and that we didn't forget to include crucial files
|
|
29
|
+
- name: Regex test (wheel)
|
|
30
|
+
run: uv run --isolated --no-project --with dist/*.whl test/test_regex.py
|
|
31
|
+
- name: Regex test (source distribution)
|
|
32
|
+
run: uv run --isolated --no-project --with dist/*.tar.gz test/test_regex.py
|
|
33
|
+
|
|
34
|
+
- name: Publish
|
|
35
|
+
run: uv publish
|
|
@@ -12,7 +12,7 @@ repos:
|
|
|
12
12
|
|
|
13
13
|
# Mandatory internal hooks
|
|
14
14
|
- repo: https://github.com/uktrade/github-standards
|
|
15
|
-
rev: v1.1
|
|
15
|
+
rev: v1.2.1 # update periodically with pre-commit autoupdate
|
|
16
16
|
hooks:
|
|
17
17
|
- id: run-security-scan
|
|
18
18
|
verbose: false
|
idscrub-0.2.0/CODEOWNERS
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
* @uktrade/ag-data-science
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -20,7 +20,7 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
# idscrub 🧽✨
|
|
22
22
|
|
|
23
|
-
## Project
|
|
23
|
+
## Project Information
|
|
24
24
|
|
|
25
25
|
* This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
26
26
|
|
|
@@ -84,15 +84,15 @@ Dynamic: license-file
|
|
|
84
84
|
|
|
85
85
|
## Installation
|
|
86
86
|
|
|
87
|
-
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example
|
|
87
|
+
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
88
88
|
|
|
89
89
|
```console
|
|
90
|
-
pip install
|
|
90
|
+
pip install idscrub
|
|
91
91
|
```
|
|
92
|
-
or
|
|
92
|
+
or with the spaCy transformer model (`en_core_web_trf`) already installed:
|
|
93
93
|
|
|
94
94
|
```console
|
|
95
|
-
pip
|
|
95
|
+
pip install idscrub[trf]
|
|
96
96
|
```
|
|
97
97
|
|
|
98
98
|
## How to use the code
|
|
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
|
|
|
102
102
|
```python
|
|
103
103
|
from idscrub import IDScrub
|
|
104
104
|
|
|
105
|
-
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA
|
|
106
|
-
scrubbed_texts = scrub.
|
|
105
|
+
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
106
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
|
|
107
107
|
|
|
108
108
|
print(scrubbed_texts)
|
|
109
109
|
|
|
110
|
-
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE]
|
|
110
|
+
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
|
|
111
111
|
```
|
|
112
112
|
|
|
113
113
|
## AI Declaration
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# idscrub 🧽✨
|
|
2
2
|
|
|
3
|
-
## Project
|
|
3
|
+
## Project Information
|
|
4
4
|
|
|
5
5
|
* This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
6
6
|
|
|
@@ -64,15 +64,15 @@
|
|
|
64
64
|
|
|
65
65
|
## Installation
|
|
66
66
|
|
|
67
|
-
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example
|
|
67
|
+
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
68
68
|
|
|
69
69
|
```console
|
|
70
|
-
pip install
|
|
70
|
+
pip install idscrub
|
|
71
71
|
```
|
|
72
|
-
or
|
|
72
|
+
or with the spaCy transformer model (`en_core_web_trf`) already installed:
|
|
73
73
|
|
|
74
74
|
```console
|
|
75
|
-
pip
|
|
75
|
+
pip install idscrub[trf]
|
|
76
76
|
```
|
|
77
77
|
|
|
78
78
|
## How to use the code
|
|
@@ -82,12 +82,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
|
|
|
82
82
|
```python
|
|
83
83
|
from idscrub import IDScrub
|
|
84
84
|
|
|
85
|
-
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA
|
|
86
|
-
scrubbed_texts = scrub.
|
|
85
|
+
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
86
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
|
|
87
87
|
|
|
88
88
|
print(scrubbed_texts)
|
|
89
89
|
|
|
90
|
-
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE]
|
|
90
|
+
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
|
|
91
91
|
```
|
|
92
92
|
|
|
93
93
|
## AI Declaration
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
The Department for Business and Trade (DBT) supports businesses to invest, grow and export, creating jobs and opportunities across the country. Read more about what we do on [GOV.UK](https://www.gov.uk/government/organisations/department-for-business-and-trade/about).
|
|
4
|
+
|
|
5
|
+
## Reporting a Vulnerability
|
|
6
|
+
|
|
7
|
+
If you believe you have found a security vulnerability, please submit your report to us [here](https://hackerone.com/2680e4cd-0436-42a5-bd2a-37fd86367276/embedded_submissions/new).
|
|
8
|
+
In your report please include details of:
|
|
9
|
+
|
|
10
|
+
- The website, IP or page where the vulnerability can be observed.
|
|
11
|
+
- A brief description of the type of vulnerability, for example; "XSS vulnerability".
|
|
12
|
+
- Steps to reproduce. These should be a benign, non-destructive, proof of concept. This helps to ensure that the report can be triaged quickly and accurately. It also reduces the likelihood of duplicate reports, or malicious exploitation of some vulnerabilities, such as sub-domain takeovers.
|
|
13
|
+
|
|
14
|
+
Vulnerability reporting guidelines
|
|
15
|
+
|
|
16
|
+
- Please do not share the vulnerability information beyond the owner and us, without express consent from the owner
|
|
17
|
+
- Vulnerabilities reported to the HackerOne platform can be submitted without the need to create a HackerOne account. However, if you wish to be updated you should create an account
|
|
18
|
+
- To submit your report, you will need to agree to the HackerOne Terms and Conditions and acknowledge that you have read their Privacy Policy and Disclosure Guidelines
|
|
19
|
+
- Once you have submitted the report, it will be assessed by NCC Group within five working days, and forwarded to the affected owners as soon as possible.
|
|
20
|
+
|
|
21
|
+
The DBT Cyber Team will attempt to make contact with the affected owner. However, the affected owner holds responsibility for resolving the issue.
|
|
22
|
+
|
|
23
|
+
## Repository security
|
|
24
|
+
|
|
25
|
+
A new organisation github policy has been created that will enforce a set of security checks we expect a repository in the uktrade account to have. This policy is enabled by default for any new repositories, however existing repositories need to have it applied to them before they can be made public. The expectation is that once enough repositories have been switched from the legacy security policy to the new security policy, this new security policy is enforced across all repositories within the uktrade account
|
|
26
|
+
|
|
27
|
+
### Custom properties
|
|
28
|
+
|
|
29
|
+
The uktrade account makes use of custom github properties to enforce branch protection rules and run organisation level github actions. New properties can be added by logging into github using a uktrade account and using the [custom props page](https://github.com/organizations/uktrade/settings/custom-properties)
|
|
30
|
+
|
|
31
|
+
### Code scanning
|
|
32
|
+
|
|
33
|
+
All uktrade repositories with the new security policy applied have CodeQL scanning enabled. Individual repositories can apply their own advanced scanning rules if required
|
|
34
|
+
|
|
35
|
+
### Push protection
|
|
36
|
+
|
|
37
|
+
To block known secrets being committed into github, all repositories with the new security policy applied will have push protection enabled and enforced.
|
|
38
|
+
|
|
39
|
+
### Branch protection
|
|
40
|
+
|
|
41
|
+
An organisation ruleset has been created to apply a minimum set of branch protection rules to each public repository in the uktrade account. These rules are to be seen as the minimum, and repository admins might decide to add additional rules to their own repositories. Organisation admins and repository admins have been added to the bypass list for this branch protection ruleset. The protection rules that will be applied to each repository are:
|
|
42
|
+
|
|
43
|
+
- A PR is required for merges into the default branch (usually main)
|
|
44
|
+
- At least 1 approver is required before a PR can be merged
|
|
45
|
+
- Any conversations on the PR must be marked as resolved
|
|
46
|
+
|
|
47
|
+
As these rules are applied as an organisation ruleset, it is not possible for repository admins to add their own rules that reduce this level of protection. As an example, a repository admin could add a ruleset that drops the required number of approvers to 0 but that would have no effect as the organisation ruleset would take precedence. They could add a ruleset that sets the number of approvers to 3, and as this is not reducing the organisation ruleset protection this would take precedence
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Security Checklist
|
|
2
|
+
|
|
3
|
+
This checklist is designed to make it easier to improve the security posture of a GitHub repository.
|
|
4
|
+
|
|
5
|
+
- It is mandatory for public repositories.
|
|
6
|
+
- This checklist must be copied over to the root of the repository.
|
|
7
|
+
- The repository steward is responsible for populating the checklist, or at least approving the related pull request.
|
|
8
|
+
- Any feedback should be shared with the GitHub Security working group.
|
|
9
|
+
|
|
10
|
+
## Checklist
|
|
11
|
+
|
|
12
|
+
- [x] [Setup the pre-commit hook framework](#setup-the-pre-commit-hook-framework)
|
|
13
|
+
- [x] [Setup custom properties on the repository](#setup-custom-properties-on-the-repository)
|
|
14
|
+
- [x] [Apply the correct github security policy](#apply-the-correct-github-security-policy)
|
|
15
|
+
- [x] [Ensure CODEOWNERS file exists](#ensure-codeowners-file-exists)
|
|
16
|
+
- [x] [Copy the SECURITY_CHECKLIST.md file](#copy-the-security_checklistmd-file)
|
|
17
|
+
- [x] [Review the GitHub CI/CD overview](#review-the-github-cicd-overview)
|
|
18
|
+
- [x] [Review the GitHub Safety Tips](#review-github-safety-tips)
|
|
19
|
+
- [x] [Add Steward to Repository access](#add-at-least-one-steward-to-repository-access)
|
|
20
|
+
- [x] [Review and limit maintainers with admin rights to the strict minimum](#review-and-limit-maintainers-with-admin-rights-to-the-strict-minimum)
|
|
21
|
+
- [x] [Review the Pull Request template](#review-pull-request-template)
|
|
22
|
+
- [x] [Review the SECURITY.md policy](#review-securitymd-policy)
|
|
23
|
+
|
|
24
|
+
## Setup the pre-commit hook framework
|
|
25
|
+
|
|
26
|
+
Several uktrade repositories already make use of the pre-commit framework for flagging code quality issues before pushing. Even in the repositories that have the pre-commit framework installed, it is still optional for an individual engineer to either avoid configuring the commit hooks, or skipping them entirely using the `--no-verify` cli argument.
|
|
27
|
+
|
|
28
|
+
As part of the go live process, each engineer making changes to the repository being reopened will be required to install the organisation wide pre-commit hooks locally. When a PR is opened, an organisation level github action will run to confirm the pre-commit hooks ran on the engineers machine and will block any PRs that have not run these hooks.
|
|
29
|
+
|
|
30
|
+
Instructions have been added to the [dbt hooks repository](https://github.com/uktrade/github-standards/blob/main/README.md#usage) to provide guidance on adding these organisation wide pre-commit hooks to an individual repository
|
|
31
|
+
|
|
32
|
+
## Setup custom properties on the repository
|
|
33
|
+
|
|
34
|
+
A set of github tags have been created at an organisation level, these must be applied to a repository to allow organisation level github actions to run on each pull request.
|
|
35
|
+
|
|
36
|
+
### Mandatory custom properties
|
|
37
|
+
|
|
38
|
+
- `reusable_workflow_opt_in`: This one has to be applied and set to `true` to allow this repository to apply the correct organisation branch protection ruleset and run the necessary github workflows on each PR
|
|
39
|
+
- `ddat_portfolio`: The portfolio inside DDAT this repository belongs to. If your portfolio is missing, this can be added by raising an SRE ticket.
|
|
40
|
+
|
|
41
|
+
### Optional custom properties
|
|
42
|
+
|
|
43
|
+
- `is_docker`: If this repository builds a docker image, this tag should be added to run docker related github workflows
|
|
44
|
+
- `language`: All languages used by this repository should be selected, and github workflows will run with dedicated checks on that language.
|
|
45
|
+
|
|
46
|
+
## Apply the correct github security policy
|
|
47
|
+
|
|
48
|
+
To add the new security policy, follow these instructions:
|
|
49
|
+
|
|
50
|
+
1. As an organisation administrator, navigate to the [security config page](https://github.com/organizations/uktrade/settings/security_products).
|
|
51
|
+
1. Scroll down to the **Apply configurations** sections, and enter the name of the repository to be made public in the filter input field
|
|
52
|
+
1. Use the checkbox next to the results list to select all repositories being made public, then use the **Apply configuration** button to select the **Default DBT security** configuration
|
|
53
|
+
1. A confirmation modal will appear displaying a summary of the action being made. Click the apply button
|
|
54
|
+
1. In the repository that has had the new policy applied, navigate to the **Advanced Security** page in the repository settings. At the top of the page there should be a banner message **Modifications to some settings have been blocked by organization administrators.**
|
|
55
|
+
|
|
56
|
+
### Optional: Setup CodeQL to allow PRs from repository forks
|
|
57
|
+
|
|
58
|
+
For most repositories, the default CodeQL configuration applied by the **Default DBT security** policy will be sufficient. However, this default configuration does not currently support scanning PRs raised from a fork of a repository. If your repository needs to accept PRs from a fork, you must follow these steps to switch to the advanced CodeQL setup:
|
|
59
|
+
|
|
60
|
+
1. Open the GitHub settings page, and navigate to the Advanced Security section using the left hand menu
|
|
61
|
+
1. Scroll down to the Code Scanning section, under the Tools sub-section there will be an item for CodeQL analysis
|
|
62
|
+
1. Click the ... button next to Default setup text, then choose the Switch to advanced option from the menu
|
|
63
|
+
1. On the popup, click the Disable CodeQL button. Although you are disabling CodeQL, there is still a branch protection rule in place that blocks a PR unless a CodeQL scan is detected. Disabling here will not allow PRs to be merged
|
|
64
|
+
1. The GitHub online editor will open to create a new file called codeql.yml in your repo, and the contents of this file will be prefilled with the languages CodeQL has detected in your repo. You can modify the contents of this file if needed, however you must leave the workflow name as `CodeQL Advanced`
|
|
65
|
+
1. Once happy with the workflow file contents, click the green Commit changes button to trigger a PR to merge this into the main branch
|
|
66
|
+
1. Approve and merge the PR with this workflow file. Once merged, the CodeQL scan will perform an initial scan that can take a while but you can track the progress by viewing the Actions tab for your repository
|
|
67
|
+
|
|
68
|
+
## Ensure CODEOWNERS file exists
|
|
69
|
+
|
|
70
|
+
The organisation rulesets require a CODEOWNERS file to be present in the repository. If you don't already have one of these, github has produced [documentation explaining](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) what they are and why they are used.
|
|
71
|
+
|
|
72
|
+
## Copy the SECURITY_CHECKLIST.md file
|
|
73
|
+
|
|
74
|
+
To allow tracking of repositories that have successfully completed the reopening process, this file must be copied to the root of your repository and each of the items in the Checklist marked as completed
|
|
75
|
+
|
|
76
|
+
## Review the GitHub CI/CD overview
|
|
77
|
+
|
|
78
|
+
Internal contributors to the repository should review the CI/CD overview below
|
|
79
|
+

|
|
80
|
+
|
|
81
|
+
## Review GitHub Safety Tips
|
|
82
|
+
|
|
83
|
+
Internal contributors to the repository should review the [GitHub Safety Tips](https://uktrade.atlassian.net/wiki/x/n4AEKQE)
|
|
84
|
+
|
|
85
|
+
## Add at least one steward to repository access
|
|
86
|
+
|
|
87
|
+
To ensure correct governance of a repository, at least one steward must be added. This will usually be the most senior engineer on the team. To add a steward to a repository:
|
|
88
|
+
|
|
89
|
+
1. Open the `Collaborators and teams` settings page. The url for this is `https://github.com/uktrade/REPO_NAME/github-standards/settings/access`
|
|
90
|
+
1. Use the `Add people` button to open the people finder autocomplete box.
|
|
91
|
+
1. Find and click the user who is going to be a steward
|
|
92
|
+
1. On the Choose a role page, select the `Steward` role.
|
|
93
|
+
1. Repeat for any additional users who are going to be a steward
|
|
94
|
+
|
|
95
|
+
## Review and limit maintainers with admin rights to the strict minimum
|
|
96
|
+
|
|
97
|
+
You should review who has been assigned the github `admin` role. The `write` role is sufficient to allow team members to commit changes and raise pull requests
|
|
98
|
+
|
|
99
|
+
## Review Pull Request template
|
|
100
|
+
|
|
101
|
+
If your repository does not already contain a pull_request_template.md file, by default you will inherit the template from this repository. If you are already using your own template, you should add this section to remind reviewers they should be ensuring no secret values are visible
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
## Reviewer Checklist
|
|
105
|
+
|
|
106
|
+
- [ ] I have reviewed the PR and ensured no secret values are present
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Review SECURITY.md policy
|
|
110
|
+
|
|
111
|
+
This repository contains the SECURITY.md file, which is inherited by all repositories in the uktrade organisation account. This file should be read and understood by the repository steward, and discussed with the team to ensure all engineers understand the tooling that has been put in place.
|
|
112
|
+
|
|
113
|
+
## More information
|
|
114
|
+
|
|
115
|
+
For more information about GitHub security standards, please refer [to this link](https://dbis.sharepoint.com/:w:/r/sites/DDaTDirectorate/Shared%20Documents/Work%20-%20GitHub%20Security/Github%20Security%20Framework/Guidelines%20and%20Policies/GitHub%20Security%20Standards%20v0.5.docx?d=wb29cd9b99ca042deb5c0cd8d670966d9&csf=1&web=1&e=6ITbnL)
|
|
116
|
+
|
|
117
|
+
For more details about the security features please refer to the [GitHub Standards](https://github.com/uktrade/github-standards) repo.
|
|
@@ -19,7 +19,7 @@ from tqdm import tqdm
|
|
|
19
19
|
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
|
20
20
|
from transformers.utils import logging as trf_logging
|
|
21
21
|
|
|
22
|
-
from idscrub.locations import DOWNLOAD_DIR
|
|
22
|
+
from idscrub.locations import DOWNLOAD_DIR
|
|
23
23
|
|
|
24
24
|
# Suppress Torch FutureWarning
|
|
25
25
|
# TODO: Find better way
|
|
@@ -879,10 +879,46 @@ class IDScrub:
|
|
|
879
879
|
|
|
880
880
|
return scrub_methods.get(scrub_method, lambda: "Unknown method.")()
|
|
881
881
|
|
|
882
|
+
def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
|
|
883
|
+
"""
|
|
884
|
+
Scrubs text using given methods (in order).
|
|
885
|
+
Uses default values for the given scrub method.
|
|
886
|
+
|
|
887
|
+
Methods available (see associated method docstring for further information):
|
|
888
|
+
|
|
889
|
+
"all", "spacy_persons", "huggingface_persons", "email_addresses", "handles",
|
|
890
|
+
"ip_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes",
|
|
891
|
+
"titles", "presidio"
|
|
892
|
+
|
|
893
|
+
Example:
|
|
894
|
+
|
|
895
|
+
"email_addresses" = scrub.email_addresses()
|
|
896
|
+
|
|
897
|
+
Therefore we can call:
|
|
898
|
+
|
|
899
|
+
IDScrub.scrub(scrub_methods = ["email_addresses"])
|
|
900
|
+
|
|
901
|
+
Args:
|
|
902
|
+
scrub_methods (list[str]): string names of the scrub methods to apply (in order).
|
|
903
|
+
|
|
904
|
+
Returns:
|
|
905
|
+
list[str]: The input list of text with personal information replaced.
|
|
906
|
+
|
|
907
|
+
"""
|
|
908
|
+
|
|
909
|
+
for i, scrub_method in enumerate(scrub_methods):
|
|
910
|
+
if i == len(scrub_methods) - 1:
|
|
911
|
+
self.call_scrub_method(scrub_method)
|
|
912
|
+
else:
|
|
913
|
+
self.call_scrub_method(scrub_method)
|
|
914
|
+
|
|
915
|
+
return self.cleaned_texts
|
|
916
|
+
|
|
882
917
|
@staticmethod
|
|
883
918
|
def dataframe(
|
|
884
919
|
df: pd.DataFrame = None,
|
|
885
920
|
id_col: str = None,
|
|
921
|
+
exclude_cols: list[str] = None,
|
|
886
922
|
scrub_methods: list[str] = ["all"],
|
|
887
923
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
888
924
|
"""
|
|
@@ -891,6 +927,7 @@ class IDScrub:
|
|
|
891
927
|
Args:
|
|
892
928
|
df (pd.DataFrame): A Pandas dataframe to scrub.
|
|
893
929
|
id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
|
|
930
|
+
exclude_cols (list): Columns to exclude from scrubbing. If None, all columns are scrubbed.
|
|
894
931
|
scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
|
|
895
932
|
These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
|
|
896
933
|
|
|
@@ -899,6 +936,8 @@ class IDScrub:
|
|
|
899
936
|
|
|
900
937
|
"""
|
|
901
938
|
|
|
939
|
+
assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
|
|
940
|
+
|
|
902
941
|
if id_col:
|
|
903
942
|
ids = df[id_col].to_list()
|
|
904
943
|
if not id_col:
|
|
@@ -908,14 +947,18 @@ class IDScrub:
|
|
|
908
947
|
assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
|
|
909
948
|
assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
|
|
910
949
|
|
|
950
|
+
if exclude_cols is None:
|
|
951
|
+
cols_to_scrub = df.columns.to_list()
|
|
952
|
+
else:
|
|
953
|
+
cols_to_scrub = [col for col in df.columns if col not in exclude_cols]
|
|
954
|
+
|
|
955
|
+
cols_to_scrub.remove(id_col)
|
|
956
|
+
|
|
911
957
|
scrubbed_df = df.copy()
|
|
912
958
|
|
|
913
959
|
all_scrubbed_data = []
|
|
914
960
|
|
|
915
|
-
for col in tqdm(
|
|
916
|
-
if col == id_col:
|
|
917
|
-
continue
|
|
918
|
-
|
|
961
|
+
for col in tqdm(cols_to_scrub):
|
|
919
962
|
original_dtype = scrubbed_df[col].dtype
|
|
920
963
|
scrubbed_df[col] = scrubbed_df[col].astype(str)
|
|
921
964
|
|
|
@@ -944,4 +987,6 @@ class IDScrub:
|
|
|
944
987
|
all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
|
|
945
988
|
all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)
|
|
946
989
|
|
|
990
|
+
assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
|
|
991
|
+
|
|
947
992
|
return scrubbed_df, all_scrubbed_data
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -20,7 +20,7 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
# idscrub 🧽✨
|
|
22
22
|
|
|
23
|
-
## Project
|
|
23
|
+
## Project Information
|
|
24
24
|
|
|
25
25
|
* This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
26
26
|
|
|
@@ -84,15 +84,15 @@ Dynamic: license-file
|
|
|
84
84
|
|
|
85
85
|
## Installation
|
|
86
86
|
|
|
87
|
-
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example
|
|
87
|
+
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
88
88
|
|
|
89
89
|
```console
|
|
90
|
-
pip install
|
|
90
|
+
pip install idscrub
|
|
91
91
|
```
|
|
92
|
-
or
|
|
92
|
+
or with the spaCy transformer model (`en_core_web_trf`) already installed:
|
|
93
93
|
|
|
94
94
|
```console
|
|
95
|
-
pip
|
|
95
|
+
pip install idscrub[trf]
|
|
96
96
|
```
|
|
97
97
|
|
|
98
98
|
## How to use the code
|
|
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
|
|
|
102
102
|
```python
|
|
103
103
|
from idscrub import IDScrub
|
|
104
104
|
|
|
105
|
-
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA
|
|
106
|
-
scrubbed_texts = scrub.
|
|
105
|
+
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
106
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
|
|
107
107
|
|
|
108
108
|
print(scrubbed_texts)
|
|
109
109
|
|
|
110
|
-
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE]
|
|
110
|
+
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
|
|
111
111
|
```
|
|
112
112
|
|
|
113
113
|
## AI Declaration
|
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
.gitignore
|
|
2
2
|
.pre-commit-config.yaml
|
|
3
|
+
CODEOWNERS
|
|
3
4
|
LICENSE
|
|
4
5
|
Makefile
|
|
5
6
|
README.md
|
|
7
|
+
SECURITY.md
|
|
8
|
+
SECURITY_CHECKLIST.md
|
|
6
9
|
pyproject.toml
|
|
7
10
|
uv.lock
|
|
11
|
+
.github/pull_request_template.md
|
|
12
|
+
.github/workflows/cd.yml
|
|
8
13
|
.github/workflows/ci.yml
|
|
9
14
|
idscrub/__init__.py
|
|
10
15
|
idscrub/locations.py
|
|
@@ -25,4 +30,5 @@ test/test_log.py
|
|
|
25
30
|
test/test_persidio.py
|
|
26
31
|
test/test_phonenumbers.py
|
|
27
32
|
test/test_regex.py
|
|
33
|
+
test/test_scrub.py
|
|
28
34
|
test/test_spacy.py
|