idscrub 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ pull_request:
7
+ branches:
8
+ - main
9
+ workflow_dispatch:
10
+
11
+ jobs:
12
+ test:
13
+ name: test
14
+ runs-on: ubuntu-latest
15
+ permissions:
16
+ contents: read
17
+ strategy:
18
+ matrix:
19
+ python-version:
20
+ - "3.12"
21
+ - "3.13"
22
+
23
+ steps:
24
+ - uses: actions/checkout@v5
25
+
26
+ - name: Install uv and Python
27
+ uses: astral-sh/setup-uv@v6
28
+ with:
29
+ enable-cache: true
30
+ cache-dependency-glob: "uv.lock"
31
+ python-version: ${{ matrix.python-version }}
32
+
33
+ - name: Install the project
34
+ run: uv sync --frozen --all-extras --dev
35
+
36
+ - name: Run Python tests
37
+ run: uv run pytest
38
+
39
+ - name: Dump docker logs
40
+ if: failure()
41
+ uses: jwalton/gh-docker-logs@v2
42
+
43
+ scan-for-secrets:
44
+ runs-on: ubuntu-latest
45
+ permissions:
46
+ contents: read
47
+
48
+ steps:
49
+ - uses: actions/checkout@v5
50
+ with:
51
+ fetch-depth: 0
52
+
53
+ - name: Run TruffleHog scan
54
+ env:  # pass the ref name to the shell as data; never interpolate it into the script text
55
+ SCAN_BRANCH: ${{ github.head_ref || github.ref_name }}
56
+ run: |
57
+ docker run --rm -v ${{ github.workspace }}:/repo \
58
+ trufflesecurity/trufflehog:latest \
59
+ git file:///repo --since-commit main \
60
+ --branch "$SCAN_BRANCH" \
61
+ --github-actions --fail
62
+
63
+ build-package:
64
+ runs-on: ubuntu-latest
65
+ permissions:
66
+ contents: read
67
+
68
+ steps:
69
+ - uses: actions/checkout@v5
70
+ with:
71
+ fetch-depth: 0
72
+
73
+ - name: Install uv and Python
74
+ uses: astral-sh/setup-uv@v6
75
+ with:
76
+ python-version: "3.12"
77
+
78
+ - name: Build package
79
+ run: uv build
80
+
81
+ ci-success:
82
+ needs: [test, scan-for-secrets, build-package]
83
+ runs-on: ubuntu-latest
84
+ permissions:
85
+ contents: read
86
+
87
+ steps:
88
+ - name: CI success
89
+ run: echo "All CI checks passed!"
@@ -0,0 +1,18 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # idscrub specific
13
+ huggingface
14
+
15
+ .pytest_cache
16
+ .ruff_cache
17
+ .coverage
18
+ .DS_Store
@@ -0,0 +1,22 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ # Run the formatter.
5
+ - id: ruff
6
+ name: ruff
7
+ description: "Lint and format code using Ruff"
8
+ entry: make format
9
+ language: system
10
+ types_or: [python, pyi]
11
+ pass_filenames: false
12
+
13
+ # Mandatory internal hooks
14
+ - repo: https://github.com/uktrade/github-standards
15
+ rev: v1.1.2 # update periodically with pre-commit autoupdate
16
+ hooks:
17
+ - id: run-security-scan
18
+ verbose: false
19
+ - id: run-personal-data-scan
20
+ verbose: false
21
+ - id: validate-security-scan
22
+ verbose: false
idscrub-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Department for Business and Trade
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
idscrub-0.1.0/Makefile ADDED
@@ -0,0 +1,15 @@
1
+ .PHONY: clean format test
2
+
3
+ ## Delete all compiled Python files
4
+ clean:
5
+ find . -type f -name "*.py[co]" -delete
6
+ find . -type d -name "__pycache__" -delete
7
+
8
+ ## Reformat, lint
9
+ format:
10
+ uv run ruff format .
11
+ uv run ruff check . --fix
12
+
13
+ ## Run Python tests
14
+ test:
15
+ uv run pytest
idscrub-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,147 @@
1
+ Metadata-Version: 2.4
2
+ Name: idscrub
3
+ Version: 0.1.0
4
+ Author: Department for Business and Trade
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: ipykernel>=7.1.0
9
+ Requires-Dist: numpy>=2.3.4
10
+ Requires-Dist: pandas>=2.3.3
11
+ Requires-Dist: phonenumbers>=9.0.18
12
+ Requires-Dist: pip>=25.3
13
+ Requires-Dist: spacy-transformers>=1.3.9
14
+ Requires-Dist: tqdm>=4.67.1
15
+ Requires-Dist: presidio-analyzer
16
+ Requires-Dist: presidio-anonymizer
17
+ Provides-Extra: trf
18
+ Requires-Dist: en_core_web_trf; extra == "trf"
19
+ Dynamic: license-file
20
+
21
+ # idscrub 🧽✨
22
+
23
+ ## Project Info
24
+
25
+ * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
26
+
27
+ > [!WARNING]
28
+ > You must follow [GDPR guidance](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/the-research-provisions/principles-and-grounds-for-processing/) when processing personal data using this package.
29
+ >
30
+ > Specifically, you must:
31
+ >
32
+ > - **Update privacy notices**: Clearly state this processing activity in new or existing privacy notices before using the package.
33
+ > - **Ensure secure deletion**: Remove any temporary or intermediary files and outputs in a secure manner.
34
+ > - **Ensure data subject rights upheld**: Ensure individuals can access, correct, or erase their data as required.
35
+ > - **Maintain processing records**: Document how personal data is handled and for what purpose.
36
+
37
+ ### Description
38
+
39
+ * Names and other personally identifying information are often present in text.
40
+ * This information may need to be removed prior to further analysis in many cases.
41
+ * `idscrub` provides a standardised way to do this in the Department for Business and Trade.
42
+
43
+ ### Expected Outputs
44
+
45
+ * A list of text with names and other identifying information removed.
46
+
47
+ > [!WARNING]
48
+ > * This package has been designed as a *first pass* for standardised personal data removal.
49
+ > * Users are encouraged to check and confirm outputs and conduct manual reviews where necessary, e.g. when cleaning high risk datasets.
50
+ > * It is up to the user to assess whether this removal process needs to be supplemented by other methods for their given dataset and security requirements.
51
+
52
+ ### Data
53
+
54
+ - This package is designed for text-based documents structured as a list of strings.
55
+ - It performs best when contextual meaning can be inferred from the text.
56
+ - For best results, input text should therefore resemble natural language.
57
+ - **Highly fragmented, informal, technical, or syntactically broken text may reduce detection accuracy and lead to incomplete or incorrect name detection.**
58
+
59
+ ### Biases and evaluation
60
+
61
+ - `idscrub` supports integration with spaCy and Hugging Face models for name cleaning.
62
+ - These models are state-of-the-art, capable of identifying approximately 90% of named entities, but **may not remove all names**.
63
+ - **Biases present in these models due to their training data may affect performance**. For example:
64
+ - English names may be more reliably identified than names common in other languages.
65
+ - Uncommon or non-Western naming conventions may be missed or misclassified.
66
+
67
+ > [!IMPORTANT]
68
+ > * See [our wiki](https://github.com/uktrade/idscrub/wiki/Evaluation) for further details and notes on our evaluation of `idscrub`.
69
+
70
+ ### Models and Memory
71
+
72
+ * Only spaCy's `en_core_web_trf` model has been formally evaluated; no Hugging Face models have been.
73
+ * We therefore recommend that the current default `en_core_web_trf` is used for name scrubbing. **Other models need to be evaluated by the user.**
74
+
75
+ > [!IMPORTANT]
76
+ > spaCy and Hugging Face models have high memory requirements. To avoid memory-related errors, clear the auto-generated `huggingface` folder when it is not in use. Do not push the `huggingface` folder (or user-defined equivalent) to GitHub.
77
+
78
+ ## Similar Python packages
79
+
80
+ * Similar packages exist for undertaking this task, such as [presidio](https://microsoft.github.io/presidio/), [scrubadub](https://github.com/LeapBeyond/scrubadub) and [sanityze](https://github.com/UBC-MDS/sanityze).
81
+ * Development of `idscrub` was undertaken to: bring together different scrubbing methods across the department, adhere to infrastructure requirements, guarantee future stability and maintainability, and encourage future scrubbing methods to be added collaboratively and transparently.
82
+ * To leverage the power of other packages, we have added methods that allow you to interact with them. These include: `IDScrub.presidio()` and `IDScrub.google_phone_numbers()`. See the [usage example notebook](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) and method docstrings for further information.
83
+
84
+
85
+ ## Installation
86
+
87
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
88
+
89
+ ```console
90
+ pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
91
+ ```
92
+ or without spaCy installed (it will be installed automatically if name cleaning methods are called):
93
+
94
+ ```console
95
+ pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
96
+ ```
97
+
98
+ ## How to use the code
99
+
100
+ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
101
+
102
+ ```python
103
+ from idscrub import IDScrub
104
+
105
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
106
+ scrubbed_texts = scrub.all()
107
+
108
+ print(scrubbed_texts)
109
+
110
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
111
+ ```
112
+
113
+ ## AI Declaration
114
+
115
+ AI has been used in the development of `idscrub`, primarily to develop regular expressions, suggest code refinements and draft documentation.
116
+
117
+ ## Development setup
118
+
119
+ This project is managed by [uv](https://docs.astral.sh/uv/).
120
+
121
+ To install all dependencies for this project, run:
122
+
123
+ ```console
124
+ uv sync --all-extras
125
+ ```
126
+
127
+ If you do not have Python 3.12, run:
128
+
129
+ ```console
130
+ uv python install 3.12
131
+ ```
132
+
133
+ To run tests:
134
+
135
+ ```console
136
+ uv run pytest
137
+ ```
138
+
139
+ or
140
+
141
+ ```console
142
+ make test
143
+ ```
144
+
145
+ ## Author
146
+
147
+ Analytical Data Science, Department for Business and Trade
@@ -0,0 +1,127 @@
1
+ # idscrub 🧽✨
2
+
3
+ ## Project Info
4
+
5
+ * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
6
+
7
+ > [!WARNING]
8
+ > You must follow [GDPR guidance](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/the-research-provisions/principles-and-grounds-for-processing/) when processing personal data using this package.
9
+ >
10
+ > Specifically, you must:
11
+ >
12
+ > - **Update privacy notices**: Clearly state this processing activity in new or existing privacy notices before using the package.
13
+ > - **Ensure secure deletion**: Remove any temporary or intermediary files and outputs in a secure manner.
14
+ > - **Ensure data subject rights upheld**: Ensure individuals can access, correct, or erase their data as required.
15
+ > - **Maintain processing records**: Document how personal data is handled and for what purpose.
16
+
17
+ ### Description
18
+
19
+ * Names and other personally identifying information are often present in text.
20
+ * This information may need to be removed prior to further analysis in many cases.
21
+ * `idscrub` provides a standardised way to do this in the Department for Business and Trade.
22
+
23
+ ### Expected Outputs
24
+
25
+ * A list of text with names and other identifying information removed.
26
+
27
+ > [!WARNING]
28
+ > * This package has been designed as a *first pass* for standardised personal data removal.
29
+ > * Users are encouraged to check and confirm outputs and conduct manual reviews where necessary, e.g. when cleaning high risk datasets.
30
+ > * It is up to the user to assess whether this removal process needs to be supplemented by other methods for their given dataset and security requirements.
31
+
32
+ ### Data
33
+
34
+ - This package is designed for text-based documents structured as a list of strings.
35
+ - It performs best when contextual meaning can be inferred from the text.
36
+ - For best results, input text should therefore resemble natural language.
37
+ - **Highly fragmented, informal, technical, or syntactically broken text may reduce detection accuracy and lead to incomplete or incorrect name detection.**
38
+
39
+ ### Biases and evaluation
40
+
41
+ - `idscrub` supports integration with spaCy and Hugging Face models for name cleaning.
42
+ - These models are state-of-the-art, capable of identifying approximately 90% of named entities, but **may not remove all names**.
43
+ - **Biases present in these models due to their training data may affect performance**. For example:
44
+ - English names may be more reliably identified than names common in other languages.
45
+ - Uncommon or non-Western naming conventions may be missed or misclassified.
46
+
47
+ > [!IMPORTANT]
48
+ > * See [our wiki](https://github.com/uktrade/idscrub/wiki/Evaluation) for further details and notes on our evaluation of `idscrub`.
49
+
50
+ ### Models and Memory
51
+
52
+ * Only spaCy's `en_core_web_trf` model has been formally evaluated; no Hugging Face models have been.
53
+ * We therefore recommend that the current default `en_core_web_trf` is used for name scrubbing. **Other models need to be evaluated by the user.**
54
+
55
+ > [!IMPORTANT]
56
+ > spaCy and Hugging Face models have high memory requirements. To avoid memory-related errors, clear the auto-generated `huggingface` folder when it is not in use. Do not push the `huggingface` folder (or user-defined equivalent) to GitHub.
57
+
58
+ ## Similar Python packages
59
+
60
+ * Similar packages exist for undertaking this task, such as [presidio](https://microsoft.github.io/presidio/), [scrubadub](https://github.com/LeapBeyond/scrubadub) and [sanityze](https://github.com/UBC-MDS/sanityze).
61
+ * Development of `idscrub` was undertaken to: bring together different scrubbing methods across the department, adhere to infrastructure requirements, guarantee future stability and maintainability, and encourage future scrubbing methods to be added collaboratively and transparently.
62
+ * To leverage the power of other packages, we have added methods that allow you to interact with them. These include: `IDScrub.presidio()` and `IDScrub.google_phone_numbers()`. See the [usage example notebook](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) and method docstrings for further information.
63
+
64
+
65
+ ## Installation
66
+
67
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
68
+
69
+ ```console
70
+ pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
71
+ ```
72
+ or without spaCy installed (it will be installed automatically if name cleaning methods are called):
73
+
74
+ ```console
75
+ pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
76
+ ```
77
+
78
+ ## How to use the code
79
+
80
+ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
81
+
82
+ ```python
83
+ from idscrub import IDScrub
84
+
85
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
86
+ scrubbed_texts = scrub.all()
87
+
88
+ print(scrubbed_texts)
89
+
90
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
91
+ ```
92
+
93
+ ## AI Declaration
94
+
95
+ AI has been used in the development of `idscrub`, primarily to develop regular expressions, suggest code refinements and draft documentation.
96
+
97
+ ## Development setup
98
+
99
+ This project is managed by [uv](https://docs.astral.sh/uv/).
100
+
101
+ To install all dependencies for this project, run:
102
+
103
+ ```console
104
+ uv sync --all-extras
105
+ ```
106
+
107
+ If you do not have Python 3.12, run:
108
+
109
+ ```console
110
+ uv python install 3.12
111
+ ```
112
+
113
+ To run tests:
114
+
115
+ ```console
116
+ uv run pytest
117
+ ```
118
+
119
+ or
120
+
121
+ ```console
122
+ make test
123
+ ```
124
+
125
+ ## Author
126
+
127
+ Analytical Data Science, Department for Business and Trade
@@ -0,0 +1 @@
1
+ from .scrub import IDScrub as IDScrub
@@ -0,0 +1,10 @@
1
+ import os
2
+ from pathlib import Path
3
+ # Absolute, symlink-resolved directory one level above the folder that contains this file.
4
+ PROJECT_DIR = Path(__file__).resolve().parents[1]
5
+ # Current working directory, captured once at import time. Presumably where downloads go; confirm against callers.
6
+ DOWNLOAD_DIR = Path.cwd()
7
+ # "data" folder under PROJECT_DIR; a str (not a Path) because os.path.join returns str.
8
+ DATA_HOME = os.path.join(PROJECT_DIR, "data")
9
+ # "notebooks" folder under PROJECT_DIR; also a str.
10
+ NOTEBOOKS_HOME = os.path.join(PROJECT_DIR, "notebooks")