idscrub 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. idscrub-1.0.1/.github/pull_request_template.md +27 -0
  2. idscrub-1.0.1/.github/workflows/cd.yml +35 -0
  3. idscrub-1.0.1/.github/workflows/ci.yml +89 -0
  4. idscrub-1.0.1/.gitignore +18 -0
  5. idscrub-1.0.1/.pre-commit-config.yaml +22 -0
  6. idscrub-1.0.1/CODEOWNERS +1 -0
  7. idscrub-1.0.1/LICENSE +21 -0
  8. idscrub-1.0.1/Makefile +15 -0
  9. idscrub-1.0.1/PKG-INFO +149 -0
  10. idscrub-1.0.1/README.md +128 -0
  11. idscrub-1.0.1/SECURITY_CHECKLIST.md +117 -0
  12. idscrub-1.0.1/idscrub/__init__.py +1 -0
  13. idscrub-1.0.1/idscrub/locations.py +10 -0
  14. idscrub-1.0.1/idscrub/scrub.py +974 -0
  15. idscrub-1.0.1/idscrub.egg-info/PKG-INFO +149 -0
  16. idscrub-1.0.1/idscrub.egg-info/SOURCES.txt +34 -0
  17. idscrub-1.0.1/idscrub.egg-info/dependency_links.txt +1 -0
  18. idscrub-1.0.1/idscrub.egg-info/requires.txt +13 -0
  19. idscrub-1.0.1/idscrub.egg-info/top_level.txt +4 -0
  20. idscrub-1.0.1/notebooks/basic_usage.ipynb +1076 -0
  21. idscrub-1.0.1/pyproject.toml +102 -0
  22. idscrub-1.0.1/setup.cfg +4 -0
  23. idscrub-1.0.1/test/conftest.py +22 -0
  24. idscrub-1.0.1/test/test_all.py +39 -0
  25. idscrub-1.0.1/test/test_chain.py +54 -0
  26. idscrub-1.0.1/test/test_dataframe.py +160 -0
  27. idscrub-1.0.1/test/test_huggingface.py +25 -0
  28. idscrub-1.0.1/test/test_id.py +24 -0
  29. idscrub-1.0.1/test/test_label.py +17 -0
  30. idscrub-1.0.1/test/test_log.py +17 -0
  31. idscrub-1.0.1/test/test_persidio.py +44 -0
  32. idscrub-1.0.1/test/test_phonenumbers.py +13 -0
  33. idscrub-1.0.1/test/test_regex.py +123 -0
  34. idscrub-1.0.1/test/test_scrub.py +48 -0
  35. idscrub-1.0.1/test/test_spacy.py +26 -0
  36. idscrub-1.0.1/uv.lock +2416 -0
@@ -0,0 +1,27 @@
1
+ ## What
2
+
3
+ <!---
4
+ What is this PR doing, e.g. implementations, algorithms, etc.?
5
+ * Set the scene - you probably have a lot of context in your head that the reader doesn't have.
6
+ * Explain like I'm 5 - try to make as few assumptions as possible about the reader
7
+ * Use pictures, screenshots, or a diagram if you can, for example https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-diagrams#creating-mermaid-diagrams
8
+ --->
9
+
10
+ ## Why
11
+
12
+ <!---
13
+ Why is this change happening, e.g. goals, use cases, stories, etc.?
14
+ * Explain what the problem was that this PR addresses.
15
+ * Explain why this solution was chosen, and any alternatives considered.
16
+ * Mention any assumptions, deliberately ignored edge-cases, or changes that are left for later.
17
+ --->
18
+
19
+ ## How this has been tested
20
+
21
+ - [ ] I have tested locally
22
+ - [ ] I have added a new unit test (if appropriate)
23
+ - [ ] Testing not required
24
+
25
+ ## Reviewer Checklist
26
+
27
+ - [ ] I have reviewed the PR and ensured no secret values are present
@@ -0,0 +1,35 @@
1
+ name: "Publish"
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ # Publish on any tag starting with a `v`, e.g., v0.1.0
7
+ - v*
8
+
9
+ jobs:
10
+ run:
11
+ runs-on: ubuntu-latest
12
+ environment:
13
+ name: pypi
14
+ permissions:
15
+ id-token: write
16
+ contents: read
17
+ steps:
18
+ - name: Checkout
19
+ uses: actions/checkout@v5
20
+ - name: Install uv
21
+ uses: astral-sh/setup-uv@v7
22
+
23
+ - name: Install Python 3.12
24
+ run: uv python install 3.12
25
+ - name: Build
26
+ run: uv build
27
+
28
+ # Check that basic features work and that we didn't forget to include crucial files
29
+ - name: Regex test (wheel)
30
+ run: uv run --isolated --no-project --with dist/*.whl test/test_regex.py
31
+ - name: Regex test (source distribution)
32
+ run: uv run --isolated --no-project --with dist/*.tar.gz test/test_regex.py
33
+
34
+ - name: Publish
35
+ run: uv publish
@@ -0,0 +1,89 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ pull_request:
7
+ branches:
8
+ - main
9
+ workflow_dispatch:
10
+
11
+ jobs:
12
+ test:
13
+ name: test
14
+ runs-on: ubuntu-latest
15
+ permissions:
16
+ contents: read
17
+ strategy:
18
+ matrix:
19
+ python-version:
20
+ - "3.12"
21
+ - "3.13"
22
+
23
+ steps:
24
+ - uses: actions/checkout@v5
25
+
26
+ - name: Install uv and Python
27
+ uses: astral-sh/setup-uv@v6
28
+ with:
29
+ enable-cache: true
30
+ cache-dependency-glob: "uv.lock"
31
+ python-version: ${{ matrix.python-version }}
32
+
33
+ - name: Install the project
34
+ run: uv sync --frozen --all-extras --dev
35
+
36
+ - name: Run Python tests
37
+ run: uv run pytest
38
+
39
+ - name: Dump docker logs
40
+ if: failure()
41
+ uses: jwalton/gh-docker-logs@v2
42
+
43
+ scan-for-secrets:
44
+ runs-on: ubuntu-latest
45
+ permissions:
46
+ contents: read
47
+
48
+ steps:
49
+ - uses: actions/checkout@v5
50
+ with:
51
+ fetch-depth: 0
52
+
53
+ - name: Run TruffleHog scan
54
+ run: |
55
+ docker run --rm -v ${{ github.workspace }}:/repo \
56
+ trufflesecurity/trufflehog:latest \
57
+ git file:///repo \
58
+ --since-commit main \
59
+ --branch ${{ github.head_ref || github.ref_name }} \
60
+ --github-actions \
61
+ --fail
62
+
63
+ build-package:
64
+ runs-on: ubuntu-latest
65
+ permissions:
66
+ contents: read
67
+
68
+ steps:
69
+ - uses: actions/checkout@v5
70
+ with:
71
+ fetch-depth: 0
72
+
73
+ - name: Install uv and Python
74
+ uses: astral-sh/setup-uv@v6
75
+ with:
76
+ python-version: "3.12"
77
+
78
+ - name: Build package
79
+ run: uv build
80
+
81
+ ci-success:
82
+ needs: [test, scan-for-secrets, build-package]
83
+ runs-on: ubuntu-latest
84
+ permissions:
85
+ contents: read
86
+
87
+ steps:
88
+ - name: CI success
89
+ run: echo "All CI checks passed!"
@@ -0,0 +1,18 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # idscrub specific
13
+ huggingface
14
+
15
+ .pytest_cache
16
+ .ruff_cache
17
+ .coverage
18
+ .DS_Store
@@ -0,0 +1,22 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ # Run the formatter.
5
+ - id: ruff
6
+ name: ruff
7
+ description: "Lint and format code using Ruff"
8
+ entry: make format
9
+ language: system
10
+ types_or: [python, pyi]
11
+ pass_filenames: false
12
+
13
+ # Mandatory internal hooks
14
+ - repo: https://github.com/uktrade/github-standards
15
+ rev: v1.2.1 # update periodically with pre-commit autoupdate
16
+ hooks:
17
+ - id: run-security-scan
18
+ verbose: false
19
+ - id: run-personal-data-scan
20
+ verbose: false
21
+ - id: validate-security-scan
22
+ verbose: false
@@ -0,0 +1 @@
1
+ * @uktrade/ag-data-science
idscrub-1.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Department for Business and Trade
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
idscrub-1.0.1/Makefile ADDED
@@ -0,0 +1,15 @@
1
+ .PHONY: clean precommit test
2
+
3
+ ## Delete all compiled Python files
4
+ clean:
5
+ find . -type f -name "*.py[co]" -delete
6
+ find . -type d -name "__pycache__" -delete
7
+
8
+ ## Reformat, lint
9
+ format:
10
+ uv run ruff format .
11
+ uv run ruff check . --fix
12
+
13
+ ## Run Python tests
14
+ test:
15
+ uv run pytest
idscrub-1.0.1/PKG-INFO ADDED
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.4
2
+ Name: idscrub
3
+ Version: 1.0.1
4
+ Author: Department for Business and Trade
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: ipykernel>=7.1.0
9
+ Requires-Dist: ipywidgets
10
+ Requires-Dist: numpy>=2.3.4
11
+ Requires-Dist: pandas>=2.3.3
12
+ Requires-Dist: phonenumbers>=9.0.18
13
+ Requires-Dist: pip>=25.3
14
+ Requires-Dist: spacy-transformers>=1.3.9
15
+ Requires-Dist: tqdm>=4.67.1
16
+ Requires-Dist: presidio-analyzer
17
+ Requires-Dist: presidio-anonymizer
18
+ Provides-Extra: trf
19
+ Requires-Dist: en_core_web_trf; extra == "trf"
20
+ Dynamic: license-file
21
+
22
+ # idscrub 🧽✨
23
+
24
+ * Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
25
+ * This information may need to be removed prior to further analysis in many cases.
26
+ * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
27
+
28
+ ## Installation
29
+
30
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
31
+
32
+ ```console
33
+ pip install idscrub
34
+ ```
35
+ or with the spaCy transformer model (`en_core_web_trf`) already installed:
36
+
37
+ ```console
38
+ pip install idscrub[trf]
39
+ ```
40
+ ## How to use the code
41
+
42
+ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) for further examples):
43
+
44
+ ```python
45
+ from idscrub import IDScrub
46
+
47
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
48
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
49
+
50
+ print(scrubbed_texts)
51
+
52
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
53
+ ```
54
+ ## Personal data types supported
55
+
56
+ Personal data can be scrubbed either by calling methods with arguments for extra customisation, e.g. `IDScrub.google_phone_numbers(region="GB")`, or by passing string arguments that use default configurations (see above). The method name and its string representation are the same.
57
+
58
+ | Argument | Scrubs |
59
+ |-------------------------|------------------------------------------------------------------------|
60
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
61
+ | `spacy_persons` | Person names detected by spaCy's `en_core_web_trf` (or other user-selected spaCy models) |
62
+ | `huggingface_persons` | Person names detected by user-selected HuggingFace models |
63
+ | `email_addresses` | Email addresses |
64
+ | `titles` | Titles (e.g., Mr., Mrs., Dr.) |
65
+ | `handles` | Social media handles (e.g., @username) |
66
+ | `ip_addresses` | IP addresses |
67
+ | `uk_postcodes` | UK postal codes |
68
+ | `uk_phone_numbers` | UK phone numbers |
69
+ | `google_phone_numbers` | Phone numbers detected by Google’s [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
70
+ | `presidio` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g., names, URLs, NHS numbers, IBAN codes) |
71
+
72
+ ## Considerations before use
73
+
74
+ - You must follow [GDPR guidance](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/the-research-provisions/principles-and-grounds-for-processing/) when processing personal data using this package.
75
+ - This package has been designed as a *first pass* for standardised personal data removal.
76
+ - Users are encouraged to check and confirm outputs and conduct manual reviews where necessary, e.g. when cleaning high risk datasets.
77
+ - It is up to the user to assess whether this removal process needs to be supplemented by other methods for their given dataset and security requirements.
78
+
79
+ ### Input data
80
+
81
+ - This package is designed for text-based documents structured as a list of strings.
82
+ - It performs best when contextual meaning can be inferred from the text.
83
+ - For best results, input text should therefore resemble natural language.
84
+ - **Highly fragmented, informal, technical, or syntactically broken text may reduce detection accuracy and lead to incomplete or incorrect name detection.**
85
+
86
+ ### Biases and evaluation
87
+
88
+ - `idscrub` supports integration with SpaCy and Hugging Face models for name cleaning.
89
+ - These models are state-of-the-art, capable of identifying approximately 90% of named entities, but **may not remove all names**.
90
+ - **Biases present in these models due to their training data may affect performance**. For example:
91
+ - English names may be more reliably identified than names common in other languages.
92
+ - Uncommon or non-Western naming conventions may be missed or misclassified.
93
+
94
+ > [!IMPORTANT]
95
+ > * See [our wiki](https://github.com/uktrade/idscrub/wiki/Evaluation) for further details and notes on our evaluation of `idscrub`.
96
+
97
+ ### Models
98
+
99
+ * Only spaCy's `en_core_web_trf` has been formally evaluated; no Hugging Face models have been evaluated.
100
+ * We therefore recommend that the current default `en_core_web_trf` is used for name scrubbing. **Other models need to be evaluated by the user.**
101
+
102
+ ## Similar Python packages
103
+
104
+ * Similar packages exist for undertaking this task, such as [Presidio](https://microsoft.github.io/presidio/), [Scrubadub](https://github.com/LeapBeyond/scrubadub) and [Sanityze](https://github.com/UBC-MDS/sanityze).
105
+ * Development of `idscrub` was undertaken to:
106
+
107
+ * Bring together different scrubbing methods across the Department for Business and Trade.
108
+ * Adhere to infrastructure requirements.
109
+ * Guarantee future stability and maintainability.
110
+ * Encourage future scrubbing methods to be added collaboratively and transparently.
111
+ * Allow for full flexibility depending on the use case and required outputs.
112
+
113
+ * To leverage the power of other packages, we have added methods that allow you to interact with them. These include: `IDScrub.presidio()` and `IDScrub.google_phone_numbers()`. See the [usage example notebook](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) and method docstrings for further information.
114
+
115
+ ## AI declaration
116
+
117
+ AI has been used in the development of `idscrub`, primarily to develop regular expressions, suggest code refinements and draft documentation.
118
+
119
+ ## Development setup
120
+
121
+ This project is managed by [uv](https://docs.astral.sh/uv/).
122
+
123
+ To install all dependencies for this project, run:
124
+
125
+ ```console
126
+ uv sync --all-extras
127
+ ```
128
+
129
+ If you do not have Python 3.12, run:
130
+
131
+ ```console
132
+ uv python install 3.12
133
+ ```
134
+
135
+ To run tests:
136
+
137
+ ```console
138
+ uv run pytest
139
+ ```
140
+
141
+ or
142
+
143
+ ```console
144
+ make test
145
+ ```
146
+
147
+ ## Author
148
+
149
+ Analytical Data Science, Department for Business and Trade
@@ -0,0 +1,128 @@
1
+ # idscrub 🧽✨
2
+
3
+ * Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
4
+ * This information may need to be removed prior to further analysis in many cases.
5
+ * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
6
+
7
+ ## Installation
8
+
9
+ `idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
10
+
11
+ ```console
12
+ pip install idscrub
13
+ ```
14
+ or with the spaCy transformer model (`en_core_web_trf`) already installed:
15
+
16
+ ```console
17
+ pip install idscrub[trf]
18
+ ```
19
+ ## How to use the code
20
+
21
+ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) for further examples):
22
+
23
+ ```python
24
+ from idscrub import IDScrub
25
+
26
+ scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
27
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
28
+
29
+ print(scrubbed_texts)
30
+
31
+ # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
32
+ ```
33
+ ## Personal data types supported
34
+
35
+ Personal data can be scrubbed either by calling methods with arguments for extra customisation, e.g. `IDScrub.google_phone_numbers(region="GB")`, or by passing string arguments that use default configurations (see above). The method name and its string representation are the same.
36
+
37
+ | Argument | Scrubs |
38
+ |-------------------------|------------------------------------------------------------------------|
39
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
40
+ | `spacy_persons` | Person names detected by spaCy's `en_core_web_trf` (or other user-selected spaCy models) |
41
+ | `huggingface_persons` | Person names detected by user-selected HuggingFace models |
42
+ | `email_addresses` | Email addresses |
43
+ | `titles` | Titles (e.g., Mr., Mrs., Dr.) |
44
+ | `handles` | Social media handles (e.g., @username) |
45
+ | `ip_addresses` | IP addresses |
46
+ | `uk_postcodes` | UK postal codes |
47
+ | `uk_phone_numbers` | UK phone numbers |
48
+ | `google_phone_numbers` | Phone numbers detected by Google’s [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
49
+ | `presidio` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g., names, URLs, NHS numbers, IBAN codes) |
50
+
51
+ ## Considerations before use
52
+
53
+ - You must follow [GDPR guidance](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/the-research-provisions/principles-and-grounds-for-processing/) when processing personal data using this package.
54
+ - This package has been designed as a *first pass* for standardised personal data removal.
55
+ - Users are encouraged to check and confirm outputs and conduct manual reviews where necessary, e.g. when cleaning high risk datasets.
56
+ - It is up to the user to assess whether this removal process needs to be supplemented by other methods for their given dataset and security requirements.
57
+
58
+ ### Input data
59
+
60
+ - This package is designed for text-based documents structured as a list of strings.
61
+ - It performs best when contextual meaning can be inferred from the text.
62
+ - For best results, input text should therefore resemble natural language.
63
+ - **Highly fragmented, informal, technical, or syntactically broken text may reduce detection accuracy and lead to incomplete or incorrect name detection.**
64
+
65
+ ### Biases and evaluation
66
+
67
+ - `idscrub` supports integration with SpaCy and Hugging Face models for name cleaning.
68
+ - These models are state-of-the-art, capable of identifying approximately 90% of named entities, but **may not remove all names**.
69
+ - **Biases present in these models due to their training data may affect performance**. For example:
70
+ - English names may be more reliably identified than names common in other languages.
71
+ - Uncommon or non-Western naming conventions may be missed or misclassified.
72
+
73
+ > [!IMPORTANT]
74
+ > * See [our wiki](https://github.com/uktrade/idscrub/wiki/Evaluation) for further details and notes on our evaluation of `idscrub`.
75
+
76
+ ### Models
77
+
78
+ * Only spaCy's `en_core_web_trf` has been formally evaluated; no Hugging Face models have been evaluated.
79
+ * We therefore recommend that the current default `en_core_web_trf` is used for name scrubbing. **Other models need to be evaluated by the user.**
80
+
81
+ ## Similar Python packages
82
+
83
+ * Similar packages exist for undertaking this task, such as [Presidio](https://microsoft.github.io/presidio/), [Scrubadub](https://github.com/LeapBeyond/scrubadub) and [Sanityze](https://github.com/UBC-MDS/sanityze).
84
+ * Development of `idscrub` was undertaken to:
85
+
86
+ * Bring together different scrubbing methods across the Department for Business and Trade.
87
+ * Adhere to infrastructure requirements.
88
+ * Guarantee future stability and maintainability.
89
+ * Encourage future scrubbing methods to be added collaboratively and transparently.
90
+ * Allow for full flexibility depending on the use case and required outputs.
91
+
92
+ * To leverage the power of other packages, we have added methods that allow you to interact with them. These include: `IDScrub.presidio()` and `IDScrub.google_phone_numbers()`. See the [usage example notebook](https://github.com/uktrade/idscrub/blob/main/notebooks/basic_usage.ipynb) and method docstrings for further information.
93
+
94
+ ## AI declaration
95
+
96
+ AI has been used in the development of `idscrub`, primarily to develop regular expressions, suggest code refinements and draft documentation.
97
+
98
+ ## Development setup
99
+
100
+ This project is managed by [uv](https://docs.astral.sh/uv/).
101
+
102
+ To install all dependencies for this project, run:
103
+
104
+ ```console
105
+ uv sync --all-extras
106
+ ```
107
+
108
+ If you do not have Python 3.12, run:
109
+
110
+ ```console
111
+ uv python install 3.12
112
+ ```
113
+
114
+ To run tests:
115
+
116
+ ```console
117
+ uv run pytest
118
+ ```
119
+
120
+ or
121
+
122
+ ```console
123
+ make test
124
+ ```
125
+
126
+ ## Author
127
+
128
+ Analytical Data Science, Department for Business and Trade
@@ -0,0 +1,117 @@
1
+ # Security Checklist
2
+
3
+ This checklist is designed to make it easier to improve the security posture of a GitHub repository.
4
+
5
+ - It is mandatory for public repositories.
6
+ - This checklist must be copied over to the root of the repository.
7
+ - The repository steward is responsible for populating the checklist, or at least approving the related pull request.
8
+ - Any feedback should be shared with the GitHub Security working group.
9
+
10
+ ## Checklist
11
+
12
+ - [x] [Setup the pre-commit hook framework](#setup-the-pre-commit-hook-framework)
13
+ - [x] [Setup custom properties on the repository](#setup-custom-properties-on-the-repository)
14
+ - [x] [Apply the correct github security policy](#apply-the-correct-github-security-policy)
15
+ - [x] [Ensure CODEOWNERS file exists](#ensure-codeowners-file-exists)
16
+ - [x] [Copy the SECURITY_CHECKLIST.md file](#copy-the-security_checklistmd-file)
17
+ - [x] [Review the GitHub CI/CD overview](#review-the-github-cicd-overview)
18
+ - [x] [Review the GitHub Safety Tips](#review-github-safety-tips)
19
+ - [x] [Add Steward to Repository access](#add-at-least-one-steward-to-repository-access)
20
+ - [x] [Review and limit maintainers with admin rights to the strict minimum](#review-and-limit-maintainers-with-admin-rights-to-the-strict-minimum)
21
+ - [x] [Review the Pull Request template](#review-pull-request-template)
22
+ - [x] [Review the SECURITY.md policy](#review-securitymd-policy)
23
+
24
+ ## Setup the pre-commit hook framework
25
+
26
+ Several uktrade repositories already make use of the pre-commit framework for flagging code quality issues before pushing. Even in the repositories that have the pre-commit framework installed, an individual engineer can still choose not to configure the commit hooks, or can skip them entirely using the `--no-verify` cli argument.
27
+
28
+ As part of the go live process, each engineer making changes to the repository being reopened will be required to install the organisation wide pre-commit hooks locally. When a PR is opened, an organisation level github action will run to confirm the pre-commit hooks ran on the engineer's machine and will block any PRs that have not run these hooks.
29
+
30
+ Instructions have been added to the [dbt hooks repository](https://github.com/uktrade/github-standards/blob/main/README.md#usage) to provide guidance on adding these organisation wide pre-commit hooks to an individual repository
31
+
32
+ ## Setup custom properties on the repository
33
+
34
+ A set of github tags have been created at an organisation level, these must be applied to a repository to allow organisation level github actions to run on each pull request.
35
+
36
+ ### Mandatory custom properties
37
+
38
+ - `reusable_workflow_opt_in`: This one has to be applied and set to `true` to allow this repository to apply the correct organisation branch protection ruleset and run the necessary github workflows on each PR
39
+ - `ddat_portfolio`: The portfolio inside DDAT this repository belongs to. If your portfolio is missing, this can be added by raising an SRE ticket.
40
+
41
+ ### Optional custom properties
42
+
43
+ - `is_docker`: If this repository builds a docker image, this tag should be added to run docker related github workflows
44
+ - `language`: All languages used by this repository should be selected, and github workflows will run with dedicated checks on that language.
45
+
46
+ ## Apply the correct github security policy
47
+
48
+ To add the new security policy, follow these instructions:
49
+
50
+ 1. As an organisation administrator, navigate to the [security config page](https://github.com/organizations/uktrade/settings/security_products).
51
+ 1. Scroll down to the **Apply configurations** section, and enter the name of the repository to be made public in the filter input field
52
+ 1. Use the checkbox next to the results list to select all repositories being made public, then use the **Apply configuration** button to select the **Default DBT security** configuration
53
+ 1. A confirmation modal will appear displaying a summary of the action being made. Click the apply button
54
+ 1. In the repository that has had the new policy applied, navigate to the **Advanced Security** page in the repository settings. At the top of the page there should be a banner message **Modifications to some settings have been blocked by organization administrators.**
55
+
56
+ ### Optional: Setup CodeQL to allow PRs from repository forks
57
+
58
+ For most repositories, the default CodeQL configuration applied by the **Default DBT security** policy will be sufficient. However, this default configuration does not currently support scanning PRs raised from a fork of a repository. If your repository needs to accept PRs from a fork, you must follow these steps to switch to the advanced CodeQL setup:
59
+
60
+ 1. Open the GitHub settings page, and navigate to the Advanced Security section using the left hand menu
61
+ 1. Scroll down to the Code Scanning section, under the Tools sub-section there will be an item for CodeQL analysis
62
+ 1. Click the ... button next to Default setup text, then choose the Switch to advanced option from the menu
63
+ 1. On the popup, click the Disable CodeQL button. Although you are disabling CodeQL, there is still a branch protection rule in place that blocks a PR unless a CodeQL scan is detected. Disabling here will not allow PRs to be merged
64
+ 1. The GitHub online editor will open to create a new file called codeql.yml in your repo, and the contents of this file will be prefilled with the languages CodeQL has detected in your repo. You can modify the contents of this file if needed, however you must leave the workflow name as `CodeQL Advanced`
65
+ 1. Once happy with the workflow file contents, click the green Commit changes button to trigger a PR to merge this into the main branch
66
+ 1. Approve and merge the PR with this workflow file. Once merged, the CodeQL scan will perform an initial scan that can take a while but you can track the progress by viewing the Actions tab for your repository
67
+
68
+ ## Ensure CODEOWNERS file exists
69
+
70
+ The organisation rulesets require a CODEOWNERS file to be present in the repository. If you don't already have one of these, github has produced [documentation explaining](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) what they are and why they are used.
71
+
72
+ ## Copy the SECURITY_CHECKLIST.md file
73
+
74
+ To allow tracking of repositories that have successfully completed the reopening process, this file must be copied to the root of your repository and each of the items in the Checklist marked as completed
75
+
76
+ ## Review the GitHub CI/CD overview
77
+
78
+ Internal contributors to the repository should review the CI/CD overview below
79
+ ![CI/CD overview](assets/CI-CD%20pipeline.svg)
80
+
81
+ ## Review GitHub Safety Tips
82
+
83
+ Internal contributors to the repository should review the [GitHub Safety Tips](https://uktrade.atlassian.net/wiki/x/n4AEKQE)
84
+
85
+ ## Add at least one steward to repository access
86
+
87
+ To ensure correct governance of a repository, at least one steward must be added. This will usually be the most senior engineer on the team. To add a steward to a repository:
88
+
89
+ 1. Open the `Collaborators and teams` settings page. The url for this is `https://github.com/uktrade/REPO_NAME/settings/access`
90
+ 1. Use the `Add people` button to open the people finder autocomplete box.
91
+ 1. Find and click the user who is going to be a steward
92
+ 1. On the Choose a role page, select the `Steward` role.
93
+ 1. Repeat for any additional users who are going to be a steward
94
+
95
+ ## Review and limit maintainers with admin rights to the strict minimum
96
+
97
+ You should review who has been assigned the github `admin` role. The `write` role is sufficient to allow team members to commit changes and raise pull requests
98
+
99
+ ## Review Pull Request template
100
+
101
+ If your repository does not already contain a pull_request_template.md file, by default you will inherit the template from this repository. If you are already using your own template, you should add this section to remind reviewers they should be ensuring no secret values are visible
102
+
103
+ ```
104
+ ## Reviewer Checklist
105
+
106
+ - [ ] I have reviewed the PR and ensured no secret values are present
107
+ ```
108
+
109
+ ## Review SECURITY.md policy
110
+
111
+ This repository contains the SECURITY.md file, which is inherited by all repositories in the uktrade organisation account. This file should be read and understood by the repository steward, and discussed with the team to ensure all engineers understand the tooling that has been put in place
112
+
113
+ ## More information
114
+
115
+ For more information about GitHub security standards, please refer [to this link](https://dbis.sharepoint.com/:w:/r/sites/DDaTDirectorate/Shared%20Documents/Work%20-%20GitHub%20Security/Github%20Security%20Framework/Guidelines%20and%20Policies/GitHub%20Security%20Standards%20v0.5.docx?d=wb29cd9b99ca042deb5c0cd8d670966d9&csf=1&web=1&e=6ITbnL)
116
+
117
+ For more details about the security features please refer to the [GitHub Standards](https://github.com/uktrade/github-standards) repo.
@@ -0,0 +1 @@
1
+ from .scrub import IDScrub as IDScrub
@@ -0,0 +1,10 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ PROJECT_DIR = Path(__file__).resolve().parents[1]
5
+
6
+ DOWNLOAD_DIR = Path.cwd()
7
+
8
+ DATA_HOME = os.path.join(PROJECT_DIR, "data")
9
+
10
+ NOTEBOOKS_HOME = os.path.join(PROJECT_DIR, "notebooks")