address-toolkit 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. address_toolkit-1.0.0/.github/dependabot.yml +15 -0
  2. address_toolkit-1.0.0/.github/workflows/CODEOWNERS +1 -0
  3. address_toolkit-1.0.0/.github/workflows/python-package.yml +38 -0
  4. address_toolkit-1.0.0/.github/workflows/python-publish.yml +57 -0
  5. address_toolkit-1.0.0/AI.gitignore +23 -0
  6. address_toolkit-1.0.0/LICENSE +21 -0
  7. address_toolkit-1.0.0/PKG-INFO +90 -0
  8. address_toolkit-1.0.0/README.md +74 -0
  9. address_toolkit-1.0.0/address_toolkit/__init__.py +0 -0
  10. address_toolkit-1.0.0/address_toolkit/cleaning/__init__.py +12 -0
  11. address_toolkit-1.0.0/address_toolkit/cleaning/cleaning.py +431 -0
  12. address_toolkit-1.0.0/address_toolkit/contextualising/__init__.py +5 -0
  13. address_toolkit-1.0.0/address_toolkit/contextualising/contextualising.py +127 -0
  14. address_toolkit-1.0.0/address_toolkit/extracting/__init__.py +3 -0
  15. address_toolkit-1.0.0/address_toolkit/extracting/extracting.py +204 -0
  16. address_toolkit-1.0.0/address_toolkit/resources/__init__.py +43 -0
  17. address_toolkit-1.0.0/address_toolkit/resources/resources.py +55725 -0
  18. address_toolkit-1.0.0/address_toolkit/tests/__init__.py +0 -0
  19. address_toolkit-1.0.0/address_toolkit/tests/test_cleaning.py +183 -0
  20. address_toolkit-1.0.0/address_toolkit/tests/test_contextualising.py +36 -0
  21. address_toolkit-1.0.0/address_toolkit/tests/test_extracting.py +52 -0
  22. address_toolkit-1.0.0/address_toolkit/tests/test_imports.py +105 -0
  23. address_toolkit-1.0.0/address_toolkit/tests/test_validating.py +54 -0
  24. address_toolkit-1.0.0/address_toolkit/utilities/__init__.py +12 -0
  25. address_toolkit-1.0.0/address_toolkit/utilities/utilities.py +253 -0
  26. address_toolkit-1.0.0/address_toolkit/validating/__init__.py +7 -0
  27. address_toolkit-1.0.0/address_toolkit/validating/validating.py +155 -0
  28. address_toolkit-1.0.0/address_toolkit/workflows/__init__.py +7 -0
  29. address_toolkit-1.0.0/address_toolkit/workflows/workflows.py +136 -0
  30. address_toolkit-1.0.0/address_toolkit.egg-info/PKG-INFO +90 -0
  31. address_toolkit-1.0.0/address_toolkit.egg-info/SOURCES.txt +39 -0
  32. address_toolkit-1.0.0/address_toolkit.egg-info/dependency_links.txt +1 -0
  33. address_toolkit-1.0.0/address_toolkit.egg-info/requires.txt +2 -0
  34. address_toolkit-1.0.0/address_toolkit.egg-info/top_level.txt +4 -0
  35. address_toolkit-1.0.0/analysis/analysis.ipynb +1396 -0
  36. address_toolkit-1.0.0/package_maintenance/README.md +31 -0
  37. address_toolkit-1.0.0/package_maintenance/resource_maintenance.ipynb +289 -0
  38. address_toolkit-1.0.0/pyproject.toml +27 -0
  39. address_toolkit-1.0.0/requirements.txt +2 -0
  40. address_toolkit-1.0.0/setup.cfg +4 -0
  41. address_toolkit-1.0.0/tutorial.ipynb +534 -0
@@ -0,0 +1,15 @@
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "pip" # See documentation for possible values
9
+ directory: "/" # Location of package manifests
10
+ schedule:
11
+ interval: "weekly"
12
+ - package-ecosystem: "github-actions" # See documentation for possible values
13
+ directory: "/" # Location of package manifests
14
+ schedule:
15
+ interval: "weekly"
@@ -0,0 +1 @@
1
+ * @DanHarrisONS @BenMoscropONS
@@ -0,0 +1,38 @@
1
+ # This workflow will install Python dependencies and run the tests with a variety of Python versions
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Python Package Version Testing
5
+
6
+ on:
7
+ push:
8
+ branches: [ "main" ]
9
+ pull_request:
10
+ branches: [ "main" ]
11
+
12
+ jobs:
13
+ build:
14
+
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.9", "3.10"]
20
+
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ uses: actions/setup-python@v3
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ - name: Install dependencies
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ python -m pip install flake8 pytest
31
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32
+ - name: Test with pytest
33
+ uses: actions/setup-java@v4
34
+ with:
35
+ java-version: '8'
36
+ distribution: temurin
37
+ - run: |
38
+ pytest
@@ -0,0 +1,57 @@
1
+ # This workflow will upload a Python Package to PyPI when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ release-build:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.10"
28
+
29
+ - name: Build release distributions
30
+ run: |
31
+ # NOTE: put your own distribution build steps here.
32
+ python -m pip install build
33
+ python -m build
34
+
35
+ - name: Upload distributions
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: release-dists
39
+ path: dist/
40
+
41
+ pypi-publish:
42
+ runs-on: ubuntu-latest
43
+ needs:
44
+ - release-build
45
+ permissions:
46
+ id-token: write
47
+ steps:
48
+ - name: Retrieve release distributions
49
+ uses: actions/download-artifact@v4
50
+ with:
51
+ name: release-dists
52
+ path: dist/
53
+
54
+ - name: Publish release distributions to PyPI
55
+ uses: pypa/gh-action-pypi-publish@release/v1
56
+ with:
57
+ packages-dir: dist/
@@ -0,0 +1,23 @@
1
+ # Ignore cache files
2
+ .cache/
3
+ .pip/
4
+ .local/
5
+ .config/
6
+ .cache/
7
+
8
+ # Ignore Python virtual environments
9
+ venv/
10
+ .venv/
11
+
12
+ # Ignore Jupyter notebook checkpoints
13
+ .ipynb_checkpoints/
14
+
15
+ # Ignore other system files
16
+ .DS_Store
17
+ Thumbs.db
18
+
19
+ # Ignore SSH keys and config
20
+ .ssh/
21
+
22
+ # Ignore Test Files
23
+ test_files/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Crown Copyright
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: address-toolkit
3
+ Version: 1.0.0
4
+ Summary: Toolkit for cleaning, validating, extracting and contextualising GB address data with PySpark.
5
+ Author-email: Dan Harris <Dan.Harris@ONS.gov.uk>, Ben Moscrop <Ben.Moscrop@ONS.gov.uk>, Stephen Rowlands <Stephen.Rowlands@ONS.gov.uk>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ONSdigital/address-toolkit
8
+ Project-URL: Documentation, https://github.com/ONSdigital/address-toolkit/blob/main/README.md
9
+ Keywords: address,postcode,address-cleaning,address-validating,address-extracting,address-contextualising
10
+ Requires-Python: <3.11,>=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: pyspark==3.3.2
14
+ Requires-Dist: RapidFuzz==3.13.0
15
+ Dynamic: license-file
16
+
17
+ # Address Toolkit
18
+
19
+ ![Python](https://img.shields.io/badge/Python-3.10-blue)
20
+ ![License](https://img.shields.io/badge/License-MIT-orange)
21
+
22
+ ## Introduction
23
+ The address toolkit package supports the cleaning and processing of address data registered as a `pyspark.sql.DataFrame`.
24
+ The package includes functions for cleaning, validating, extracting and contextualising addresses and address components.
25
+ Additionally, `workflows` have been created as an 'out-of-the-box' application combining the functions across `cleaning`, `validating` and `extracting`.
26
+
27
+ To install the package, run:
28
+ ```
29
+ pip install address-toolkit
30
+ ```
31
+
32
+ To register a Spark DataFrame from a CSV, run the following code:
33
+
34
+ ```
35
+ from pyspark.sql import SparkSession
36
+
37
+
38
+ spark = SparkSession.builder.master("local").appName("test").getOrCreate()
39
+ df = spark.read.csv("test.csv")
40
+ ```
41
+
42
+ A recommended first step in using this package is to apply the `clean_punctuation` function from `address_toolkit.cleaning`.
43
+
44
+ ```
45
+ from address_toolkit.cleaning import clean_punctuation
46
+
47
+ df = clean_punctuation(df, 'address_column', create_flag = True, overwrite = True)
48
+ ```
49
+
50
+ ## Main Package Contents:
51
+
52
+ | Folder Name | Description | Includes |
53
+ | ------------- | ------------- | ------------- |
54
+ | `cleaning` | Contains functions to clean addresses | `clean_punctuation`, `denoise_addresses`, `deduplicate_addresses`, `deduplicate_postcodes`, `rectify_postcodes`, `standardise_street_types`, `prettify_addresses` |
55
+ | `validating` | Contains functions to validate addresses | `validate_from_list`, `validate_from_regex`, `validate_postcodes` |
56
+ | `extracting` | Contains functions to extract address components | `extract_from_list`, `extract_from_regex`, `extract_postcodes` |
57
+ | `contextualising` | Contains functions to contextualise addresses | `contextualise_from_component` |
58
+ | `workflows` | Contains functions to streamline processing | `clean_addresses`, `validate_addresses`, `extract_address_components` |
59
+
60
+ ## Supplementary Package Contents (Resources):
61
+
62
+ | Resource | Includes |
63
+ | ------------- | ------------- |
64
+ | UK Postcode Regex | `postcode_regex` |
65
+ | Unit Address Level Regex | `flat_regex`, `room_regex`, `unit_regex`, `block_regex`, `apartment_regex`, `floor_regex` |
66
+ | Noise Regex | `consecutive_letters_regex` |
67
+ | Miscellaneous Regex | `misc_numbers_regex`, `standalone_numbers_regex`, `txt_before_numbers_regex`, `end_address_numbers_regex`, `start_address_numbers_regex`, `end_address_identifier_regex` |
68
+ | Address Component Lists | `town_list`, `city_list`, `village_list`, `hamlet_list`, `suburb_list`, `bay_list`, `place_list`, `district_list`, `county_list`, `allowed_country_list`, `disallowed_country_list` |
69
+ | Keyword Component Lists | `contextual_keywords`, `misc_keywords_list`, `misc_special_keywords` |
70
+ | Address Component Lookups | `town_lookup`, `village_lookup`, `bay_lookup`, `hamlet_lookup`, `suburb_lookup` |
71
+
72
+ Note:
73
+ For unit address level RegEx patterns, alternatives (e.g. `flat_regex_alternative`) are available which are less 'strict' in their matching.
74
+
75
+ Address Component lists and lookups have been created from [Ordnance Survey Open Names](https://osdatahub.os.uk/data/downloads/open/OpenNames).
76
+ Contains OS data © Crown copyright and database right 2026.
77
+ Licensed under the Open Government Licence v3.0. This dataset is updated quarterly in January, April, July and October.
78
+
79
+ Postcode processing is based upon the [Annex C - Valid post code format](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/611951/Appendix_C_ILR_2017_to_2018_v1_Published_28April17.pdf) specification.
80
+
81
+ ## Example Usage
82
+ See `tutorial.ipynb` for full use of all the functions within `cleaning`, `validating`, `extracting`, `contextualising` and `workflows`.
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
@@ -0,0 +1,74 @@
1
+ # Address Toolkit
2
+
3
+ ![Python](https://img.shields.io/badge/Python-3.10-blue)
4
+ ![License](https://img.shields.io/badge/License-MIT-orange)
5
+
6
+ ## Introduction
7
+ The address toolkit package supports the cleaning and processing of address data registered as a `pyspark.sql.DataFrame`.
8
+ The package includes functions for cleaning, validating, extracting and contextualising addresses and address components.
9
+ Additionally, `workflows` have been created as an 'out-of-the-box' application combining the functions across `cleaning`, `validating` and `extracting`.
10
+
11
+ To install the package, run:
12
+ ```
13
+ pip install address-toolkit
14
+ ```
15
+
16
+ To register a Spark DataFrame from a CSV, run the following code:
17
+
18
+ ```
19
+ from pyspark.sql import SparkSession
20
+
21
+
22
+ spark = SparkSession.builder.master("local").appName("test").getOrCreate()
23
+ df = spark.read.csv("test.csv")
24
+ ```
25
+
26
+ A recommended first step in using this package is to apply the `clean_punctuation` function from `address_toolkit.cleaning`.
27
+
28
+ ```
29
+ from address_toolkit.cleaning import clean_punctuation
30
+
31
+ df = clean_punctuation(df, 'address_column', create_flag = True, overwrite = True)
32
+ ```
33
+
34
+ ## Main Package Contents:
35
+
36
+ | Folder Name | Description | Includes |
37
+ | ------------- | ------------- | ------------- |
38
+ | `cleaning` | Contains functions to clean addresses | `clean_punctuation`, `denoise_addresses`, `deduplicate_addresses`, `deduplicate_postcodes`, `rectify_postcodes`, `standardise_street_types`, `prettify_addresses` |
39
+ | `validating` | Contains functions to validate addresses | `validate_from_list`, `validate_from_regex`, `validate_postcodes` |
40
+ | `extracting` | Contains functions to extract address components | `extract_from_list`, `extract_from_regex`, `extract_postcodes` |
41
+ | `contextualising` | Contains functions to contextualise addresses | `contextualise_from_component` |
42
+ | `workflows` | Contains functions to streamline processing | `clean_addresses`, `validate_addresses`, `extract_address_components` |
43
+
44
+ ## Supplementary Package Contents (Resources):
45
+
46
+ | Resource | Includes |
47
+ | ------------- | ------------- |
48
+ | UK Postcode Regex | `postcode_regex` |
49
+ | Unit Address Level Regex | `flat_regex`, `room_regex`, `unit_regex`, `block_regex`, `apartment_regex`, `floor_regex` |
50
+ | Noise Regex | `consecutive_letters_regex` |
51
+ | Miscellaneous Regex | `misc_numbers_regex`, `standalone_numbers_regex`, `txt_before_numbers_regex`, `end_address_numbers_regex`, `start_address_numbers_regex`, `end_address_identifier_regex` |
52
+ | Address Component Lists | `town_list`, `city_list`, `village_list`, `hamlet_list`, `suburb_list`, `bay_list`, `place_list`, `district_list`, `county_list`, `allowed_country_list`, `disallowed_country_list` |
53
+ | Keyword Component Lists | `contextual_keywords`, `misc_keywords_list`, `misc_special_keywords` |
54
+ | Address Component Lookups | `town_lookup`, `village_lookup`, `bay_lookup`, `hamlet_lookup`, `suburb_lookup` |
55
+
56
+ Note:
57
+ For unit address level RegEx patterns, alternatives (e.g. `flat_regex_alternative`) are available which are less 'strict' in their matching.
58
+
59
+ Address Component lists and lookups have been created from [Ordnance Survey Open Names](https://osdatahub.os.uk/data/downloads/open/OpenNames).
60
+ Contains OS data © Crown copyright and database right 2026.
61
+ Licensed under the Open Government Licence v3.0. This dataset is updated quarterly in January, April, July and October.
62
+
63
+ Postcode processing is based upon the [Annex C - Valid post code format](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/611951/Appendix_C_ILR_2017_to_2018_v1_Published_28April17.pdf) specification.
64
+
65
+ ## Example Usage
66
+ See `tutorial.ipynb` for full use of all the functions within `cleaning`, `validating`, `extracting`, `contextualising` and `workflows`.
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
File without changes
@@ -0,0 +1,12 @@
1
+ from address_toolkit.cleaning.cleaning import (
2
+ clean_punctuation,
3
+ deduplicate_addresses,
4
+ deduplicate_postcodes,
5
+ denoise_addresses,
6
+ prettify_addresses,
7
+ rectify_postcodes,
8
+ standardise_street_types,
9
+ )
10
+
11
+ __all__ = ['clean_punctuation', 'denoise_addresses', 'deduplicate_addresses', 'deduplicate_postcodes',
12
+ 'rectify_postcodes', 'standardise_street_types','prettify_addresses']