ugrc-sweeper 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. ugrc-sweeper-2.0.1/LICENSE +9 -0
  2. ugrc-sweeper-2.0.1/MANIFEST.in +3 -0
  3. ugrc-sweeper-2.0.1/PKG-INFO +160 -0
  4. ugrc-sweeper-2.0.1/pyproject.toml +10 -0
  5. ugrc-sweeper-2.0.1/setup.cfg +4 -0
  6. ugrc-sweeper-2.0.1/setup.py +65 -0
  7. ugrc-sweeper-2.0.1/src/sweeper/__init__.py +0 -0
  8. ugrc-sweeper-2.0.1/src/sweeper/__main__.py +207 -0
  9. ugrc-sweeper-2.0.1/src/sweeper/address_parser.py +246 -0
  10. ugrc-sweeper-2.0.1/src/sweeper/backup.py +54 -0
  11. ugrc-sweeper-2.0.1/src/sweeper/config.py +36 -0
  12. ugrc-sweeper-2.0.1/src/sweeper/report.py +181 -0
  13. ugrc-sweeper-2.0.1/src/sweeper/street_types.json +321 -0
  14. ugrc-sweeper-2.0.1/src/sweeper/sweepers/UseLimitations.html +10 -0
  15. ugrc-sweeper-2.0.1/src/sweeper/sweepers/__init__.py +0 -0
  16. ugrc-sweeper-2.0.1/src/sweeper/sweepers/addresses.py +71 -0
  17. ugrc-sweeper-2.0.1/src/sweeper/sweepers/base.py +17 -0
  18. ugrc-sweeper-2.0.1/src/sweeper/sweepers/duplicates.py +128 -0
  19. ugrc-sweeper-2.0.1/src/sweeper/sweepers/empties.py +66 -0
  20. ugrc-sweeper-2.0.1/src/sweeper/sweepers/invalids.py +19 -0
  21. ugrc-sweeper-2.0.1/src/sweeper/sweepers/metadata.py +291 -0
  22. ugrc-sweeper-2.0.1/src/sweeper/utilities.py +27 -0
  23. ugrc-sweeper-2.0.1/src/sweeper/workspace_info.py +123 -0
  24. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/PKG-INFO +160 -0
  25. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/SOURCES.txt +28 -0
  26. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/dependency_links.txt +1 -0
  27. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/entry_points.txt +3 -0
  28. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/not-zip-safe +1 -0
  29. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/requires.txt +14 -0
  30. ugrc-sweeper-2.0.1/src/ugrc_sweeper.egg-info/top_level.txt +1 -0
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) UGRC contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,3 @@
1
+ include src/sweeper/*.json
2
+ include src/sweeper/sweepers/*.html
3
+ exclude config.json
@@ -0,0 +1,160 @@
1
+ Metadata-Version: 2.1
2
+ Name: ugrc-sweeper
3
+ Version: 2.0.1
4
+ Summary: CLI tool for making good data
5
+ Home-page: https://github.com/agrc/sweeper
6
+ Author: UGRC
7
+ Author-email: ugrc-developers@utah.gov
8
+ License: MIT
9
+ Platform: UNKNOWN
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: Unix
14
+ Classifier: Operating System :: POSIX
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Topic :: Utilities
19
+ Requires-Python: >=3
20
+ Description-Content-Type: text/markdown
21
+ Provides-Extra: tests
22
+ License-File: LICENSE
23
+
24
+ # ugrc-sweeper [![PyPI version](https://badge.fury.io/py/ugrc-sweeper.svg)](https://badge.fury.io/py/ugrc-sweeper)[![Push Events](https://github.com/agrc/sweeper/actions/workflows/push.yml/badge.svg)](https://github.com/agrc/sweeper/actions/workflows/push.yml)
25
+
26
+ The data cleaning service.
27
+
28
+ ![sweeper_sm](https://user-images.githubusercontent.com/325813/90411835-91c4c080-e069-11ea-9d03-f3e60421b835.png)
29
+
30
+ ## Available Sweepers
31
+
32
+ ### Addresses
33
+
34
+ Checks that addresses have minimum required parts and optionally normalizes them.
35
+
36
+ ### Duplicates
37
+
38
+ Checks for duplicate features.
39
+
40
+ ### Empties
41
+
42
+ Checks for empty geometries.
43
+
44
+ ### Metadata
45
+
46
+ Checks to make sure that the metadata meets [the Basic SGID Metadata Requirements](https://gis.utah.gov/about/policy/metadata/#basic-sgid-metadata).
47
+
48
+ #### Tags
49
+
50
+ Checks to make sure that existing tags are cased appropriately. This means that they are title-cased, other than known abbreviations (e.g. UGRC, BLM) and articles (e.g. a, the, of).
51
+
52
+ This check also verifies that the data set contains a tag that matches the database name (e.g. `SGID`) and the schema (e.g. `Cadastre`).
53
+
54
+ `--try-fix` adds missing required tags and title-cases any existing tags.
55
+
56
+ #### Summary
57
+
58
+ Checks to make sure that the summary is less than 2048 characters (a limitation of AGOL) and that it is shorter than the description.
59
+
60
+ #### Description
61
+
62
+ Checks to make sure that the description contains a link to a data page on gis.utah.gov.
63
+
64
+ #### Use Limitations
65
+
66
+ Checks to make sure that the text in this section matches the [official text for UGRC](src/sweeper/sweepers/UseLimitations.html).
67
+
68
+ `--try-fix` updates the text to match the official text.
69
+
70
+ ## Parsing Addresses
71
+
72
+ This project contains a module that can be used as a standalone address parser, `sweeper.address_parser`. This allows developers to take advantage of sweeper's advanced address parsing and normalization without having to run the entire sweeper process.
73
+
74
+ ### Usage Example
75
+
76
+ ```python
77
+ from sweeper.address_parser import Address
78
+
79
+ address = Address('123 South Main Street')
80
+ print(address)
81
+
82
+ '''
83
+ --> Parsed Address:
84
+ {'address_number': '123',
85
+ 'normalized': '123 S MAIN ST',
86
+ 'prefix_direction': 'S',
87
+ 'street_name': 'MAIN',
88
+ 'street_type': 'ST'}
89
+ '''
90
+ ```
91
+
92
+ ### Available Address class properties
93
+
94
+ All properties default to None if there is no parsed value.
95
+
96
+ `address_number`
97
+
98
+ `address_number_suffix`
99
+
100
+ `prefix_direction`
101
+
102
+ `street_name`
103
+
104
+ `street_direction`
105
+
106
+ `street_type`
107
+
108
+ `unit_type`
109
+
110
+ `unit_id`
111
+ If no `unit_type` is found, this property is prefixed with `#` (e.g. `# 3`). If `unit_type` is found, `#` is stripped from this property.
112
+
113
+ `city`
114
+
115
+ `zip_code`
116
+
117
+ `po_box`
118
+ The PO Box if a po-box-type address was entered (e.g. `po_box` would be `1` for `p.o. box 1`).
119
+
120
+ `normalized`
121
+ A normalized string representing the entire address that was passed into the constructor. PO Boxes are normalized in this format `PO BOX <number>`.
122
+
123
+ ## Installation (requires Pro 2.7+)
124
+
125
+ <!-- Current conda install arcpy -c esri seems to be wonky; just clone to be safe -->
126
+
127
+ 1. clone arcgis conda environment
128
+ - `conda create --name sweeper --clone arcgispro-py3`
129
+ 1. activate environment
130
+ - `activate sweeper`
131
+ 1. install sweeper
132
+ - `pip install ugrc-sweeper`
133
+ 1. Optionally duplicate `config.sample.json` as `config.json` in the folder where you will run sweeper.
134
+
135
+ > [!CAUTION]
136
+ > This is required for the following functions:
137
+ >
138
+ > - `--scheduled` argument (required for sending emails)
139
+ > - `--change-detect` argument
140
+ > - using user-specific connection files via the `CONNECTIONS_FOLDER` config value
141
+
142
+ ## Exclusions
143
+
144
+ Tables can be skipped by adding values to the `EXCLUSIONS.<sweeper_key>` config array. These values are matched against table names using [fnmatch](https://docs.python.org/3/library/fnmatch.html#fnmatch.fnmatch). Note that these do not apply when using the `--table-name` argument.
145
+
146
+ ## Development
147
+
148
+ 1. clone arcgis conda environment
149
+ - `conda create --name sweeper --clone arcgispro-py3`
150
+ 1. activate environment
151
+ - `activate sweeper`
152
+ 1. install required dependencies to work on sweeper
153
+ - `pip install -e ".[tests]"`
154
+ 1. `test_metadata.py` uses a SQL database that needs to be restored via `src/sweeper/tests/data/Sweeper.bak` to your local SQL Server.
155
+ 1. run sweeper: `sweeper`
156
+ 1. test: `pytest`
157
+ 1. lint: `ruff check .`
158
+ 1. format: `ruff format .`
159
+
160
+
@@ -0,0 +1,10 @@
1
+ [tool.ruff]
2
+ line-length = 120
3
+ [tool.ruff.lint]
4
+ ignore = ["E501"]
5
+ [tool.pytest.ini_options]
6
+ minversion = "6.0"
7
+ testpaths = ["tests", "src"]
8
+ norecursedirs = [".env", "data", "maps", ".github", ".vscode"]
9
+ console_output_style = "count"
10
+ addopts = "--cov-branch --cov=sweeper --cov-report term --cov-report xml:cov.xml --instafail -p no:faulthandler"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ setup.py
5
+ A module that installs sweeper as a module
6
+ """
7
+
8
+ import glob
9
+ from os.path import basename, splitext
10
+ from pathlib import Path
11
+
12
+ from setuptools import find_packages, setup
13
+
14
+ setup(
15
+ name="ugrc-sweeper",
16
+ version="2.0.1",
17
+ license="MIT",
18
+ description="CLI tool for making good data",
19
+ long_description=(Path(__file__).parent / "readme.md").read_text(),
20
+ long_description_content_type="text/markdown",
21
+ author="UGRC",
22
+ author_email="ugrc-developers@utah.gov",
23
+ url="https://github.com/agrc/sweeper",
24
+ packages=find_packages("src"),
25
+ package_dir={"": "src"},
26
+ py_modules=[splitext(basename(i))[0] for i in glob.glob("src/*.py")],
27
+ python_requires=">=3",
28
+ include_package_data=True,
29
+ zip_safe=False,
30
+ classifiers=[
31
+ "Development Status :: 5 - Production/Stable",
32
+ "Intended Audience :: Developers",
33
+ "License :: OSI Approved :: MIT License",
34
+ "Operating System :: Unix",
35
+ "Operating System :: POSIX",
36
+ "Operating System :: Microsoft :: Windows",
37
+ "Programming Language :: Python",
38
+ "Programming Language :: Python :: 3 :: Only",
39
+ "Topic :: Utilities",
40
+ ],
41
+ keywords=[],
42
+ install_requires=[
43
+ "agrc-supervisor==3.*",
44
+ "agrc-usaddress==0.*",
45
+ "beautifulsoup4==4.*",
46
+ "docopt==0.*",
47
+ "html5lib==1.*",
48
+ "xxhash==3.*",
49
+ ],
50
+ dependency_links=[],
51
+ extras_require={
52
+ "tests": [
53
+ "pytest-cov==5.*",
54
+ "pytest-instafail==0.5.*",
55
+ "pytest-mock==3.*",
56
+ "pytest-watch==4.*",
57
+ "pytest==8.*",
58
+ "ruff==0.*",
59
+ ],
60
+ },
61
+ setup_requires=[
62
+ "pytest-runner",
63
+ ],
64
+ entry_points={"console_scripts": ["sweeper = sweeper.__main__:main"]},
65
+ )
File without changes
@@ -0,0 +1,207 @@
1
+ #!/usr/bin/env python
2
+ # * coding: utf8 *
3
+ """
4
+ sweeper
5
+
6
+ Usage:
7
+ sweeper sweep duplicates --workspace=<workspace> [--table-name=<table_name> --verbose --try-fix --change-detect --scheduled --save-report=<report_path> --backup-to=<backup_path>]
8
+ sweeper sweep empties --workspace=<workspace> [--table-name=<table_name> --verbose --try-fix --change-detect --scheduled --save-report=<report_path> --backup-to=<backup_path>]
9
+ sweeper sweep invalids --workspace=<workspace> [--table-name=<table_name> --verbose --try-fix --change-detect --scheduled --save-report=<report_path> --backup-to=<backup_path>]
10
+ sweeper sweep addresses --workspace=<workspace> --table-name=<table-name> --field-name=<field_name> [--verbose --try-fix --save-report=<report_path> --backup-to=<backup_path>]
11
+ sweeper sweep metadata --workspace=<workspace> [--table-name=<table_name> --verbose --try-fix --change-detect --scheduled --save-report=<report_path> --backup-to=<backup_path>]
12
+ sweeper sweep --workspace=<workspace> [--table-name=<table_name> --verbose --try-fix --change-detect --scheduled --save-report=<report_path> --backup-to=<backup_path>]
13
+
14
+ Arguments:
15
+ workspace - path to workspace eg: `c:\\my.gdb`
16
+ table_name - name of feature class or table eg: `Roads` (needs to be fully qualified (eg: `SGID.Transportation.Roads`) for metadata sweeper)
17
+ report_path - folder to save report to eg: `c:\\temp`
18
+ backup_path - place to create a temp gdb and import original table
19
+ field_name - name of the field to check
20
+
21
+ Examples:
22
+ sweeper sweep --workspace=c:\\data\\thing --try-fix --save-report=c:\\temp --backup-to=c:\\temp\\backup.gdb
23
+ sweeper sweep addresses --workspace=c:\\data\\thing --try-fix --save-report=c:\\temp --backup-to=c:\\temp\\backup.gdb --field-name=ADDRESS
24
+ """
25
+
26
+ import datetime
27
+ import logging
28
+ import logging.handlers
29
+ import sys
30
+ from pathlib import Path
31
+
32
+ import pkg_resources
33
+ from docopt import docopt
34
+ from supervisor.message_handlers import SendGridHandler
35
+ from supervisor.models import MessageDetails, Supervisor
36
+
37
+ from . import backup, config, report, utilities, workspace_info
38
+ from .sweepers.addresses import AddressTest
39
+ from .sweepers.duplicates import DuplicateTest
40
+ from .sweepers.empties import EmptyTest
41
+ from .sweepers.metadata import MetadataTest
42
+
43
+
44
+ def main():
45
+ """Main entry point for program. Parse arguments and pass to sweeper modules."""
46
+ args = docopt(__doc__, version=pkg_resources.require("ugrc-sweeper")[0].version)
47
+
48
+ log = setup_logging(args["--save-report"], args["--scheduled"])
49
+
50
+ if args["--scheduled"]:
51
+ #: set up supervisor, add email handler
52
+ sweeper_supervisor = Supervisor()
53
+ sweeper_supervisor.add_message_handler(
54
+ SendGridHandler(
55
+ {
56
+ "from_address": "noreply@utah.gov",
57
+ "to_addresses": config.get_config("TO_ADDRESSES"),
58
+ "api_key": config.get_config("SENDGRID_API_KEY"),
59
+ },
60
+ client_name="ugrc-sweeper",
61
+ client_version=pkg_resources.require("ugrc-sweeper")[0].version,
62
+ )
63
+ )
64
+
65
+ #: backup input file before quality checks
66
+ if args["--backup-to"]:
67
+ backup.backup_data(args["--workspace"], args["--table-name"], args["--backup-to"])
68
+
69
+ #: create a list to hold the instantiated objects.
70
+ closet = []
71
+
72
+ #: check what quality check to run.
73
+ if args["duplicates"]:
74
+ closet.append(DuplicateTest(args["--workspace"], args["--table-name"]))
75
+ elif args["invalids"]:
76
+ raise NotImplementedError('"Invalids" sweep/check not implemented yet.')
77
+ elif args["empties"]:
78
+ closet.append(EmptyTest(args["--workspace"], args["--table-name"]))
79
+ elif args["addresses"]:
80
+ closet.append(AddressTest(args["--workspace"], args["--table-name"], args["--field-name"]))
81
+ elif args["metadata"]:
82
+ closet.append(MetadataTest(args["--workspace"], args["--table-name"]))
83
+ else:
84
+ closet.append(DuplicateTest(args["--workspace"], args["--table-name"]))
85
+ closet.append(EmptyTest(args["--workspace"], args["--table-name"]))
86
+ closet.append(MetadataTest(args["--workspace"], args["--table-name"]))
87
+
88
+ reports = execute_sweepers(closet, args["--try-fix"], args["--change-detect"], log)
89
+
90
+ report.print_report(reports)
91
+
92
+ if args["--save-report"]:
93
+ report.save_report(reports, args["--save-report"])
94
+
95
+ if args["--scheduled"]:
96
+ report.add_to_log(reports)
97
+
98
+ final_message = report.format_message(reports)
99
+ log.info(final_message.getvalue())
100
+
101
+ #: Build and send summary message
102
+ summary_message = MessageDetails()
103
+ summary_message.message = final_message.getvalue()
104
+ summary_message.attachments = [config.LOG_FILE_PATH]
105
+ summary_message.subject = f"Sweeper Report {datetime.datetime.today()}"
106
+
107
+ sweeper_supervisor.notify(summary_message)
108
+
109
+
110
+ def execute_sweepers(closet, try_fix, using_change_detection, log):
111
+ """
112
+ orchestrate the sweeper calls.
113
+
114
+ closet: array of sweepers.
115
+ try_fix: bool whether to fix or not.
116
+ """
117
+
118
+ feature_class_names = []
119
+ reports = []
120
+
121
+ def run_tool(tool):
122
+ reports.append(tool.sweep())
123
+
124
+ if try_fix:
125
+ reports.append(tool.try_fix())
126
+
127
+ #: run sweeper again to ensure all errors were fixed.
128
+ reports.append(tool.sweep())
129
+
130
+ log.info(f"running {len(closet)} sweepers. try fix: {try_fix}")
131
+ for tool in closet:
132
+ log.info(f"running sweeper: {tool.key}")
133
+ if tool.table_name:
134
+ run_tool(tool)
135
+
136
+ continue
137
+
138
+ #: get feature class names once
139
+ if len(feature_class_names) == 0:
140
+ if using_change_detection:
141
+ log.info("Getting table names from change detection table")
142
+ feature_class_names = workspace_info.get_change_detection()
143
+ else:
144
+ log.info("Missing table name, executing over workspace")
145
+ feature_class_names = workspace_info.get_featureclasses(tool.workspace)
146
+ if any("SGID." in fc for fc in feature_class_names):
147
+ feature_class_names = [fc.split("SGID.", 1)[1] for fc in feature_class_names if "SGID." in fc]
148
+
149
+ #: apply exclusions
150
+ if config.has_config():
151
+ try:
152
+ exclusions_config = config.get_config("EXCLUSIONS")
153
+ except KeyError:
154
+ exclusions_config = {}
155
+
156
+ exclusions = exclusions_config.get(tool.key, [])
157
+ feature_class_names = utilities.apply_exclusions(feature_class_names, exclusions)
158
+
159
+ log.info(f"feature_class_names is: {feature_class_names}")
160
+
161
+ if using_change_detection and feature_class_names is None:
162
+ #: reset variable to empty list
163
+ log.info("Change detection found no updated tables")
164
+ feature_class_names = []
165
+
166
+ continue
167
+
168
+ #: explode sweeper class for each feature class
169
+ for table_name in feature_class_names:
170
+ new_tool = tool.clone(table_name, tool.workspace)
171
+
172
+ run_tool(new_tool)
173
+
174
+ if using_change_detection:
175
+ workspace_info.update_last_check_date()
176
+
177
+ return reports
178
+
179
+
180
+ def setup_logging(save_report, scheduled):
181
+ logger = logging.getLogger("sweeper")
182
+ logger.setLevel(logging.INFO)
183
+
184
+ formatter = logging.Formatter(
185
+ fmt="%(levelname)-7s %(asctime)s %(module)10s:%(lineno)5s %(message)s", datefmt="%m-%d %H:%M:%S"
186
+ )
187
+
188
+ #: always set up console_handler
189
+ console_handler = logging.StreamHandler(stream=sys.stdout)
190
+ console_handler.setFormatter(formatter)
191
+
192
+ #: use log file when report location not provided and when running from scheduled task
193
+ if scheduled and not save_report:
194
+ log_file = Path(config.LOG_FILE_PATH)
195
+ file_handler = logging.handlers.RotatingFileHandler(log_file, backupCount=10)
196
+ file_handler.doRollover()
197
+ file_handler.setFormatter(formatter)
198
+
199
+ logger.addHandler(file_handler)
200
+
201
+ logger.addHandler(console_handler)
202
+
203
+ return logger
204
+
205
+
206
+ if __name__ == "__main__":
207
+ sys.exit(main())