django-db-anonymiser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_db_anonymiser-0.1.0/PKG-INFO +98 -0
- django_db_anonymiser-0.1.0/README.md +77 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/__main__.py +68 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/config.py +373 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/__init__.py +47 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/mysql.py +196 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/postgres.py +170 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/constant.py +14 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/derived.py +14 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/string.py +31 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/times.py +11 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/user.py +145 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/session.py +146 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_config.py +256 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump.py +123 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump_mysql.py +196 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump_postgres.py +177 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_main.py +91 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_constant.py +29 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_derived.py +19 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_string.py +44 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_times.py +18 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_user.py +67 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_session.py +36 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_utils_mysql.py +112 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_utils_postgres.py +86 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/mysql.py +161 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/postgres.py +145 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/faker.py +91 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/commands/__init__.py +0 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/commands/dump_and_anonymise.py +105 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/tests/test_command.py +90 -0
- django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/tests/test_faker.py +116 -0
- django_db_anonymiser-0.1.0/pyproject.toml +31 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: django-db-anonymiser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Django app to create configurable anonymised DB dumps.
|
|
5
|
+
Author: Brendan Smith
|
|
6
|
+
Author-email: brendan.smith@digital.trade.gov.uk
|
|
7
|
+
Requires-Python: >3.9.1,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: boto3 (>=1.40.33,<2.0.0)
|
|
14
|
+
Requires-Dist: django (>=4.2.10,<5.0.0)
|
|
15
|
+
Requires-Dist: django-environ (>=0.12.0,<0.13.0)
|
|
16
|
+
Requires-Dist: faker (>=4.18.0)
|
|
17
|
+
Requires-Dist: psycopg2-binary (>=2.9.10,<3.0.0)
|
|
18
|
+
Requires-Dist: pymysql (>=1.1.2,<2.0.0)
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# django-db-anonymiser
|
|
22
|
+
Django app to create configurable anonymised DB dumps.
|
|
23
|
+
|
|
24
|
+
django-db-anonymiser provides a django app with a management command `dump_and_anonymise`.
|
|
25
|
+
This command runs a `pg_dump` against a postgresql DB, applies anonymisation functions to
|
|
26
|
+
data dumped from the DB and then writes the anonymised dump to S3.
|
|
27
|
+
See here for lite-api's example anonymisation configuration; https://github.com/uktrade/lite-api/blob/dev/api/conf/anonymise_model_config.yaml
|
|
28
|
+
|
|
29
|
+
This pattern is designed as a replacement for Lite's old DB anonymisation process (although it is general purpose and can be used for any django project which uses postgresql).
|
|
30
|
+
The previous process was baked in to an airflow installation and involved making
|
|
31
|
+
a `pg_dump` from production, anonymising that dump with python and pushing the
|
|
32
|
+
file to S3. See; https://github.com/uktrade/lite-airflow-dags/blob/master/dags/export_lite_db.py
|
|
33
|
+
|
|
34
|
+
django-db-anonymiser follows the same overall pattern, but aims to achieve it
|
|
35
|
+
through a django management command instead of running on top of airflow. In addition,
|
|
36
|
+
the configuration for how DB columns are anonymised can be configured in simple YAML.
|
|
37
|
+
|
|
38
|
+
**Note:** This repository depends upon code forked from https://github.com/andersinno/python-database-sanitizer
|
|
39
|
+
This is housed under the `database_sanitizer` directory and has been forked from the above repository
|
|
40
|
+
because it is unmaintained.
|
|
41
|
+
|
|
42
|
+
## Getting started
|
|
43
|
+
|
|
44
|
+
- Add `faker>=4.18.0`, `boto3>=1.26.17` to python requirements; it is assumed python/psycopg and co are already installed.
|
|
45
|
+
- Either add this GitHub repository as a submodule named `django_db_anonymiser` to your Django application, or install the Python package [django-db-anonymiser](https://pypi.org/project/django-db-anonymiser/) from PyPI.
|
|
46
|
+
- Add `django_db_anonymiser.db_anonymiser` to `INSTALLED_APPS`
|
|
47
|
+
- Set the following django settings;
|
|
48
|
+
- `DB_ANONYMISER_CONFIG_LOCATION` - the location of your anonymisation yaml file
|
|
49
|
+
- `DB_ANONYMISER_AWS_ENDPOINT_URL` - optional, custom URL for AWS (e.g. if using minio)
|
|
50
|
+
- `DB_ANONYMISER_AWS_ACCESS_KEY_ID` - AWS access key ID for the S3 bucket to upload dumps to
|
|
51
|
+
- `DB_ANONYMISER_AWS_SECRET_ACCESS_KEY` - AWS secret key for the S3 bucket to upload dumps to
|
|
52
|
+
- `DB_ANONYMISER_AWS_REGION` - AWS region for the S3 bucket to upload dumps to
|
|
53
|
+
- `DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME` - AWS bucket name for the S3 bucket to upload dumps to
|
|
54
|
+
|
|
55
|
+
## Running tests
|
|
56
|
+
|
|
57
|
+
For local unit testing from the root of the repository run:
|
|
58
|
+
|
|
59
|
+
$ poetry run pytest
|
|
60
|
+
|
|
61
|
+
**Note:** Currently for full test coverage, it is necessary to run tests in circleci, where we spin up a postgres db and test
|
|
62
|
+
the `db_anonymiser` command directly
|
|
63
|
+
|
|
64
|
+
## Publishing
|
|
65
|
+
|
|
66
|
+
Publishing to PyPI is currently a manual process:
|
|
67
|
+
|
|
68
|
+
1. Acquire API token from [Passman](https://passman.ci.uktrade.digital/secret/0f3d699a-1c7a-4e92-a235-6c756f678dd5/).
|
|
69
|
+
- Request access from the SRE team.
|
|
70
|
+
- _Note: You will need access to the `platform` group in Passman._
|
|
71
|
+
2. Run `poetry config pypi-token.pypi <token>` to add the token to your Poetry configuration.
|
|
72
|
+
|
|
73
|
+
Update the version, as the same version cannot be published to PyPI.
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
poetry version patch
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
More options for the `version` command can be found in the [Poetry documentation](https://python-poetry.org/docs/cli/#version). For example, for a minor version bump: `poetry version minor`.
|
|
80
|
+
|
|
81
|
+
Build the Python package.
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
poetry build
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Publish the Python package.
|
|
88
|
+
|
|
89
|
+
_Note: Make sure your Pull Request (PR) is approved and contains the version upgrade in `pyproject.toml` before publishing the package._
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
poetry publish
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Check the [PyPI Release history](https://pypi.org/project/django-db-anonymiser/#history) to make sure the package has been updated.
|
|
96
|
+
|
|
97
|
+
For an optional manual check, install the package locally and test everything works as expected.
|
|
98
|
+
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# django-db-anonymiser
|
|
2
|
+
Django app to create configurable anonymised DB dumps.
|
|
3
|
+
|
|
4
|
+
django-db-anonymiser provides a django app with a management command `dump_and_anonymise`.
|
|
5
|
+
This command runs a `pg_dump` against a postgresql DB, applies anonymisation functions to
|
|
6
|
+
data dumped from the DB and then writes the anonymised dump to S3.
|
|
7
|
+
See here for lite-api's example anonymisation configuration; https://github.com/uktrade/lite-api/blob/dev/api/conf/anonymise_model_config.yaml
|
|
8
|
+
|
|
9
|
+
This pattern is designed as a replacement for Lite's old DB anonymisation process (although it is general purpose and can be used for any django project which uses postgresql).
|
|
10
|
+
The previous process was baked in to an airflow installation and involved making
|
|
11
|
+
a `pg_dump` from production, anonymising that dump with python and pushing the
|
|
12
|
+
file to S3. See; https://github.com/uktrade/lite-airflow-dags/blob/master/dags/export_lite_db.py
|
|
13
|
+
|
|
14
|
+
django-db-anonymiser follows the same overall pattern, but aims to achieve it
|
|
15
|
+
through a django management command instead of running on top of airflow. In addition,
|
|
16
|
+
the configuration for how DB columns are anonymised can be configured in simple YAML.
|
|
17
|
+
|
|
18
|
+
**Note:** This repository depends upon code forked from https://github.com/andersinno/python-database-sanitizer
|
|
19
|
+
This is housed under the `database_sanitizer` directory and has been forked from the above repository
|
|
20
|
+
because it is unmaintained.
|
|
21
|
+
|
|
22
|
+
## Getting started
|
|
23
|
+
|
|
24
|
+
- Add `faker>=4.18.0`, `boto3>=1.26.17` to python requirements; it is assumed python/psycopg and co are already installed.
|
|
25
|
+
- Either add this GitHub repository as a submodule named `django_db_anonymiser` to your Django application, or install the Python package [django-db-anonymiser](https://pypi.org/project/django-db-anonymiser/) from PyPI.
|
|
26
|
+
- Add `django_db_anonymiser.db_anonymiser` to `INSTALLED_APPS`
|
|
27
|
+
- Set the following django settings;
|
|
28
|
+
- `DB_ANONYMISER_CONFIG_LOCATION` - the location of your anonymisation yaml file
|
|
29
|
+
- `DB_ANONYMISER_AWS_ENDPOINT_URL` - optional, custom URL for AWS (e.g. if using minio)
|
|
30
|
+
- `DB_ANONYMISER_AWS_ACCESS_KEY_ID` - AWS access key ID for the S3 bucket to upload dumps to
|
|
31
|
+
- `DB_ANONYMISER_AWS_SECRET_ACCESS_KEY` - AWS secret key for the S3 bucket to upload dumps to
|
|
32
|
+
- `DB_ANONYMISER_AWS_REGION` - AWS region for the S3 bucket to upload dumps to
|
|
33
|
+
- `DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME` - AWS bucket name for the S3 bucket to upload dumps to
|
|
34
|
+
|
|
35
|
+
## Running tests
|
|
36
|
+
|
|
37
|
+
For local unit testing from the root of the repository run:
|
|
38
|
+
|
|
39
|
+
$ poetry run pytest
|
|
40
|
+
|
|
41
|
+
**Note:** Currently for full test coverage, it is necessary to run tests in circleci, where we spin up a postgres db and test
|
|
42
|
+
the `db_anonymiser` command directly
|
|
43
|
+
|
|
44
|
+
## Publishing
|
|
45
|
+
|
|
46
|
+
Publishing to PyPI is currently a manual process:
|
|
47
|
+
|
|
48
|
+
1. Acquire API token from [Passman](https://passman.ci.uktrade.digital/secret/0f3d699a-1c7a-4e92-a235-6c756f678dd5/).
|
|
49
|
+
- Request access from the SRE team.
|
|
50
|
+
- _Note: You will need access to the `platform` group in Passman._
|
|
51
|
+
2. Run `poetry config pypi-token.pypi <token>` to add the token to your Poetry configuration.
|
|
52
|
+
|
|
53
|
+
Update the version, as the same version cannot be published to PyPI.
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
poetry version patch
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
More options for the `version` command can be found in the [Poetry documentation](https://python-poetry.org/docs/cli/#version). For example, for a minor version bump: `poetry version minor`.
|
|
60
|
+
|
|
61
|
+
Build the Python package.
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
poetry build
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Publish the Python package.
|
|
68
|
+
|
|
69
|
+
_Note: Make sure your Pull Request (PR) is approved and contains the version upgrade in `pyproject.toml` before publishing the package._
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
poetry publish
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Check the [PyPI Release history](https://pypi.org/project/django-db-anonymiser/#history) to make sure the package has been updated.
|
|
76
|
+
|
|
77
|
+
For an optional manual check, install the package locally and test everything works as expected.
|
|
File without changes
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import codecs
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
import six
|
|
11
|
+
|
|
12
|
+
from .config import Configuration
|
|
13
|
+
from .dump import run
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main(argv=sys.argv):
    """
    Command line entry point of the database sanitizer.

    Parses the command line, optionally loads a sanitizer configuration
    file and then writes the sanitized dump of the given database either
    into a file or into standard output.

    :param argv: Complete argument vector with the program name as its
                 first item. Defaults to ``sys.argv``.
    :type argv: list[str]

    :raises SystemExit: Raised by ``argparse`` when the arguments cannot be
                        parsed, for example when the database URL is
                        missing.
    """
    parser = argparse.ArgumentParser(
        prog=(argv[0] if len(argv) else "database-sanitizer"),
        description="Sanitizes contents of databases.",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        dest="config",
        help="Path to the sanitizer configuration file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        dest="output",
        help=(
            "Path to the file where the sanitized database will be written "
            "into. If omitted, standard output will be used instead."
        ),
    )
    parser.add_argument(
        "url",
        help="Database URL to which to connect into and sanitize contents.",
    )

    args = parser.parse_args(args=argv[1:])

    config = None
    if args.config:
        # The directory of the configuration file is put in front of the
        # import path before the configuration is loaded.
        # NOTE(review): presumably so that custom sanitizer modules stored
        # next to the configuration file can be imported -- confirm.
        conf_dir = os.path.realpath(os.path.dirname(args.config))
        sys.path.insert(0, conf_dir)
        config = Configuration.from_file(args.config)

    if not args.output:
        run(url=args.url, output=sys.stdout, config=config)
        return

    # Pin the encoding instead of relying on the locale default, so that
    # non-ASCII values in the dump cannot fail on non-UTF-8 platforms. The
    # context manager closes the file even if the dump fails midway.
    with open(args.output, "w", encoding="utf-8") as output:
        run(url=args.url, output=output, config=config)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
|
|
7
|
+
import six
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
__all__ = ("Configuration", "ConfigurationError")
|
|
11
|
+
|
|
12
|
+
# Strategy value which, when given for a table, adds that table to
# ``Configuration.skip_rows_for_tables`` instead of configuring sanitizers
# for its columns.
SKIP_ROWS_CONFIG_VALUE = "skip_rows"

# Parameters used for the dump tools when the configuration file does not
# define "config.extra_parameters.mysqldump" / "config.extra_parameters.pg_dump".
MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
PG_DUMP_DEFAULT_PARAMETERS = []
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ConfigurationError(ValueError):
    """
    Raised when the sanitizer configuration has an unexpected structure or
    refers to a sanitizer which cannot be found.
    """
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Configuration(object):
    """
    Object representation of database sanitizer configuration, usually read
    from a YAML file.
    """

    def __init__(self):
        # Maps "<table name>.<column name>" keys into sanitizer callables.
        self.sanitizers = {}
        # Names of the tables configured with the "skip_rows" strategy.
        self.skip_rows_for_tables = []
        # Package names from which sanitizer modules are looked up.
        self.addon_packages = []
        # Extra command line parameters for the dump tools.
        self.mysqldump_params = []
        self.pg_dump_params = []

    @classmethod
    def from_file(cls, filename):
        """
        Reads configuration from given path to a file in local file system and
        returns parsed version of it.

        :param filename: Path to the YAML file in local file system where the
                         configuration will be read from.
        :type filename: str

        :return: Configuration instance parsed from given configuration file.
        :rtype: Configuration

        :raises ConfigurationError: If the contents of the file do not form
                                    a valid configuration.
        """
        instance = cls()

        with open(filename, "rb") as file_stream:
            config_data = yaml.safe_load(file_stream)

        instance.load(config_data)

        return instance

    def load(self, config_data):
        """
        Loads sanitizers according to rulesets defined in given already parsed
        configuration file.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If the configuration data is not a
                                    dictionary, or one of its sections is
                                    malformed.
        """
        if not isinstance(config_data, dict):
            raise ConfigurationError(
                "Configuration data is %s instead of dict." % (
                    type(config_data),
                )
            )

        # Addon packages have to be known before the sanitizers are looked
        # up, because the lookup searches them.
        self.load_addon_packages(config_data)
        self.load_sanitizers(config_data)
        self.load_dump_extra_parameters(config_data)

    def load_dump_extra_parameters(self, config_data):
        """
        Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
        parameters should be added to the mysqldump and/or pg_dump command call
        when taking a dump.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "config" or "config.extra_parameters"
                                    is not a dictionary, or one of the
                                    parameter collections is not a list.
        """
        section_config = config_data.get("config", {})
        if not isinstance(section_config, dict):
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_extra_parameters = section_config.get("extra_parameters", {})
        if not isinstance(section_extra_parameters, dict):
            raise ConfigurationError(
                "'config.extra_parameters' is %s instead of dict" % (
                    type(section_extra_parameters),
                ),
            )

        if "mysqldump" in section_extra_parameters:
            mysqldump_params = section_extra_parameters["mysqldump"]
        else:
            mysqldump_params = MYSQLDUMP_DEFAULT_PARAMETERS
        if not isinstance(mysqldump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.mysqldump' is %s instead of list" % (
                    type(mysqldump_params),
                ),
            )

        if "pg_dump" in section_extra_parameters:
            pg_dump_params = section_extra_parameters["pg_dump"]
        else:
            pg_dump_params = PG_DUMP_DEFAULT_PARAMETERS
        if not isinstance(pg_dump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.pg_dump' is %s instead of list" % (
                    type(pg_dump_params),
                ),
            )

        # Store copies: the defaults are module level lists shared by every
        # instance, so later mutation of an instance's parameters must not
        # leak into them (or into the caller's configuration data).
        self.mysqldump_params = list(mysqldump_params)
        self.pg_dump_params = list(pg_dump_params)

    def load_addon_packages(self, config_data):
        """
        Loads the module paths from which the configuration will attempt to
        load sanitizers from. These must be stored as a list of strings under
        "config.addons" section of the configuration data.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "config" is neither missing, None nor
                                    a dictionary, or "config.addons" is not
                                    a list of strings.
        """
        section_config = config_data.get("config")
        if not isinstance(section_config, dict):
            if section_config is None:
                return
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_addons = section_config.get("addons", [])
        if not isinstance(section_addons, list):
            raise ConfigurationError(
                "'config.addons' is %s instead of list" % (
                    type(section_addons),
                ),
            )

        for index, module_path in enumerate(section_addons):
            if not isinstance(module_path, str):
                raise ConfigurationError(
                    "Item %d in 'config.addons' is %s instead of string" % (
                        index,
                        type(module_path),
                    ),
                )

        self.addon_packages = list(section_addons)

    def load_sanitizers(self, config_data):
        """
        Loads sanitizers possibly defined in the configuration under dictionary
        called "strategy", which should contain mapping of database tables with
        column names mapped into sanitizer function names.

        A table mapped into the value "skip_rows" is recorded in
        ``skip_rows_for_tables`` instead. Tables, columns and a whole
        "strategy" section with value None are ignored.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "strategy", one of its tables or one
                                    of the sanitizer names has a wrong type,
                                    or a sanitizer cannot be found.
        """
        section_strategy = config_data.get("strategy")
        if section_strategy is None:
            return

        # Anything but a dictionary is rejected here. This includes a bare
        # "skip_rows" string, which names no table and would otherwise fail
        # below with an AttributeError when being iterated as a mapping.
        if not isinstance(section_strategy, dict):
            raise ConfigurationError(
                "'strategy' is %s instead of dict" % (
                    type(section_strategy),
                ),
            )

        for table_name, column_data in section_strategy.items():
            if column_data == SKIP_ROWS_CONFIG_VALUE:
                self.skip_rows_for_tables.append(table_name)
                continue

            if not isinstance(column_data, dict):
                if column_data is None:
                    continue
                raise ConfigurationError(
                    "'strategy.%s' is %s instead of dict" % (
                        table_name,
                        type(column_data),
                    ),
                )

            for column_name, sanitizer_name in column_data.items():
                if sanitizer_name is None:
                    continue

                if not isinstance(sanitizer_name, str):
                    raise ConfigurationError(
                        "'strategy.%s.%s' is %s instead of string" % (
                            table_name,
                            column_name,
                            type(sanitizer_name),
                        ),
                    )

                sanitizer_callback = self.find_sanitizer(sanitizer_name)
                sanitizer_key = "%s.%s" % (table_name, column_name)
                self.sanitizers[sanitizer_key] = sanitizer_callback

    def find_sanitizer(self, name):
        """
        Searches for a sanitizer function with given name. The name should
        contain two parts separated from each other with a dot, the first
        part being the module name while the second being name of the function
        contained in the module, when it's being prefixed with "sanitize_".

        The lookup process consists of the following attempts:

        1. First package to look the module will be top level package called
           "sanitizers".
        2. Module will be looked under the "addon" packages, if they have been
           defined.
        3. Finally the sanitation function will be looked from the builtin
           sanitizers: first from the "database_sanitizer.sanitizers"
           package and then from the "sanitizers" package located next to
           this module, which covers installations where this code lives
           inside another package.

        If none of these provide any results, ConfigurationError will be
        thrown.

        :param name: "Full name" of the sanitation function containing name
                     of the module as well as name of the function.
        :type name: str

        :return: First function which can be imported with the given name.
        :rtype: callable

        :raises ConfigurationError: If the name has no module part or no
                                    matching sanitizer can be found.
        """
        # Split the sanitizer name into two parts, one containing the Python
        # module name, while second containing portion of the function name
        # we are looking for.
        name_parts = name.split(".")
        if len(name_parts) < 2:
            raise ConfigurationError(
                "Unable to separate module name from function name in '%s'" % (
                    name,
                ),
            )

        module_name_suffix = ".".join(name_parts[:-1])
        function_name = "sanitize_%s" % (name_parts[-1],)

        # Phase 1: Look for custom sanitizer under a top level package called
        # "sanitizers".
        candidate_packages = ["sanitizers"]

        # Phase 2: Look for the sanitizer under "addon" packages, if any of
        # such have been defined.
        candidate_packages.extend(self.addon_packages)

        # Phase 3: Look from builtin sanitizers. The historical top level
        # name is tried first to keep existing behaviour; the package
        # relative name makes the builtins reachable when this module is
        # imported as part of a larger package.
        candidate_packages.append("database_sanitizer.sanitizers")
        parent_package = __name__.rpartition(".")[0]
        if parent_package:
            sibling_package = "%s.sanitizers" % (parent_package,)
            if sibling_package != "database_sanitizer.sanitizers":
                candidate_packages.append(sibling_package)

        for package_name in candidate_packages:
            callback = self.find_sanitizer_from_module(
                module_name="%s.%s" % (package_name, module_name_suffix),
                function_name=function_name,
            )
            if callback:
                return callback

        # Give up.
        raise ConfigurationError("Unable to find sanitizer called '%s'" % (
            name,
        ))

    @staticmethod
    def find_sanitizer_from_module(module_name, function_name):
        """
        Attempts to find sanitizer function from given module. If the module
        cannot be imported, or function with given name does not exist in it,
        nothing will be returned by this method. Otherwise the found sanitizer
        function will be returned.

        :param module_name: Name of the module to import the function from.
        :type module_name: str

        :param function_name: Name of the function to look for inside the
                              module.
        :type function_name: str

        :return: Sanitizer function found from the module, if it can be
                 imported and it indeed contains function with the given name.
                 Otherwise None will be returned instead.
        :rtype: callback|None

        :raises ConfigurationError: If the module contains something with
                                    the given name which is not callable.
        """
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # NOTE: this also covers an ImportError raised while an existing
            # module is being imported; such a module is treated the same
            # way as a missing one.
            return None

        # Look for the function inside the module. At this point it could be
        # pretty much anything.
        callback = getattr(module, function_name, None)

        # Function does not exist in this module? Give up.
        if callback is None:
            return None

        # It's actually callable function? Return it.
        if callable(callback):
            return callback

        # Sanitizer seems to be something else than a function. Throw an
        # exception to report such problem.
        raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
            function_name,
            module_name,
            type(callback),
        ))

    def get_sanitizer_for(self, table_name, column_name):
        """
        Get sanitizer for given table and column name.

        :param table_name: Name of the database table.
        :type table_name: str

        :param column_name: Name of the database column.
        :type column_name: str

        :return: Sanitizer function or None if nothing is configured
        :rtype: Optional[Callable[[Optional[str]], Optional[str]]]
        """
        sanitizer_key = "%s.%s" % (table_name, column_name)
        return self.sanitizers.get(sanitizer_key)

    def sanitize(self, table_name, column_name, value):
        """
        Sanitizes given value extracted from the database according to the
        sanitation configuration. The value is returned unchanged when no
        sanitizer has been configured for the column.

        TODO: Add support for dates, booleans and other types found in SQL than
        string.

        :param table_name: Name of the database table from which the value is
                           from.
        :type table_name: str

        :param column_name: Name of the database column from which the value is
                            from.
        :type column_name: str

        :param value: Value from the database, either in text form or None if
                      the value is null.
        :type value: str|None

        :return: Sanitized version of the given value.
        :rtype: str|None
        """
        sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
        return sanitizer_callback(value) if sanitizer_callback else value
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
|
|
7
|
+
from six.moves.urllib import parse as urlparse
|
|
8
|
+
|
|
9
|
+
from .. import session
|
|
10
|
+
|
|
11
|
+
# Maps the scheme of a database URL into the dotted path of the module which
# ``run`` imports to produce the sanitized dump for that kind of database.
SUPPORTED_DATABASE_MODULES = {
    "mysql": "django_db_anonymiser.database_sanitizer.dump.mysql",
    "postgres": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgresql": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgis": "django_db_anonymiser.database_sanitizer.dump.postgres",
}


# Register supported database schemes with the URL parser.
for scheme in SUPPORTED_DATABASE_MODULES:
    urlparse.uses_netloc.append(scheme)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run(url, output, config):
    """
    Dumps the database behind the given URL and writes a sanitized copy of
    the dump into the given stream, one line at a time.

    The module doing the actual work is picked by the scheme of the URL
    from ``SUPPORTED_DATABASE_MODULES``. ``session.reset()`` is called
    before the dump is sanitized.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
                   written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the scheme of the URL is not supported.
    """
    url_parts = urlparse.urlparse(url)
    scheme = url_parts.scheme

    module_path = SUPPORTED_DATABASE_MODULES.get(scheme)
    if not module_path:
        raise ValueError("Unsupported database scheme: '%s'" % (scheme,))

    dump_module = importlib.import_module(module_path)
    session.reset()

    sanitized_lines = dump_module.sanitize(url=url_parts, config=config)
    for sanitized_line in sanitized_lines:
        output.write(sanitized_line + "\n")
|