django-db-anonymiser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. django_db_anonymiser-0.1.0/PKG-INFO +98 -0
  2. django_db_anonymiser-0.1.0/README.md +77 -0
  3. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/__init__.py +0 -0
  4. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/__main__.py +68 -0
  5. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/config.py +373 -0
  6. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/__init__.py +47 -0
  7. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/mysql.py +196 -0
  8. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/dump/postgres.py +170 -0
  9. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/__init__.py +0 -0
  10. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/constant.py +14 -0
  11. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/derived.py +14 -0
  12. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/string.py +31 -0
  13. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/times.py +11 -0
  14. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/sanitizers/user.py +145 -0
  15. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/session.py +146 -0
  16. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/__init__.py +0 -0
  17. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_config.py +256 -0
  18. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump.py +123 -0
  19. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump_mysql.py +196 -0
  20. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_dump_postgres.py +177 -0
  21. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_main.py +91 -0
  22. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_constant.py +29 -0
  23. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_derived.py +19 -0
  24. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_string.py +44 -0
  25. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_times.py +18 -0
  26. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_sanitizers_user.py +67 -0
  27. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_session.py +36 -0
  28. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_utils_mysql.py +112 -0
  29. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/tests/test_utils_postgres.py +86 -0
  30. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/__init__.py +0 -0
  31. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/mysql.py +161 -0
  32. django_db_anonymiser-0.1.0/django_db_anonymiser/database_sanitizer/utils/postgres.py +145 -0
  33. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/__init__.py +0 -0
  34. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/faker.py +91 -0
  35. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/__init__.py +0 -0
  36. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/commands/__init__.py +0 -0
  37. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/management/commands/dump_and_anonymise.py +105 -0
  38. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/tests/test_command.py +90 -0
  39. django_db_anonymiser-0.1.0/django_db_anonymiser/db_anonymiser/tests/test_faker.py +116 -0
  40. django_db_anonymiser-0.1.0/pyproject.toml +31 -0
@@ -0,0 +1,98 @@
1
+ Metadata-Version: 2.3
2
+ Name: django-db-anonymiser
3
+ Version: 0.1.0
4
+ Summary: Django app to create configurable anonymised DB dumps.
5
+ Author: Brendan Smith
6
+ Author-email: brendan.smith@digital.trade.gov.uk
7
+ Requires-Python: >3.9.1,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: boto3 (>=1.40.33,<2.0.0)
14
+ Requires-Dist: django (>=4.2.10,<5.0.0)
15
+ Requires-Dist: django-environ (>=0.12.0,<0.13.0)
16
+ Requires-Dist: faker (>=4.18.0)
17
+ Requires-Dist: psycopg2-binary (>=2.9.10,<3.0.0)
18
+ Requires-Dist: pymysql (>=1.1.2,<2.0.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # django-db-anonymiser
22
+ Django app to create configurable anonymised DB dumps.
23
+
24
+ django-db-anonymiser provides a django app with a management command `dump_and_anonymise`.
25
+ This command runs a `pg_dump` against a postgresql DB, applies anonymisation functions to
26
+ data dumped from the DB and then writes the anonymised dump to S3.
27
+ See here for lite-api's example anonymisation configuration; https://github.com/uktrade/lite-api/blob/dev/api/conf/anonymise_model_config.yaml
28
+
29
+ This pattern is designed as a replacement for Lite's old DB anonymisation process (although it is general purpose and can be used for any django project which uses postgresql).
30
+ The previous process was baked in to an airflow installation and involved making
31
+ a `pg_dump` from production, anonymising that dump with python and pushing the
32
+ file to S3. See; https://github.com/uktrade/lite-airflow-dags/blob/master/dags/export_lite_db.py
33
+
34
+ django-db-anonymiser follows the same overall pattern, but aims to achieve it
35
+ through a django management command instead of running on top of airflow. In addition,
36
+ the way each DB column is anonymised can be configured in simple YAML.
37
+
38
+ **Note:** This repository depends upon code forked from https://github.com/andersinno/python-database-sanitizer
39
+ This is housed under the `database_sanitizer` directory and has been forked from the above repository
40
+ because it is unmaintained.
41
+
42
+ ## Getting started
43
+
44
+ - Add `faker>=4.18.0`, `boto3>=1.26.17` to python requirements; it is assumed python/psycopg and co are already installed.
45
+ - Either add this GitHub repository to your Django application as a submodule named `django_db_anonymiser`, or install the Python package [django-db-anonymiser](https://pypi.org/project/django-db-anonymiser/) from PyPI.
46
+ - Add `django_db_anonymiser.db_anonymiser` to `INSTALLED_APPS`
47
+ - Set the following django settings;
48
+ - `DB_ANONYMISER_CONFIG_LOCATION` - the location of your anonymisation yaml file
49
+ - `DB_ANONYMISER_AWS_ENDPOINT_URL` - optional, custom URL for AWS (e.g. if using minio)
50
+ - `DB_ANONYMISER_AWS_ACCESS_KEY_ID` - AWS access key ID for the S3 bucket to upload dumps to
51
+ - `DB_ANONYMISER_AWS_SECRET_ACCESS_KEY` - AWS secret key for the S3 bucket to upload dumps to
52
+ - `DB_ANONYMISER_AWS_REGION` - AWS region for the S3 bucket to upload dumps to
53
+ - `DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME` - AWS bucket name for the S3 bucket to upload dumps to
54
+
55
+ ## Running tests
56
+
57
+ For local unit testing from the root of the repository run:
58
+
59
+ $ poetry run pytest
60
+
61
+ **Note:** Currently for full test coverage, it is necessary to run tests in circleci, where we spin up a postgres db and test
62
+ the `db_anonymiser` command directly
63
+
64
+ ## Publishing
65
+
66
+ Publishing to PyPI is currently a manual process:
67
+
68
+ 1. Acquire API token from [Passman](https://passman.ci.uktrade.digital/secret/0f3d699a-1c7a-4e92-a235-6c756f678dd5/).
69
+ - Request access from the SRE team.
70
+ - _Note: You will need access to the `platform` group in Passman._
71
+ 2. Run `poetry config pypi-token.pypi <token>` to add the token to your Poetry configuration.
72
+
73
+ Update the version, as the same version cannot be published to PyPI.
74
+
75
+ ```
76
+ poetry version patch
77
+ ```
78
+
79
+ More options for the `version` command can be found in the [Poetry documentation](https://python-poetry.org/docs/cli/#version). For example, for a minor version bump: `poetry version minor`.
80
+
81
+ Build the Python package.
82
+
83
+ ```
84
+ poetry build
85
+ ```
86
+
87
+ Publish the Python package.
88
+
89
+ _Note: Make sure your Pull Request (PR) is approved and contains the version upgrade in `pyproject.toml` before publishing the package._
90
+
91
+ ```
92
+ poetry publish
93
+ ```
94
+
95
+ Check the [PyPI Release history](https://pypi.org/project/django-db-anonymiser/#history) to make sure the package has been updated.
96
+
97
+ For an optional manual check, install the package locally and test everything works as expected.
98
+
@@ -0,0 +1,77 @@
1
+ # django-db-anonymiser
2
+ Django app to create configurable anonymised DB dumps.
3
+
4
+ django-db-anonymiser provides a django app with a management command `dump_and_anonymise`.
5
+ This command runs a `pg_dump` against a postgresql DB, applies anonymisation functions to
6
+ data dumped from the DB and then writes the anonymised dump to S3.
7
+ See here for lite-api's example anonymisation configuration; https://github.com/uktrade/lite-api/blob/dev/api/conf/anonymise_model_config.yaml
8
+
9
+ This pattern is designed as a replacement for Lite's old DB anonymisation process (although it is general purpose and can be used for any django project which uses postgresql).
10
+ The previous process was baked in to an airflow installation and involved making
11
+ a `pg_dump` from production, anonymising that dump with python and pushing the
12
+ file to S3. See; https://github.com/uktrade/lite-airflow-dags/blob/master/dags/export_lite_db.py
13
+
14
+ django-db-anonymiser follows the same overall pattern, but aims to achieve it
15
+ through a django management command instead of running on top of airflow. In addition,
16
+ the way each DB column is anonymised can be configured in simple YAML.
17
+
18
+ **Note:** This repository depends upon code forked from https://github.com/andersinno/python-database-sanitizer
19
+ This is housed under the `database_sanitizer` directory and has been forked from the above repository
20
+ because it is unmaintained.
21
+
22
+ ## Getting started
23
+
24
+ - Add `faker>=4.18.0`, `boto3>=1.26.17` to python requirements; it is assumed python/psycopg and co are already installed.
25
+ - Either add this GitHub repository to your Django application as a submodule named `django_db_anonymiser`, or install the Python package [django-db-anonymiser](https://pypi.org/project/django-db-anonymiser/) from PyPI.
26
+ - Add `django_db_anonymiser.db_anonymiser` to `INSTALLED_APPS`
27
+ - Set the following django settings;
28
+ - `DB_ANONYMISER_CONFIG_LOCATION` - the location of your anonymisation yaml file
29
+ - `DB_ANONYMISER_AWS_ENDPOINT_URL` - optional, custom URL for AWS (e.g. if using minio)
30
+ - `DB_ANONYMISER_AWS_ACCESS_KEY_ID` - AWS access key ID for the S3 bucket to upload dumps to
31
+ - `DB_ANONYMISER_AWS_SECRET_ACCESS_KEY` - AWS secret key for the S3 bucket to upload dumps to
32
+ - `DB_ANONYMISER_AWS_REGION` - AWS region for the S3 bucket to upload dumps to
33
+ - `DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME` - AWS bucket name for the S3 bucket to upload dumps to
34
+
35
+ ## Running tests
36
+
37
+ For local unit testing from the root of the repository run:
38
+
39
+ $ poetry run pytest
40
+
41
+ **Note:** Currently for full test coverage, it is necessary to run tests in circleci, where we spin up a postgres db and test
42
+ the `db_anonymiser` command directly
43
+
44
+ ## Publishing
45
+
46
+ Publishing to PyPI is currently a manual process:
47
+
48
+ 1. Acquire API token from [Passman](https://passman.ci.uktrade.digital/secret/0f3d699a-1c7a-4e92-a235-6c756f678dd5/).
49
+ - Request access from the SRE team.
50
+ - _Note: You will need access to the `platform` group in Passman._
51
+ 2. Run `poetry config pypi-token.pypi <token>` to add the token to your Poetry configuration.
52
+
53
+ Update the version, as the same version cannot be published to PyPI.
54
+
55
+ ```
56
+ poetry version patch
57
+ ```
58
+
59
+ More options for the `version` command can be found in the [Poetry documentation](https://python-poetry.org/docs/cli/#version). For example, for a minor version bump: `poetry version minor`.
60
+
61
+ Build the Python package.
62
+
63
+ ```
64
+ poetry build
65
+ ```
66
+
67
+ Publish the Python package.
68
+
69
+ _Note: Make sure your Pull Request (PR) is approved and contains the version upgrade in `pyproject.toml` before publishing the package._
70
+
71
+ ```
72
+ poetry publish
73
+ ```
74
+
75
+ Check the [PyPI Release history](https://pypi.org/project/django-db-anonymiser/#history) to make sure the package has been updated.
76
+
77
+ For an optional manual check, install the package locally and test everything works as expected.
@@ -0,0 +1,68 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import argparse
6
+ import codecs
7
+ import os
8
+ import sys
9
+
10
+ import six
11
+
12
+ from .config import Configuration
13
+ from .dump import run
14
+
15
+
16
+ def main(argv=sys.argv):
17
+ parser = argparse.ArgumentParser(
18
+ prog=(argv[0] if len(argv) else "database-sanitizer"),
19
+ description="Sanitizes contents of databases.",
20
+ )
21
+ parser.add_argument(
22
+ "--config",
23
+ "-c",
24
+ type=str,
25
+ dest="config",
26
+ help="Path to the sanitizer configuration file.",
27
+ )
28
+ parser.add_argument(
29
+ "--output",
30
+ "-o",
31
+ type=str,
32
+ dest="output",
33
+ help=(
34
+ "Path to the file where the sanitized database will be written "
35
+ "into. If omitted, standard output will be used instead."
36
+ ),
37
+ )
38
+ parser.add_argument(
39
+ "url",
40
+ help="Database URL to which to connect into and sanitize contents.",
41
+ )
42
+
43
+ args = parser.parse_args(args=argv[1:])
44
+ output = sys.stdout
45
+ if six.PY2:
46
+ output = codecs.getwriter("utf-8")(output)
47
+ config = None
48
+
49
+ if args.config:
50
+ conf_dir = os.path.realpath(os.path.dirname(args.config))
51
+ sys.path.insert(0, conf_dir)
52
+ config = Configuration.from_file(args.config)
53
+ if args.output:
54
+ output = open(args.output, "w")
55
+
56
+ try:
57
+ run(
58
+ url=args.url,
59
+ output=output,
60
+ config=config,
61
+ )
62
+ finally:
63
+ if args.output:
64
+ output.close()
65
+
66
+
67
+ if __name__ == "__main__":
68
+ main()
@@ -0,0 +1,373 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import importlib
6
+
7
+ import six
8
+ import yaml
9
+
10
+ __all__ = ("Configuration", "ConfigurationError")
11
+
12
+ SKIP_ROWS_CONFIG_VALUE = "skip_rows"
13
+ MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
14
+ PG_DUMP_DEFAULT_PARAMETERS = []
15
+
16
+
17
class ConfigurationError(ValueError):
    """
    Raised when the sanitizer configuration is malformed or refers to
    something that cannot be resolved. Being a ``ValueError`` subclass, it
    is also caught by handlers written for ``ValueError``.
    """
21
+
22
+
23
class Configuration(object):
    """
    Object representation of database sanitizer configuration, usually read
    from a YAML file.
    """

    def __init__(self):
        # Maps "<table>.<column>" keys to sanitizer callables.
        self.sanitizers = {}
        # Names of the tables whose strategy value is "skip_rows".
        self.skip_rows_for_tables = []
        # Package paths searched for custom sanitizer modules.
        self.addon_packages = []
        # Extra command line parameters for the respective dump tools.
        self.mysqldump_params = []
        self.pg_dump_params = []

    @classmethod
    def from_file(cls, filename):
        """
        Reads configuration from given path to a file in local file system and
        returns parsed version of it.

        :param filename: Path to the YAML file in local file system where the
            configuration will be read from.
        :type filename: str

        :return: Configuration instance parsed from given configuration file.
        :rtype: Configuration

        :raises ConfigurationError: If the parsed data is not valid
            configuration.
        """
        instance = cls()

        with open(filename, "rb") as file_stream:
            config_data = yaml.safe_load(file_stream)

        instance.load(config_data)

        return instance

    def load(self, config_data):
        """
        Loads sanitizers according to rulesets defined in given already parsed
        configuration file.

        Addon packages are loaded before the sanitizers, because the
        sanitizer lookup searches the addon packages.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If the data is not a dictionary or any of
            its sections is malformed.
        """
        if not isinstance(config_data, dict):
            raise ConfigurationError(
                "Configuration data is %s instead of dict." % (
                    type(config_data),
                )
            )

        self.load_addon_packages(config_data)
        self.load_sanitizers(config_data)
        self.load_dump_extra_parameters(config_data)

    def load_dump_extra_parameters(self, config_data):
        """
        Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
        parameters should be added to the mysqldump and/or pg_dump command call
        when taking a dump.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "config" or "config.extra_parameters"
            is not a dictionary, or either parameter value is not a list.
        """
        section_config = config_data.get("config", {})
        if not isinstance(section_config, dict):
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_extra_parameters = section_config.get("extra_parameters", {})
        if not isinstance(section_extra_parameters, dict):
            raise ConfigurationError(
                "'config.extra_parameters' is %s instead of dict" % (
                    type(section_extra_parameters),
                ),
            )

        mysqldump_params = section_extra_parameters.get(
            "mysqldump",
            MYSQLDUMP_DEFAULT_PARAMETERS,
        )
        if not isinstance(mysqldump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.mysqldump' is %s instead of list" % (
                    type(mysqldump_params),
                ),
            )

        pg_dump_params = section_extra_parameters.get(
            "pg_dump",
            PG_DUMP_DEFAULT_PARAMETERS,
        )
        if not isinstance(pg_dump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.pg_dump' is %s instead of list" % (
                    type(pg_dump_params),
                ),
            )

        # Store copies: the values may be the module-level default lists, and
        # mutating an instance's parameters must not alter those defaults.
        self.mysqldump_params = list(mysqldump_params)
        self.pg_dump_params = list(pg_dump_params)

    def load_addon_packages(self, config_data):
        """
        Loads the module paths from which the configuration will attempt to
        load sanitizers from. These must be stored as a list of strings under
        "config.addons" section of the configuration data.

        A missing (or null) "config" section leaves the addon packages
        untouched.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "config" is not a dictionary, or
            "config.addons" is not a list of strings.
        """
        section_config = config_data.get("config")
        if not isinstance(section_config, dict):
            if section_config is None:
                return
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_addons = section_config.get("addons", [])
        if not isinstance(section_addons, list):
            raise ConfigurationError(
                "'config.addons' is %s instead of list" % (
                    type(section_addons),
                ),
            )

        for index, module_path in enumerate(section_addons):
            if not isinstance(module_path, str):
                raise ConfigurationError(
                    "Item %d in 'config.addons' is %s instead of string" % (
                        index,
                        type(module_path),
                    ),
                )

        self.addon_packages = list(section_addons)

    def load_sanitizers(self, config_data):
        """
        Loads sanitizers possibly defined in the configuration under dictionary
        called "strategy", which should contain mapping of database tables with
        column names mapped into sanitizer function names.

        A table may be mapped to the string "skip_rows" instead of a column
        mapping, in which case its name is recorded in
        ``skip_rows_for_tables``. Tables and columns mapped to null are
        ignored.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "strategy" or any table entry has an
            unexpected type, or a sanitizer cannot be found.
        """
        section_strategy = config_data.get("strategy")
        if section_strategy is None:
            return

        if not isinstance(section_strategy, dict):
            # "skip_rows" is only meaningful as the value of a single table.
            # Any non-dict value here, including that string, is rejected
            # rather than being iterated as if it were a mapping.
            raise ConfigurationError(
                "'strategy' is %s instead of dict" % (
                    type(section_strategy),
                ),
            )

        for table_name, column_data in section_strategy.items():
            if column_data == SKIP_ROWS_CONFIG_VALUE:
                self.skip_rows_for_tables.append(table_name)
                continue

            if column_data is None:
                continue

            if not isinstance(column_data, dict):
                raise ConfigurationError(
                    "'strategy.%s' is %s instead of dict" % (
                        table_name,
                        type(column_data),
                    ),
                )

            for column_name, sanitizer_name in column_data.items():
                if sanitizer_name is None:
                    continue

                if not isinstance(sanitizer_name, str):
                    raise ConfigurationError(
                        "'strategy.%s.%s' is %s instead of string" % (
                            table_name,
                            column_name,
                            type(sanitizer_name),
                        ),
                    )

                sanitizer_callback = self.find_sanitizer(sanitizer_name)
                sanitizer_key = "%s.%s" % (table_name, column_name)
                self.sanitizers[sanitizer_key] = sanitizer_callback

    def _sanitizer_search_path(self):
        """
        Returns the package names searched for sanitizer modules, in lookup
        order.

        :return: Package names, highest priority first.
        :rtype: list[str]
        """
        search_path = ["sanitizers"]
        search_path.extend(self.addon_packages)
        search_path.append("database_sanitizer.sanitizers")

        # The builtin sanitizers live next to this module. When the code is
        # shipped inside another package, the legacy top level name above is
        # not importable, so the actual parent package is searched as well.
        own_package = __name__.rpartition(".")[0]
        if own_package:
            bundled_package = "%s.sanitizers" % (own_package,)
            if bundled_package not in search_path:
                search_path.append(bundled_package)

        return search_path

    def find_sanitizer(self, name):
        """
        Searches for a sanitizer function with given name. The name should
        contain two parts separated from each other with a dot, the first
        part being the module name while the second being name of the function
        contained in the module, when it's being prefixed with "sanitize_".

        The lookup process consists of these attempts, in order:

        1. First package to look the module will be top level package called
           "sanitizers".
        2. Module will be looked under the "addon" packages, if they have been
           defined.
        3. The sanitation function will be looked from the builtin
           sanitizers located in "database_sanitizer.sanitizers" package.
        4. Finally the "sanitizers" package located next to this module is
           searched, for installations where the builtin sanitizers are not
           importable under the top level name.

        If none of these provide any results, ConfigurationError will be
        thrown.

        :param name: "Full name" of the sanitation function containing name
            of the module as well as name of the function.
        :type name: str

        :return: First function which can be imported with the given name.
        :rtype: callable

        :raises ConfigurationError: If the name has no module part or no
            matching sanitizer function is found.
        """
        # Split the sanitizer name into two parts, one containing the Python
        # module name, while second containing portion of the function name
        # we are looking for.
        name_parts = name.split(".")
        if len(name_parts) < 2:
            raise ConfigurationError(
                "Unable to separate module name from function name in '%s'" % (
                    name,
                ),
            )

        module_name_suffix = ".".join(name_parts[:-1])
        function_name = "sanitize_%s" % (name_parts[-1],)

        for package_name in self._sanitizer_search_path():
            callback = self.find_sanitizer_from_module(
                module_name="%s.%s" % (package_name, module_name_suffix),
                function_name=function_name,
            )
            if callback:
                return callback

        # Give up.
        raise ConfigurationError("Unable to find sanitizer called '%s'" % (
            name,
        ))

    @staticmethod
    def find_sanitizer_from_module(module_name, function_name):
        """
        Attempts to find sanitizer function from given module. If the module
        cannot be imported, or function with given name does not exist in it,
        nothing will be returned by this method. Otherwise the found sanitizer
        function will be returned.

        :param module_name: Name of the module to import the function from.
        :type module_name: str

        :param function_name: Name of the function to look for inside the
            module.
        :type function_name: str

        :return: Sanitizer function found from the module, if it can be
            imported and it indeed contains function with the given name.
            Otherwise None will be returned instead.
        :rtype: callback|None

        :raises ConfigurationError: If the module has an attribute with the
            given name which is not callable.
        """
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # NOTE: this also hides ImportErrors raised from within an
            # existing module while it is being imported.
            return None

        # Look for the function inside the module. At this point it could be
        # pretty much anything.
        callback = getattr(module, function_name, None)

        # Function does not exist in this module? Give up.
        if callback is None:
            return None

        # It's actually callable function? Return it.
        if callable(callback):
            return callback

        # Sanitizer seems to be something else than a function. Throw an
        # exception to report such problem.
        raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
            function_name,
            module_name,
            type(callback),
        ))

    def get_sanitizer_for(self, table_name, column_name):
        """
        Get sanitizer for given table and column name.

        :param table_name: Name of the database table.
        :type table_name: str

        :param column_name: Name of the database column.
        :type column_name: str

        :return: Sanitizer function or None if nothing is configured
        :rtype: Optional[Callable[[Optional[str]], Optional[str]]]
        """
        sanitizer_key = "%s.%s" % (table_name, column_name)
        return self.sanitizers.get(sanitizer_key)

    def sanitize(self, table_name, column_name, value):
        """
        Sanitizes given value extracted from the database according to the
        sanitation configuration. The value is returned unchanged when no
        sanitizer is configured for the column.

        TODO: Add support for dates, booleans and other types found in SQL than
        string.

        :param table_name: Name of the database table from which the value is
            from.
        :type table_name: str

        :param column_name: Name of the database column from which the value is
            from.
        :type column_name: str

        :param value: Value from the database, either in text form or None if
            the value is null.
        :type value: str|None

        :return: Sanitized version of the given value.
        :rtype: str|None
        """
        sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
        return sanitizer_callback(value) if sanitizer_callback else value
@@ -0,0 +1,47 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import importlib
6
+
7
+ from six.moves.urllib import parse as urlparse
8
+
9
+ from .. import session
10
+
11
# Maps each supported database URL scheme to the import path of the module
# that produces the sanitized dump for that kind of database.
SUPPORTED_DATABASE_MODULES = {
    "mysql": "django_db_anonymiser.database_sanitizer.dump.mysql",
    "postgres": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgresql": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgis": "django_db_anonymiser.database_sanitizer.dump.postgres",
}


# Register supported database schemes. A scheme already present is skipped,
# so executing this module again (e.g. on reload) does not add duplicates.
for scheme in SUPPORTED_DATABASE_MODULES:
    if scheme not in urlparse.uses_netloc:
        urlparse.uses_netloc.append(scheme)
22
+
23
+
24
def run(url, output, config):
    """
    Dumps the database behind the given URL and writes a sanitized copy of
    the dump, line by line, into the given stream.

    The scheme of the URL selects the database specific dump module. The
    sanitizer session is reset before the dump is produced.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
        written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
        of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the scheme of the URL is not supported.
    """
    parsed = urlparse.urlparse(url)
    scheme = parsed.scheme

    module_path = SUPPORTED_DATABASE_MODULES.get(scheme)
    if not module_path:
        raise ValueError("Unsupported database scheme: '%s'" % (scheme,))

    backend = importlib.import_module(module_path)
    session.reset()

    # Every yielded line is written out followed by a newline.
    sanitized_lines = backend.sanitize(url=parsed, config=config)
    for text in sanitized_lines:
        output.write(text + "\n")