django-db-anonymiser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_db_anonymiser/database_sanitizer/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/__main__.py +68 -0
- django_db_anonymiser/database_sanitizer/config.py +373 -0
- django_db_anonymiser/database_sanitizer/dump/__init__.py +47 -0
- django_db_anonymiser/database_sanitizer/dump/mysql.py +196 -0
- django_db_anonymiser/database_sanitizer/dump/postgres.py +170 -0
- django_db_anonymiser/database_sanitizer/sanitizers/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/sanitizers/constant.py +14 -0
- django_db_anonymiser/database_sanitizer/sanitizers/derived.py +14 -0
- django_db_anonymiser/database_sanitizer/sanitizers/string.py +31 -0
- django_db_anonymiser/database_sanitizer/sanitizers/times.py +11 -0
- django_db_anonymiser/database_sanitizer/sanitizers/user.py +145 -0
- django_db_anonymiser/database_sanitizer/session.py +146 -0
- django_db_anonymiser/database_sanitizer/tests/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/tests/test_config.py +256 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump.py +123 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump_mysql.py +196 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump_postgres.py +177 -0
- django_db_anonymiser/database_sanitizer/tests/test_main.py +91 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_constant.py +29 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_derived.py +19 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_string.py +44 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_times.py +18 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_user.py +67 -0
- django_db_anonymiser/database_sanitizer/tests/test_session.py +36 -0
- django_db_anonymiser/database_sanitizer/tests/test_utils_mysql.py +112 -0
- django_db_anonymiser/database_sanitizer/tests/test_utils_postgres.py +86 -0
- django_db_anonymiser/database_sanitizer/utils/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/utils/mysql.py +161 -0
- django_db_anonymiser/database_sanitizer/utils/postgres.py +145 -0
- django_db_anonymiser/db_anonymiser/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/faker.py +91 -0
- django_db_anonymiser/db_anonymiser/management/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/management/commands/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/management/commands/dump_and_anonymise.py +105 -0
- django_db_anonymiser/db_anonymiser/tests/test_command.py +90 -0
- django_db_anonymiser/db_anonymiser/tests/test_faker.py +116 -0
- django_db_anonymiser-0.1.0.dist-info/METADATA +98 -0
- django_db_anonymiser-0.1.0.dist-info/RECORD +40 -0
- django_db_anonymiser-0.1.0.dist-info/WHEEL +4 -0
|
File without changes
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import codecs
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
import six
|
|
11
|
+
|
|
12
|
+
from .config import Configuration
|
|
13
|
+
from .dump import run
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main(argv=sys.argv):
    """
    Command line entry point of the database sanitizer.

    Parses the command line, optionally loads the sanitizer configuration
    and writes the sanitized database dump either into the file given with
    ``--output`` or into the standard output.

    :param argv: Command line arguments, the first item being the program
        name. Defaults to ``sys.argv``.
    :type argv: list[str]
    """
    # Local import: ``io`` is not among the module level imports.
    import io

    parser = argparse.ArgumentParser(
        prog=(argv[0] if argv else "database-sanitizer"),
        description="Sanitizes contents of databases.",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        dest="config",
        help="Path to the sanitizer configuration file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        dest="output",
        help=(
            "Path to the file where the sanitized database will be written "
            "into. If omitted, standard output will be used instead."
        ),
    )
    parser.add_argument(
        "url",
        help="Database URL to which to connect into and sanitize contents.",
    )

    args = parser.parse_args(args=argv[1:])

    config = None
    if args.config:
        # Make modules living next to the configuration file importable, so
        # that custom sanitizers can be placed beside it.
        conf_dir = os.path.realpath(os.path.dirname(args.config))
        sys.path.insert(0, conf_dir)
        config = Configuration.from_file(args.config)

    if args.output:
        # Always write UTF-8 instead of relying on the locale's preferred
        # encoding, which may be unable to represent the dumped text.
        output = io.open(args.output, "w", encoding="utf-8")
    else:
        output = sys.stdout
        if six.PY2:
            output = codecs.getwriter("utf-8")(output)

    try:
        run(
            url=args.url,
            output=output,
            config=config,
        )
    finally:
        # Only close what was opened here; standard output is left alone.
        if args.output:
            output.close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
|
|
7
|
+
import six
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
__all__ = ("Configuration", "ConfigurationError")
|
|
11
|
+
|
|
12
|
+
SKIP_ROWS_CONFIG_VALUE = "skip_rows"
|
|
13
|
+
MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
|
|
14
|
+
PG_DUMP_DEFAULT_PARAMETERS = []
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ConfigurationError(ValueError):
|
|
18
|
+
"""
|
|
19
|
+
Custom exception type used to indicate configuration file errors.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Configuration(object):
|
|
24
|
+
"""
|
|
25
|
+
Object representation of database sanitizer configuration, usually read
|
|
26
|
+
from a YAML file.
|
|
27
|
+
"""
|
|
28
|
+
def __init__(self):
|
|
29
|
+
self.sanitizers = {}
|
|
30
|
+
self.skip_rows_for_tables = []
|
|
31
|
+
self.addon_packages = []
|
|
32
|
+
self.mysqldump_params = []
|
|
33
|
+
self.pg_dump_params = []
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def from_file(cls, filename):
|
|
37
|
+
"""
|
|
38
|
+
Reads configuration from given path to a file in local file system and
|
|
39
|
+
returns parsed version of it.
|
|
40
|
+
|
|
41
|
+
:param filename: Path to the YAML file in local file system where the
|
|
42
|
+
configuration will be read from.
|
|
43
|
+
:type filename: str
|
|
44
|
+
|
|
45
|
+
:return: Configuration instance parsed from given configuration file.
|
|
46
|
+
:rtype: Configuration
|
|
47
|
+
"""
|
|
48
|
+
instance = cls()
|
|
49
|
+
|
|
50
|
+
with open(filename, "rb") as file_stream:
|
|
51
|
+
config_data = yaml.safe_load(file_stream)
|
|
52
|
+
|
|
53
|
+
instance.load(config_data)
|
|
54
|
+
|
|
55
|
+
return instance
|
|
56
|
+
|
|
57
|
+
def load(self, config_data):
|
|
58
|
+
"""
|
|
59
|
+
Loads sanitizers according to rulesets defined in given already parsed
|
|
60
|
+
configuration file.
|
|
61
|
+
|
|
62
|
+
:param config_data: Already parsed configuration data, as dictionary.
|
|
63
|
+
:type config_data: dict[str,any]
|
|
64
|
+
"""
|
|
65
|
+
if not isinstance(config_data, dict):
|
|
66
|
+
raise ConfigurationError(
|
|
67
|
+
"Configuration data is %s instead of dict." % (
|
|
68
|
+
type(config_data),
|
|
69
|
+
)
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
self.load_addon_packages(config_data)
|
|
73
|
+
self.load_sanitizers(config_data)
|
|
74
|
+
self.load_dump_extra_parameters(config_data)
|
|
75
|
+
|
|
76
|
+
def load_dump_extra_parameters(self, config_data):
|
|
77
|
+
"""
|
|
78
|
+
Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
|
|
79
|
+
parameters should be added to the mysqldump and/or pg_dump command call
|
|
80
|
+
when taking a dump.
|
|
81
|
+
|
|
82
|
+
:param config_data: Already parsed configuration data, as dictionary.
|
|
83
|
+
:type config_data: dict[str,any]
|
|
84
|
+
"""
|
|
85
|
+
section_config = config_data.get("config", {})
|
|
86
|
+
if not isinstance(section_config, dict):
|
|
87
|
+
raise ConfigurationError(
|
|
88
|
+
"'config' is %s instead of dict" % (
|
|
89
|
+
type(section_config),
|
|
90
|
+
),
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
section_extra_parameters = section_config.get("extra_parameters", {})
|
|
94
|
+
if not isinstance(section_extra_parameters, dict):
|
|
95
|
+
raise ConfigurationError(
|
|
96
|
+
"'config.extra_parameters' is %s instead of dict" % (
|
|
97
|
+
type(section_extra_parameters),
|
|
98
|
+
),
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
mysqldump_params = section_extra_parameters.get("mysqldump", MYSQLDUMP_DEFAULT_PARAMETERS)
|
|
102
|
+
if not isinstance(mysqldump_params, list):
|
|
103
|
+
raise ConfigurationError(
|
|
104
|
+
"'config.extra_parameters.mysqldump' is %s instead of list" % (
|
|
105
|
+
type(mysqldump_params),
|
|
106
|
+
),
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
pg_dump_params = section_extra_parameters.get("pg_dump", PG_DUMP_DEFAULT_PARAMETERS)
|
|
110
|
+
if not isinstance(pg_dump_params, list):
|
|
111
|
+
raise ConfigurationError(
|
|
112
|
+
"'config.extra_parameters.pg_dump' is %s instead of list" % (
|
|
113
|
+
type(pg_dump_params),
|
|
114
|
+
),
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
self.mysqldump_params = mysqldump_params
|
|
118
|
+
self.pg_dump_params = pg_dump_params
|
|
119
|
+
|
|
120
|
+
def load_addon_packages(self, config_data):
|
|
121
|
+
"""
|
|
122
|
+
Loads the module paths from which the configuration will attempt to
|
|
123
|
+
load sanitizers from. These must be stored as a list of strings under
|
|
124
|
+
"config.addons" section of the configuration data.
|
|
125
|
+
|
|
126
|
+
:param config_data: Already parsed configuration data, as dictionary.
|
|
127
|
+
:type config_data: dict[str,any]
|
|
128
|
+
"""
|
|
129
|
+
section_config = config_data.get("config")
|
|
130
|
+
if not isinstance(section_config, dict):
|
|
131
|
+
if section_config is None:
|
|
132
|
+
return
|
|
133
|
+
raise ConfigurationError(
|
|
134
|
+
"'config' is %s instead of dict" % (
|
|
135
|
+
type(section_config),
|
|
136
|
+
),
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
section_addons = section_config.get("addons", [])
|
|
140
|
+
if not isinstance(section_addons, list):
|
|
141
|
+
raise ConfigurationError(
|
|
142
|
+
"'config.addons' is %s instead of list" % (
|
|
143
|
+
type(section_addons),
|
|
144
|
+
),
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
for index, module_path in enumerate(section_addons):
|
|
148
|
+
if not isinstance(module_path, str):
|
|
149
|
+
raise ConfigurationError(
|
|
150
|
+
"Item %d in 'config.addons' is %s instead of string" % (
|
|
151
|
+
index,
|
|
152
|
+
type(module_path),
|
|
153
|
+
),
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
self.addon_packages = list(section_addons)
|
|
157
|
+
|
|
158
|
+
def load_sanitizers(self, config_data):
|
|
159
|
+
"""
|
|
160
|
+
Loads sanitizers possibly defined in the configuration under dictionary
|
|
161
|
+
called "strategy", which should contain mapping of database tables with
|
|
162
|
+
column names mapped into sanitizer function names.
|
|
163
|
+
|
|
164
|
+
:param config_data: Already parsed configuration data, as dictionary.
|
|
165
|
+
:type config_data: dict[str,any]
|
|
166
|
+
"""
|
|
167
|
+
section_strategy = config_data.get("strategy")
|
|
168
|
+
if not isinstance(section_strategy, dict):
|
|
169
|
+
if section_strategy is None:
|
|
170
|
+
return
|
|
171
|
+
if section_strategy != SKIP_ROWS_CONFIG_VALUE:
|
|
172
|
+
raise ConfigurationError(
|
|
173
|
+
"'strategy' is %s instead of dict" % (
|
|
174
|
+
type(section_strategy),
|
|
175
|
+
),
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
for table_name, column_data in six.iteritems(section_strategy):
|
|
179
|
+
if column_data == SKIP_ROWS_CONFIG_VALUE:
|
|
180
|
+
self.skip_rows_for_tables.append(table_name)
|
|
181
|
+
continue
|
|
182
|
+
|
|
183
|
+
if not isinstance(column_data, dict):
|
|
184
|
+
if column_data is None:
|
|
185
|
+
continue
|
|
186
|
+
raise ConfigurationError(
|
|
187
|
+
"'strategy.%s' is %s instead of dict" % (
|
|
188
|
+
table_name,
|
|
189
|
+
type(column_data),
|
|
190
|
+
),
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
for column_name, sanitizer_name in six.iteritems(column_data):
|
|
194
|
+
if sanitizer_name is None:
|
|
195
|
+
continue
|
|
196
|
+
|
|
197
|
+
if not isinstance(sanitizer_name, str):
|
|
198
|
+
raise ConfigurationError(
|
|
199
|
+
"'strategy.%s.%s' is %s instead of string" % (
|
|
200
|
+
table_name,
|
|
201
|
+
column_name,
|
|
202
|
+
type(sanitizer_name),
|
|
203
|
+
),
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
sanitizer_callback = self.find_sanitizer(sanitizer_name)
|
|
207
|
+
sanitizer_key = "%s.%s" % (table_name, column_name)
|
|
208
|
+
self.sanitizers[sanitizer_key] = sanitizer_callback
|
|
209
|
+
|
|
210
|
+
def find_sanitizer(self, name):
|
|
211
|
+
"""
|
|
212
|
+
Searches for a sanitizer function with given name. The name should
|
|
213
|
+
contain two parts separated from each other with a dot, the first
|
|
214
|
+
part being the module name while the second being name of the function
|
|
215
|
+
contained in the module, when it's being prefixed with "sanitize_".
|
|
216
|
+
|
|
217
|
+
The lookup process consists from three attempts, which are:
|
|
218
|
+
|
|
219
|
+
1. First package to look the module will be top level package called
|
|
220
|
+
"sanitizers".
|
|
221
|
+
2. Module will be looked under the "addon" packages, if they have been
|
|
222
|
+
defined.
|
|
223
|
+
3. Finally the sanitation function will be looked from the builtin
|
|
224
|
+
sanitizers located in "database_sanitizer.sanitizers" package.
|
|
225
|
+
|
|
226
|
+
If none of these provide any results, ConfigurationError will be
|
|
227
|
+
thrown.
|
|
228
|
+
|
|
229
|
+
:param name: "Full name" of the sanitation function containing name
|
|
230
|
+
of the module as well as name of the function.
|
|
231
|
+
:type name: str
|
|
232
|
+
|
|
233
|
+
:return: First function which can be imported with the given name.
|
|
234
|
+
:rtype: callable
|
|
235
|
+
"""
|
|
236
|
+
# Split the sanitizer name into two parts, one containing the Python
|
|
237
|
+
# module name, while second containing portion of the function name
|
|
238
|
+
# we are looking for.
|
|
239
|
+
name_parts = name.split(".")
|
|
240
|
+
if len(name_parts) < 2:
|
|
241
|
+
raise ConfigurationError(
|
|
242
|
+
"Unable to separate module name from function name in '%s'" % (
|
|
243
|
+
name,
|
|
244
|
+
),
|
|
245
|
+
)
|
|
246
|
+
|
|
247
|
+
module_name_suffix = ".".join(name_parts[:-1])
|
|
248
|
+
function_name = "sanitize_%s" % (name_parts[-1],)
|
|
249
|
+
|
|
250
|
+
# Phase 1: Look for custom sanitizer under a top level package called
|
|
251
|
+
# "sanitizers".
|
|
252
|
+
module_name = "sanitizers.%s" % (module_name_suffix,)
|
|
253
|
+
callback = self.find_sanitizer_from_module(
|
|
254
|
+
module_name=module_name,
|
|
255
|
+
function_name=function_name,
|
|
256
|
+
)
|
|
257
|
+
if callback:
|
|
258
|
+
return callback
|
|
259
|
+
|
|
260
|
+
# Phase 2: Look for the sanitizer under "addon" packages, if any of
|
|
261
|
+
# such have been defined.
|
|
262
|
+
for addon_package_name in self.addon_packages:
|
|
263
|
+
module_name = "%s.%s" % (
|
|
264
|
+
addon_package_name,
|
|
265
|
+
module_name_suffix,
|
|
266
|
+
)
|
|
267
|
+
callback = self.find_sanitizer_from_module(
|
|
268
|
+
module_name=module_name,
|
|
269
|
+
function_name=function_name,
|
|
270
|
+
)
|
|
271
|
+
if callback:
|
|
272
|
+
return callback
|
|
273
|
+
|
|
274
|
+
# Phase 3: Look from builtin sanitizers.
|
|
275
|
+
module_name = "database_sanitizer.sanitizers.%s" % (module_name_suffix,)
|
|
276
|
+
callback = self.find_sanitizer_from_module(
|
|
277
|
+
module_name=module_name,
|
|
278
|
+
function_name=function_name,
|
|
279
|
+
)
|
|
280
|
+
if callback:
|
|
281
|
+
return callback
|
|
282
|
+
|
|
283
|
+
# Give up.
|
|
284
|
+
raise ConfigurationError("Unable to find sanitizer called '%s'" % (
|
|
285
|
+
name,
|
|
286
|
+
))
|
|
287
|
+
|
|
288
|
+
@staticmethod
|
|
289
|
+
def find_sanitizer_from_module(module_name, function_name):
|
|
290
|
+
"""
|
|
291
|
+
Attempts to find sanitizer function from given module. If the module
|
|
292
|
+
cannot be imported, or function with given name does not exist in it,
|
|
293
|
+
nothing will be returned by this method. Otherwise the found sanitizer
|
|
294
|
+
function will be returned.
|
|
295
|
+
|
|
296
|
+
:param module_name: Name of the module to import the function from.
|
|
297
|
+
:type module_name: str
|
|
298
|
+
|
|
299
|
+
:param function_name: Name of the function to look for inside the
|
|
300
|
+
module.
|
|
301
|
+
:type function_name: str
|
|
302
|
+
|
|
303
|
+
:return: Sanitizer function found from the module, if it can be
|
|
304
|
+
imported and it indeed contains function with the given name.
|
|
305
|
+
Otherwise None will be returned instead.
|
|
306
|
+
:rtype: callback|None
|
|
307
|
+
"""
|
|
308
|
+
try:
|
|
309
|
+
module = importlib.import_module(module_name)
|
|
310
|
+
except ImportError:
|
|
311
|
+
return None
|
|
312
|
+
|
|
313
|
+
# Look for the function inside the module. At this point it could be
|
|
314
|
+
# pretty much anything.
|
|
315
|
+
callback = getattr(module, function_name, None)
|
|
316
|
+
|
|
317
|
+
# Function does not exist in this module? Give up.
|
|
318
|
+
if callback is None:
|
|
319
|
+
return None
|
|
320
|
+
|
|
321
|
+
# It's actually callable function? Return it.
|
|
322
|
+
if callable(callback):
|
|
323
|
+
return callback
|
|
324
|
+
|
|
325
|
+
# Sanitizer seems to be something else than a function. Throw an
|
|
326
|
+
# exception to report such problem.
|
|
327
|
+
raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
|
|
328
|
+
function_name,
|
|
329
|
+
module_name,
|
|
330
|
+
type(callback),
|
|
331
|
+
))
|
|
332
|
+
|
|
333
|
+
def get_sanitizer_for(self, table_name, column_name):
|
|
334
|
+
"""
|
|
335
|
+
Get sanitizer for given table and column name.
|
|
336
|
+
|
|
337
|
+
:param table_name: Name of the database table.
|
|
338
|
+
:type table_name: str
|
|
339
|
+
|
|
340
|
+
:param column_name: Name of the database column.
|
|
341
|
+
:type column_name: str
|
|
342
|
+
|
|
343
|
+
:return: Sanitizer function or None if nothing is configured
|
|
344
|
+
:rtype: Optional[Callable[[Optional[str]], Optional[str]]]
|
|
345
|
+
"""
|
|
346
|
+
sanitizer_key = "%s.%s" % (table_name, column_name)
|
|
347
|
+
return self.sanitizers.get(sanitizer_key)
|
|
348
|
+
|
|
349
|
+
def sanitize(self, table_name, column_name, value):
|
|
350
|
+
"""
|
|
351
|
+
Sanitizes given value extracted from the database according to the
|
|
352
|
+
sanitation configuration.
|
|
353
|
+
|
|
354
|
+
TODO: Add support for dates, booleans and other types found in SQL than
|
|
355
|
+
string.
|
|
356
|
+
|
|
357
|
+
:param table_name: Name of the database table from which the value is
|
|
358
|
+
from.
|
|
359
|
+
:type table_name: str
|
|
360
|
+
|
|
361
|
+
:param column_name: Name of the database column from which the value is
|
|
362
|
+
from.
|
|
363
|
+
:type column_name: str
|
|
364
|
+
|
|
365
|
+
:param value: Value from the database, either in text form or None if
|
|
366
|
+
the value is null.
|
|
367
|
+
:type value: str|None
|
|
368
|
+
|
|
369
|
+
:return: Sanitized version of the given value.
|
|
370
|
+
:rtype: str|None
|
|
371
|
+
"""
|
|
372
|
+
sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
|
|
373
|
+
return sanitizer_callback(value) if sanitizer_callback else value
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
|
|
7
|
+
from six.moves.urllib import parse as urlparse
|
|
8
|
+
|
|
9
|
+
from .. import session
|
|
10
|
+
|
|
11
|
+
#: Maps each supported database URL scheme into the path of the module which
#: is imported for sanitizing that kind of database.
SUPPORTED_DATABASE_MODULES = {
    "mysql": "django_db_anonymiser.database_sanitizer.dump.mysql",
    "postgres": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgresql": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgis": "django_db_anonymiser.database_sanitizer.dump.postgres",
}


# Add every supported scheme into the URL parser's ``uses_netloc`` list.
for scheme in SUPPORTED_DATABASE_MODULES:
    urlparse.uses_netloc.append(scheme)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run(url, output, config):
    """
    Dumps the database behind the given URL and writes a sanitized copy of
    the dump, one line at a time, into the given stream.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
                   written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the scheme of the URL is not a supported one.
    """
    target = urlparse.urlparse(url)

    # The URL scheme decides which dump module does the actual work.
    module_path = SUPPORTED_DATABASE_MODULES.get(target.scheme)
    if not module_path:
        raise ValueError(
            "Unsupported database scheme: '%s'" % (target.scheme,)
        )
    dump_module = importlib.import_module(module_path)

    # The session is reset before each run.
    session.reset()

    sanitized_lines = dump_module.sanitize(url=target, config=config)
    for sanitized_line in sanitized_lines:
        output.write(sanitized_line + "\n")
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import codecs
|
|
6
|
+
import re
|
|
7
|
+
import subprocess
|
|
8
|
+
import io
|
|
9
|
+
|
|
10
|
+
from ..utils.mysql import (
|
|
11
|
+
decode_mysql_literal,
|
|
12
|
+
encode_mysql_literal,
|
|
13
|
+
get_mysqldump_args_and_env_from_url,
|
|
14
|
+
)
|
|
15
|
+
from ..config import MYSQLDUMP_DEFAULT_PARAMETERS
|
|
16
|
+
|
|
17
|
+
#: Matches the `INSERT INTO` statements written by the `mysqldump` utility,
#: including the ones produced when extended inserts have been enabled. The
#: table name, the column list and the value list are captured as the named
#: groups `table`, `columns` and `values`.
INSERT_INTO_PATTERN = re.compile(
    r"^INSERT INTO `(?P<table>[^`]*)`"
    r" \((?P<columns>.*)\)"
    r" VALUES (?P<values>.*);$"
)


#: Matches a single MySQL literal (group 1) together with the comma or the
#: closing parenthesis which follows it (group 2).
VALUE_PATTERN = re.compile(
    r"""
    # Group 1:
    (
      '(?:[^']|''|\\')*(?<![\\])' # String literal
      |                           # or...
      [^',()]+                    # NULL, TRUE, etc.
    )
    # Group 2:
    (
      [,)]                        # Comma or closing parenthesis.
    )
    """,
    re.VERBOSE,
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def sanitize(url, config):
    """
    Obtains dump of MySQL database by executing `mysqldump` command and
    sanitizes its output.

    The `mysqldump` process is started right away, while its output is read
    and sanitized lazily as the returned generator is consumed.

    :param url: URL to the database which is going to be sanitized, parsed by
                Python's URL parser.
    :type url: urllib.urlparse.ParseResult

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :return: Generator yielding the sanitized lines of the dump. Once the
             dump has been read through, the generator raises
             `subprocess.CalledProcessError` if `mysqldump` exited with a
             non-zero status.
    :rtype: generator

    :raises ValueError: If the scheme of the URL is not "mysql".
    """
    if url.scheme != "mysql":
        raise ValueError("Unsupported database type: '%s'" % (url.scheme,))

    args, env = get_mysqldump_args_and_env_from_url(url=url)

    extra_params = MYSQLDUMP_DEFAULT_PARAMETERS
    if config:
        extra_params = config.mysqldump_params

    process = subprocess.Popen(
        args=["mysqldump"] + args + extra_params,
        env=env,
        stdout=subprocess.PIPE,
    )

    return _sanitize_process_output(process=process, config=config)


def _sanitize_process_output(process, config):
    """
    Yields sanitized lines from the output of the given dump process and
    makes sure that the process gets reaped and that it exited successfully.
    """
    try:
        for line in sanitize_from_stream(stream=process.stdout, config=config):
            yield line
    finally:
        # Close the pipe before waiting: if reading was stopped early the
        # process could otherwise block forever while writing its output.
        process.stdout.close()
        return_code = process.wait()

    # Reached only when the whole dump was read. A failed dump must not be
    # mistaken for a complete one.
    if return_code:
        raise subprocess.CalledProcessError(return_code, "mysqldump")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def sanitize_from_stream(stream, config):
    """
    Reads a MySQL database dump from the given stream line by line and
    yields the sanitized version of each line.

    Only `INSERT INTO` statements of tables with configured sanitizers are
    rewritten. Statements of tables configured to have their rows skipped
    are dropped, and every other line is yielded unchanged.

    :param stream: Stream where the database dump is expected to be available
                   from, such as stdout of `mysqldump` process.
    :type stream: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If a row has different amount of values than the
        statement has column names.
    """
    for raw_line in io.TextIOWrapper(stream, encoding="utf-8"):
        # Eat the trailing new line.
        line = raw_line.rstrip("\n")

        # Without configuration there are no sanitizers available, and lines
        # other than `INSERT INTO` statements never need any sanitation.
        statement = INSERT_INTO_PATTERN.match(line) if config else None
        if not statement:
            yield line
            continue

        table_name = statement.group("table")
        column_names = parse_column_names(statement.group("columns"))

        # Drop the whole statement if rows of the table are to be skipped.
        if table_name in config.skip_rows_for_tables:
            continue

        # Sanitizers of this table keyed by the index of their column, so
        # that they can be found by the position of a value later.
        lookups = (
            (
                index,
                config.get_sanitizer_for(
                    table_name=table_name,
                    column_name=column_name,
                ),
            )
            for index, column_name in enumerate(column_names)
        )
        sanitizers = {
            index: sanitizer for index, sanitizer in lookups if sanitizer
        }

        # Nothing to sanitize in this table: the line is used as-is.
        if not sanitizers:
            yield line
            continue

        # Sanitize and encode the values of each row of the statement.
        rows = []
        for values in parse_values(statement.group("values")):
            if len(values) != len(column_names):
                raise ValueError("Mismatch between column names and values")
            rows.append([
                encode_mysql_literal(
                    sanitizers[index](value) if index in sanitizers else value
                )
                for index, value in enumerate(values)
            ])

        # Finally create new `INSERT INTO` statement from the sanitized rows.
        quoted_columns = ", ".join(
            "`" + column_name + "`" for column_name in column_names
        )
        encoded_rows = ",".join("(" + ",".join(row) + ")" for row in rows)
        yield "INSERT INTO `%s` (%s) VALUES %s;" % (
            table_name,
            quoted_columns,
            encoded_rows,
        )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def parse_column_names(text):
    """
    Splits the comma separated column list of an `INSERT INTO` statement
    into plain column names.

    :param text: Line extracted from MySQL's `INSERT INTO` statement containing
                 quoted and comma separated column names.
    :type text: str

    :return: Tuple containing just the column names.
    :rtype: tuple[str]
    """
    names = []
    for column_data in text.split(","):
        # Drop the surrounding whitespace first and then the backticks
        # enclosing the name, if there are any.
        names.append(re.sub(r"^`(.*)`$", r"\1", column_data.strip()))
    return tuple(names)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def parse_values(text):
    """
    Parses the rows out of the value list of an extended format `INSERT
    INTO` statement. One tuple of decoded values is yielded per row.

    Parsing stops silently at the first position where no literal can be
    matched; values collected for an unfinished row are not yielded.

    :param text: Text extracted from MySQL's `INSERT INTO` statement containing
                 quoted and comma separated column values.
    :type text: str
    """
    assert text.startswith("(")
    # Start right after the opening parenthesis of the first row.
    cursor = 1
    end = len(text)
    row = []
    while cursor < end:
        found = VALUE_PATTERN.match(text, cursor)
        if found is None:
            break
        literal, terminator = found.groups()
        row.append(decode_mysql_literal(literal.strip()))
        # Move past the literal and the terminator following it.
        cursor += len(literal) + 1
        if terminator == ")":
            # Skip comma and open parenthesis ",("
            cursor += 2
            yield tuple(row)
            row = []
|