django-db-anonymiser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. django_db_anonymiser/database_sanitizer/__init__.py +0 -0
  2. django_db_anonymiser/database_sanitizer/__main__.py +68 -0
  3. django_db_anonymiser/database_sanitizer/config.py +373 -0
  4. django_db_anonymiser/database_sanitizer/dump/__init__.py +47 -0
  5. django_db_anonymiser/database_sanitizer/dump/mysql.py +196 -0
  6. django_db_anonymiser/database_sanitizer/dump/postgres.py +170 -0
  7. django_db_anonymiser/database_sanitizer/sanitizers/__init__.py +0 -0
  8. django_db_anonymiser/database_sanitizer/sanitizers/constant.py +14 -0
  9. django_db_anonymiser/database_sanitizer/sanitizers/derived.py +14 -0
  10. django_db_anonymiser/database_sanitizer/sanitizers/string.py +31 -0
  11. django_db_anonymiser/database_sanitizer/sanitizers/times.py +11 -0
  12. django_db_anonymiser/database_sanitizer/sanitizers/user.py +145 -0
  13. django_db_anonymiser/database_sanitizer/session.py +146 -0
  14. django_db_anonymiser/database_sanitizer/tests/__init__.py +0 -0
  15. django_db_anonymiser/database_sanitizer/tests/test_config.py +256 -0
  16. django_db_anonymiser/database_sanitizer/tests/test_dump.py +123 -0
  17. django_db_anonymiser/database_sanitizer/tests/test_dump_mysql.py +196 -0
  18. django_db_anonymiser/database_sanitizer/tests/test_dump_postgres.py +177 -0
  19. django_db_anonymiser/database_sanitizer/tests/test_main.py +91 -0
  20. django_db_anonymiser/database_sanitizer/tests/test_sanitizers_constant.py +29 -0
  21. django_db_anonymiser/database_sanitizer/tests/test_sanitizers_derived.py +19 -0
  22. django_db_anonymiser/database_sanitizer/tests/test_sanitizers_string.py +44 -0
  23. django_db_anonymiser/database_sanitizer/tests/test_sanitizers_times.py +18 -0
  24. django_db_anonymiser/database_sanitizer/tests/test_sanitizers_user.py +67 -0
  25. django_db_anonymiser/database_sanitizer/tests/test_session.py +36 -0
  26. django_db_anonymiser/database_sanitizer/tests/test_utils_mysql.py +112 -0
  27. django_db_anonymiser/database_sanitizer/tests/test_utils_postgres.py +86 -0
  28. django_db_anonymiser/database_sanitizer/utils/__init__.py +0 -0
  29. django_db_anonymiser/database_sanitizer/utils/mysql.py +161 -0
  30. django_db_anonymiser/database_sanitizer/utils/postgres.py +145 -0
  31. django_db_anonymiser/db_anonymiser/__init__.py +0 -0
  32. django_db_anonymiser/db_anonymiser/faker.py +91 -0
  33. django_db_anonymiser/db_anonymiser/management/__init__.py +0 -0
  34. django_db_anonymiser/db_anonymiser/management/commands/__init__.py +0 -0
  35. django_db_anonymiser/db_anonymiser/management/commands/dump_and_anonymise.py +105 -0
  36. django_db_anonymiser/db_anonymiser/tests/test_command.py +90 -0
  37. django_db_anonymiser/db_anonymiser/tests/test_faker.py +116 -0
  38. django_db_anonymiser-0.1.0.dist-info/METADATA +98 -0
  39. django_db_anonymiser-0.1.0.dist-info/RECORD +40 -0
  40. django_db_anonymiser-0.1.0.dist-info/WHEEL +4 -0
File without changes
@@ -0,0 +1,68 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import argparse
6
+ import codecs
7
+ import os
8
+ import sys
9
+
10
+ import six
11
+
12
+ from .config import Configuration
13
+ from .dump import run
14
+
15
+
16
def main(argv=sys.argv):
    """
    Command line entry point: dumps a database and writes a sanitized copy.

    :param argv: Full argument vector, including the program name as the
        first item. Defaults to the process' own ``sys.argv``.
    :type argv: list[str]
    """
    parser = argparse.ArgumentParser(
        prog=(argv[0] if argv else "database-sanitizer"),
        description="Sanitizes contents of databases.",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        dest="config",
        help="Path to the sanitizer configuration file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        dest="output",
        help=(
            "Path to the file where the sanitized database will be written "
            "into. If omitted, standard output will be used instead."
        ),
    )
    parser.add_argument(
        "url",
        help="Database URL to which to connect into and sanitize contents.",
    )

    args = parser.parse_args(args=argv[1:])

    config = None
    if args.config:
        # Make modules that live next to the configuration file importable,
        # so custom sanitizers can be resolved while the file is loaded.
        conf_dir = os.path.realpath(os.path.dirname(args.config))
        sys.path.insert(0, conf_dir)
        config = Configuration.from_file(args.config)

    if args.output:
        # Local import keeps this block self-contained; ``io.open`` accepts
        # an explicit encoding on both Python 2 and Python 3.
        import io

        # Write UTF-8 explicitly instead of relying on the locale's default
        # encoding, which may be unable to represent the dumped text.
        with io.open(args.output, "w", encoding="utf-8") as output:
            run(url=args.url, output=output, config=config)
        return

    output = sys.stdout
    if six.PY2:
        output = codecs.getwriter("utf-8")(output)
    run(url=args.url, output=output, config=config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,373 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import importlib
6
+
7
+ import six
8
+ import yaml
9
+
10
+ __all__ = ("Configuration", "ConfigurationError")
11
+
12
# Value which, when given for a table under "strategy", makes the table's
# rows be skipped instead of being sanitized column by column.
SKIP_ROWS_CONFIG_VALUE = "skip_rows"

# Extra command line parameters used when the configuration does not define
# "config.extra_parameters.mysqldump" / "config.extra_parameters.pg_dump".
MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
PG_DUMP_DEFAULT_PARAMETERS = []
15
+
16
+
17
+ class ConfigurationError(ValueError):
18
+ """
19
+ Custom exception type used to indicate configuration file errors.
20
+ """
21
+
22
+
23
+ class Configuration(object):
24
+ """
25
+ Object representation of database sanitizer configuration, usually read
26
+ from a YAML file.
27
+ """
28
+ def __init__(self):
29
+ self.sanitizers = {}
30
+ self.skip_rows_for_tables = []
31
+ self.addon_packages = []
32
+ self.mysqldump_params = []
33
+ self.pg_dump_params = []
34
+
35
+ @classmethod
36
+ def from_file(cls, filename):
37
+ """
38
+ Reads configuration from given path to a file in local file system and
39
+ returns parsed version of it.
40
+
41
+ :param filename: Path to the YAML file in local file system where the
42
+ configuration will be read from.
43
+ :type filename: str
44
+
45
+ :return: Configuration instance parsed from given configuration file.
46
+ :rtype: Configuration
47
+ """
48
+ instance = cls()
49
+
50
+ with open(filename, "rb") as file_stream:
51
+ config_data = yaml.safe_load(file_stream)
52
+
53
+ instance.load(config_data)
54
+
55
+ return instance
56
+
57
+ def load(self, config_data):
58
+ """
59
+ Loads sanitizers according to rulesets defined in given already parsed
60
+ configuration file.
61
+
62
+ :param config_data: Already parsed configuration data, as dictionary.
63
+ :type config_data: dict[str,any]
64
+ """
65
+ if not isinstance(config_data, dict):
66
+ raise ConfigurationError(
67
+ "Configuration data is %s instead of dict." % (
68
+ type(config_data),
69
+ )
70
+ )
71
+
72
+ self.load_addon_packages(config_data)
73
+ self.load_sanitizers(config_data)
74
+ self.load_dump_extra_parameters(config_data)
75
+
76
+ def load_dump_extra_parameters(self, config_data):
77
+ """
78
+ Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
79
+ parameters should be added to the mysqldump and/or pg_dump command call
80
+ when taking a dump.
81
+
82
+ :param config_data: Already parsed configuration data, as dictionary.
83
+ :type config_data: dict[str,any]
84
+ """
85
+ section_config = config_data.get("config", {})
86
+ if not isinstance(section_config, dict):
87
+ raise ConfigurationError(
88
+ "'config' is %s instead of dict" % (
89
+ type(section_config),
90
+ ),
91
+ )
92
+
93
+ section_extra_parameters = section_config.get("extra_parameters", {})
94
+ if not isinstance(section_extra_parameters, dict):
95
+ raise ConfigurationError(
96
+ "'config.extra_parameters' is %s instead of dict" % (
97
+ type(section_extra_parameters),
98
+ ),
99
+ )
100
+
101
+ mysqldump_params = section_extra_parameters.get("mysqldump", MYSQLDUMP_DEFAULT_PARAMETERS)
102
+ if not isinstance(mysqldump_params, list):
103
+ raise ConfigurationError(
104
+ "'config.extra_parameters.mysqldump' is %s instead of list" % (
105
+ type(mysqldump_params),
106
+ ),
107
+ )
108
+
109
+ pg_dump_params = section_extra_parameters.get("pg_dump", PG_DUMP_DEFAULT_PARAMETERS)
110
+ if not isinstance(pg_dump_params, list):
111
+ raise ConfigurationError(
112
+ "'config.extra_parameters.pg_dump' is %s instead of list" % (
113
+ type(pg_dump_params),
114
+ ),
115
+ )
116
+
117
+ self.mysqldump_params = mysqldump_params
118
+ self.pg_dump_params = pg_dump_params
119
+
120
+ def load_addon_packages(self, config_data):
121
+ """
122
+ Loads the module paths from which the configuration will attempt to
123
+ load sanitizers from. These must be stored as a list of strings under
124
+ "config.addons" section of the configuration data.
125
+
126
+ :param config_data: Already parsed configuration data, as dictionary.
127
+ :type config_data: dict[str,any]
128
+ """
129
+ section_config = config_data.get("config")
130
+ if not isinstance(section_config, dict):
131
+ if section_config is None:
132
+ return
133
+ raise ConfigurationError(
134
+ "'config' is %s instead of dict" % (
135
+ type(section_config),
136
+ ),
137
+ )
138
+
139
+ section_addons = section_config.get("addons", [])
140
+ if not isinstance(section_addons, list):
141
+ raise ConfigurationError(
142
+ "'config.addons' is %s instead of list" % (
143
+ type(section_addons),
144
+ ),
145
+ )
146
+
147
+ for index, module_path in enumerate(section_addons):
148
+ if not isinstance(module_path, str):
149
+ raise ConfigurationError(
150
+ "Item %d in 'config.addons' is %s instead of string" % (
151
+ index,
152
+ type(module_path),
153
+ ),
154
+ )
155
+
156
+ self.addon_packages = list(section_addons)
157
+
158
+ def load_sanitizers(self, config_data):
159
+ """
160
+ Loads sanitizers possibly defined in the configuration under dictionary
161
+ called "strategy", which should contain mapping of database tables with
162
+ column names mapped into sanitizer function names.
163
+
164
+ :param config_data: Already parsed configuration data, as dictionary.
165
+ :type config_data: dict[str,any]
166
+ """
167
+ section_strategy = config_data.get("strategy")
168
+ if not isinstance(section_strategy, dict):
169
+ if section_strategy is None:
170
+ return
171
+ if section_strategy != SKIP_ROWS_CONFIG_VALUE:
172
+ raise ConfigurationError(
173
+ "'strategy' is %s instead of dict" % (
174
+ type(section_strategy),
175
+ ),
176
+ )
177
+
178
+ for table_name, column_data in six.iteritems(section_strategy):
179
+ if column_data == SKIP_ROWS_CONFIG_VALUE:
180
+ self.skip_rows_for_tables.append(table_name)
181
+ continue
182
+
183
+ if not isinstance(column_data, dict):
184
+ if column_data is None:
185
+ continue
186
+ raise ConfigurationError(
187
+ "'strategy.%s' is %s instead of dict" % (
188
+ table_name,
189
+ type(column_data),
190
+ ),
191
+ )
192
+
193
+ for column_name, sanitizer_name in six.iteritems(column_data):
194
+ if sanitizer_name is None:
195
+ continue
196
+
197
+ if not isinstance(sanitizer_name, str):
198
+ raise ConfigurationError(
199
+ "'strategy.%s.%s' is %s instead of string" % (
200
+ table_name,
201
+ column_name,
202
+ type(sanitizer_name),
203
+ ),
204
+ )
205
+
206
+ sanitizer_callback = self.find_sanitizer(sanitizer_name)
207
+ sanitizer_key = "%s.%s" % (table_name, column_name)
208
+ self.sanitizers[sanitizer_key] = sanitizer_callback
209
+
210
+ def find_sanitizer(self, name):
211
+ """
212
+ Searches for a sanitizer function with given name. The name should
213
+ contain two parts separated from each other with a dot, the first
214
+ part being the module name while the second being name of the function
215
+ contained in the module, when it's being prefixed with "sanitize_".
216
+
217
+ The lookup process consists from three attempts, which are:
218
+
219
+ 1. First package to look the module will be top level package called
220
+ "sanitizers".
221
+ 2. Module will be looked under the "addon" packages, if they have been
222
+ defined.
223
+ 3. Finally the sanitation function will be looked from the builtin
224
+ sanitizers located in "database_sanitizer.sanitizers" package.
225
+
226
+ If none of these provide any results, ConfigurationError will be
227
+ thrown.
228
+
229
+ :param name: "Full name" of the sanitation function containing name
230
+ of the module as well as name of the function.
231
+ :type name: str
232
+
233
+ :return: First function which can be imported with the given name.
234
+ :rtype: callable
235
+ """
236
+ # Split the sanitizer name into two parts, one containing the Python
237
+ # module name, while second containing portion of the function name
238
+ # we are looking for.
239
+ name_parts = name.split(".")
240
+ if len(name_parts) < 2:
241
+ raise ConfigurationError(
242
+ "Unable to separate module name from function name in '%s'" % (
243
+ name,
244
+ ),
245
+ )
246
+
247
+ module_name_suffix = ".".join(name_parts[:-1])
248
+ function_name = "sanitize_%s" % (name_parts[-1],)
249
+
250
+ # Phase 1: Look for custom sanitizer under a top level package called
251
+ # "sanitizers".
252
+ module_name = "sanitizers.%s" % (module_name_suffix,)
253
+ callback = self.find_sanitizer_from_module(
254
+ module_name=module_name,
255
+ function_name=function_name,
256
+ )
257
+ if callback:
258
+ return callback
259
+
260
+ # Phase 2: Look for the sanitizer under "addon" packages, if any of
261
+ # such have been defined.
262
+ for addon_package_name in self.addon_packages:
263
+ module_name = "%s.%s" % (
264
+ addon_package_name,
265
+ module_name_suffix,
266
+ )
267
+ callback = self.find_sanitizer_from_module(
268
+ module_name=module_name,
269
+ function_name=function_name,
270
+ )
271
+ if callback:
272
+ return callback
273
+
274
+ # Phase 3: Look from builtin sanitizers.
275
+ module_name = "database_sanitizer.sanitizers.%s" % (module_name_suffix,)
276
+ callback = self.find_sanitizer_from_module(
277
+ module_name=module_name,
278
+ function_name=function_name,
279
+ )
280
+ if callback:
281
+ return callback
282
+
283
+ # Give up.
284
+ raise ConfigurationError("Unable to find sanitizer called '%s'" % (
285
+ name,
286
+ ))
287
+
288
+ @staticmethod
289
+ def find_sanitizer_from_module(module_name, function_name):
290
+ """
291
+ Attempts to find sanitizer function from given module. If the module
292
+ cannot be imported, or function with given name does not exist in it,
293
+ nothing will be returned by this method. Otherwise the found sanitizer
294
+ function will be returned.
295
+
296
+ :param module_name: Name of the module to import the function from.
297
+ :type module_name: str
298
+
299
+ :param function_name: Name of the function to look for inside the
300
+ module.
301
+ :type function_name: str
302
+
303
+ :return: Sanitizer function found from the module, if it can be
304
+ imported and it indeed contains function with the given name.
305
+ Otherwise None will be returned instead.
306
+ :rtype: callback|None
307
+ """
308
+ try:
309
+ module = importlib.import_module(module_name)
310
+ except ImportError:
311
+ return None
312
+
313
+ # Look for the function inside the module. At this point it could be
314
+ # pretty much anything.
315
+ callback = getattr(module, function_name, None)
316
+
317
+ # Function does not exist in this module? Give up.
318
+ if callback is None:
319
+ return None
320
+
321
+ # It's actually callable function? Return it.
322
+ if callable(callback):
323
+ return callback
324
+
325
+ # Sanitizer seems to be something else than a function. Throw an
326
+ # exception to report such problem.
327
+ raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
328
+ function_name,
329
+ module_name,
330
+ type(callback),
331
+ ))
332
+
333
+ def get_sanitizer_for(self, table_name, column_name):
334
+ """
335
+ Get sanitizer for given table and column name.
336
+
337
+ :param table_name: Name of the database table.
338
+ :type table_name: str
339
+
340
+ :param column_name: Name of the database column.
341
+ :type column_name: str
342
+
343
+ :return: Sanitizer function or None if nothing is configured
344
+ :rtype: Optional[Callable[[Optional[str]], Optional[str]]]
345
+ """
346
+ sanitizer_key = "%s.%s" % (table_name, column_name)
347
+ return self.sanitizers.get(sanitizer_key)
348
+
349
+ def sanitize(self, table_name, column_name, value):
350
+ """
351
+ Sanitizes given value extracted from the database according to the
352
+ sanitation configuration.
353
+
354
+ TODO: Add support for dates, booleans and other types found in SQL than
355
+ string.
356
+
357
+ :param table_name: Name of the database table from which the value is
358
+ from.
359
+ :type table_name: str
360
+
361
+ :param column_name: Name of the database column from which the value is
362
+ from.
363
+ :type column_name: str
364
+
365
+ :param value: Value from the database, either in text form or None if
366
+ the value is null.
367
+ :type value: str|None
368
+
369
+ :return: Sanitized version of the given value.
370
+ :rtype: str|None
371
+ """
372
+ sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
373
+ return sanitizer_callback(value) if sanitizer_callback else value
@@ -0,0 +1,47 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import importlib
6
+
7
+ from six.moves.urllib import parse as urlparse
8
+
9
+ from .. import session
10
+
11
# Maps database URL schemes into the dotted path of the module which is
# able to dump and sanitize that kind of database.
SUPPORTED_DATABASE_MODULES = {
    "mysql": "django_db_anonymiser.database_sanitizer.dump.mysql",
    "postgres": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgresql": "django_db_anonymiser.database_sanitizer.dump.postgres",
    "postgis": "django_db_anonymiser.database_sanitizer.dump.postgres",
}


# Register supported database schemes, so that the URL parser splits the
# network location out of URLs using them. The membership test keeps the
# registration idempotent if this module gets executed more than once.
for scheme in SUPPORTED_DATABASE_MODULES:
    if scheme not in urlparse.uses_netloc:
        urlparse.uses_netloc.append(scheme)
22
+
23
+
24
def run(url, output, config):
    """
    Dumps the database behind the given URL and writes a sanitized copy of
    the dump, line by line, into the given stream.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
                   written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the scheme of the URL is not supported.
    """
    database_url = urlparse.urlparse(url)

    module_path = SUPPORTED_DATABASE_MODULES.get(database_url.scheme)
    if not module_path:
        raise ValueError("Unsupported database scheme: '%s'" % (database_url.scheme,))

    # The backend is imported lazily, based on the scheme of the URL.
    backend = importlib.import_module(module_path)

    # Start every run from a freshly reset session.
    session.reset()

    write = output.write
    for sanitized_line in backend.sanitize(url=database_url, config=config):
        write(sanitized_line + "\n")
@@ -0,0 +1,196 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import unicode_literals
4
+
5
+ import codecs
6
+ import re
7
+ import subprocess
8
+ import io
9
+
10
+ from ..utils.mysql import (
11
+ decode_mysql_literal,
12
+ encode_mysql_literal,
13
+ get_mysqldump_args_and_env_from_url,
14
+ )
15
+ from ..config import MYSQLDUMP_DEFAULT_PARAMETERS
16
+
17
+ #: Regular expression which matches `INSERT INTO` statements produced by the
18
+ #: `mysqldump` utility, even when extended inserts have been enabled.
19
+ INSERT_INTO_PATTERN = re.compile(
20
+ r"^INSERT INTO `(?P<table>[^`]*)`"
21
+ r" \((?P<columns>.*)\)"
22
+ r" VALUES (?P<values>.*);$"
23
+ )
24
+
25
+
26
+ #: Regular expression which matches various kinds of MySQL literals.
27
+ VALUE_PATTERN = re.compile(
28
+ r"""
29
+ # Group 1:
30
+ (
31
+ '(?:[^']|''|\\')*(?<![\\])' # String literal
32
+ | # or...
33
+ [^',()]+ # NULL, TRUE, etc.
34
+ )
35
+ # Group 2:
36
+ (
37
+ [,)] # Comma or closing parenthesis.
38
+ )
39
+ """,
40
+ re.VERBOSE,
41
+ )
42
+
43
+
44
+ def sanitize(url, config):
45
+ """
46
+ Obtains dump of MySQL database by executing `mysqldump` command and
47
+ sanitizes it output.
48
+
49
+ :param url: URL to the database which is going to be sanitized, parsed by
50
+ Python's URL parser.
51
+ :type url: urllib.urlparse.ParseResult
52
+
53
+ :param config: Optional sanitizer configuration to be used for sanitation
54
+ of the values stored in the database.
55
+ :type config: database_sanitizer.config.Configuration|None
56
+ """
57
+ if url.scheme != "mysql":
58
+ raise ValueError("Unsupported database type: '%s'" % (url.scheme,))
59
+
60
+ args, env = get_mysqldump_args_and_env_from_url(url=url)
61
+
62
+ extra_params = MYSQLDUMP_DEFAULT_PARAMETERS
63
+ if config:
64
+ extra_params = config.mysqldump_params
65
+
66
+ process = subprocess.Popen(
67
+ args=["mysqldump"] + args + extra_params,
68
+ env=env,
69
+ stdout=subprocess.PIPE,
70
+ )
71
+
72
+ return sanitize_from_stream(stream=process.stdout, config=config)
73
+
74
+
75
+ def sanitize_from_stream(stream, config):
76
+ """
77
+ Reads dump of MySQL database from given stream and sanitizes it.
78
+
79
+ :param stream: Stream where the database dump is expected to be available
80
+ from, such as stdout of `mysqldump` process.
81
+ :type stream: file
82
+
83
+ :param config: Optional sanitizer configuration to be used for sanitation
84
+ of the values stored in the database.
85
+ :type config: database_sanitizer.config.Configuration|None
86
+ """
87
+ for line in io.TextIOWrapper(stream, encoding="utf-8"):
88
+ # Eat the trailing new line.
89
+ line = line.rstrip("\n")
90
+
91
+ # If there is no configuration it means that there are no sanitizers
92
+ # available.
93
+ if not config:
94
+ yield line
95
+ continue
96
+
97
+ # Does the line contain `INSERT INTO` statement? If not, use the line
98
+ # as-is and continue into next one.
99
+ insert_into_match = INSERT_INTO_PATTERN.match(line)
100
+ if not insert_into_match:
101
+ yield line
102
+ continue
103
+
104
+ table_name = insert_into_match.group("table")
105
+ column_names = parse_column_names(insert_into_match.group("columns"))
106
+
107
+ # Skip `INSERT INTO` statement if table rows are configured
108
+ # to be skipped.
109
+ if table_name in config.skip_rows_for_tables:
110
+ continue
111
+
112
+ # Collect sanitizers possibly used for this table and place them into
113
+ # a dictionary from which we can look them up by index later.
114
+ sanitizers = {}
115
+ for index, column_name in enumerate(column_names):
116
+ sanitizer = config.get_sanitizer_for(
117
+ table_name=table_name,
118
+ column_name=column_name,
119
+ )
120
+ if sanitizer:
121
+ sanitizers[index] = sanitizer
122
+
123
+ # If this table has no sanitizers available, use the line as-is and
124
+ # continue into next line.
125
+ if len(sanitizers) == 0:
126
+ yield line
127
+ continue
128
+
129
+ # Constructs list of tuples containing sanitized column names.
130
+ sanitized_value_tuples = []
131
+ for values in parse_values(insert_into_match.group("values")):
132
+ if len(column_names) != len(values):
133
+ raise ValueError("Mismatch between column names and values")
134
+ sanitized_values = []
135
+ for index, value in enumerate(values):
136
+ sanitizer_callback = sanitizers.get(index)
137
+ if sanitizer_callback:
138
+ value = sanitizer_callback(value)
139
+ sanitized_values.append(encode_mysql_literal(value))
140
+ sanitized_value_tuples.append(sanitized_values)
141
+
142
+ # Finally create new `INSERT INTO` statement from the sanitized values.
143
+ yield "INSERT INTO `%s` (%s) VALUES %s;" % (
144
+ table_name,
145
+ ", ".join("`" + column_name + "`" for column_name in column_names),
146
+ ",".join(
147
+ "(" + ",".join(value_tuple) + ")"
148
+ for value_tuple in sanitized_value_tuples
149
+ ),
150
+ )
151
+
152
+
153
+ def parse_column_names(text):
154
+ """
155
+ Extracts column names from a string containing quoted and comma separated
156
+ column names of a table.
157
+
158
+ :param text: Line extracted from MySQL's `INSERT INTO` statement containing
159
+ quoted and comma separated column names.
160
+ :type text: str
161
+
162
+ :return: Tuple containing just the column names.
163
+ :rtype: tuple[str]
164
+ """
165
+ return tuple(
166
+ re.sub(r"^`(.*)`$", r"\1", column_data.strip())
167
+ for column_data in text.split(",")
168
+ )
169
+
170
+
171
+ def parse_values(text):
172
+ """
173
+ Parses values from a string containing values from extended format `INSERT
174
+ INTO` statement. Values will be yielded from the function as tuples, with
175
+ one tuple per row in the table.
176
+
177
+ :param text: Text extracted from MySQL's `INSERT INTO` statement containing
178
+ quoted and comma separated column values.
179
+ :type text: str
180
+ """
181
+ assert text.startswith("(")
182
+ pos = 1
183
+ values = []
184
+ text_len = len(text)
185
+ while pos < text_len:
186
+ match = VALUE_PATTERN.match(text, pos)
187
+ if not match:
188
+ break
189
+ value = match.group(1)
190
+ values.append(decode_mysql_literal(value.strip()))
191
+ pos += len(value) + 1
192
+ if match.group(2) == ")":
193
+ # Skip comma and open parenthesis ",("
194
+ pos += 2
195
+ yield tuple(values)
196
+ values = []