django-db-anonymiser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_db_anonymiser/database_sanitizer/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/__main__.py +68 -0
- django_db_anonymiser/database_sanitizer/config.py +373 -0
- django_db_anonymiser/database_sanitizer/dump/__init__.py +47 -0
- django_db_anonymiser/database_sanitizer/dump/mysql.py +196 -0
- django_db_anonymiser/database_sanitizer/dump/postgres.py +170 -0
- django_db_anonymiser/database_sanitizer/sanitizers/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/sanitizers/constant.py +14 -0
- django_db_anonymiser/database_sanitizer/sanitizers/derived.py +14 -0
- django_db_anonymiser/database_sanitizer/sanitizers/string.py +31 -0
- django_db_anonymiser/database_sanitizer/sanitizers/times.py +11 -0
- django_db_anonymiser/database_sanitizer/sanitizers/user.py +145 -0
- django_db_anonymiser/database_sanitizer/session.py +146 -0
- django_db_anonymiser/database_sanitizer/tests/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/tests/test_config.py +256 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump.py +123 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump_mysql.py +196 -0
- django_db_anonymiser/database_sanitizer/tests/test_dump_postgres.py +177 -0
- django_db_anonymiser/database_sanitizer/tests/test_main.py +91 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_constant.py +29 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_derived.py +19 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_string.py +44 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_times.py +18 -0
- django_db_anonymiser/database_sanitizer/tests/test_sanitizers_user.py +67 -0
- django_db_anonymiser/database_sanitizer/tests/test_session.py +36 -0
- django_db_anonymiser/database_sanitizer/tests/test_utils_mysql.py +112 -0
- django_db_anonymiser/database_sanitizer/tests/test_utils_postgres.py +86 -0
- django_db_anonymiser/database_sanitizer/utils/__init__.py +0 -0
- django_db_anonymiser/database_sanitizer/utils/mysql.py +161 -0
- django_db_anonymiser/database_sanitizer/utils/postgres.py +145 -0
- django_db_anonymiser/db_anonymiser/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/faker.py +91 -0
- django_db_anonymiser/db_anonymiser/management/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/management/commands/__init__.py +0 -0
- django_db_anonymiser/db_anonymiser/management/commands/dump_and_anonymise.py +105 -0
- django_db_anonymiser/db_anonymiser/tests/test_command.py +90 -0
- django_db_anonymiser/db_anonymiser/tests/test_faker.py +116 -0
- django_db_anonymiser-0.1.0.dist-info/METADATA +98 -0
- django_db_anonymiser-0.1.0.dist-info/RECORD +40 -0
- django_db_anonymiser-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import unicode_literals
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import pymysql
|
|
8
|
+
import six
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_mysqldump_args_and_env_from_url(url):
    """
    Build the `mysqldump` invocation for the database described by a URL.

    :param url: Parsed database URL.
    :type url: urllib.urlparse.ParseResult

    :return: Pair of (command line arguments, environment variables) to be
             used when launching the `mysqldump` process.
    :rtype: tuple[list[str],dict[str,str]]

    :raises ValueError: If the URL path does not carry a database name.
    """
    # Options that do not depend on the URL:
    #
    # * `--complete-insert` keeps the column names in every `INSERT INTO`
    #   statement; sanitation needs them.
    # * `--extended-insert` enables "extended inserts", where several rows
    #   of a table share one `INSERT INTO` statement. This speeds up
    #   sanitation and shrinks the dump.
    # * `--net_buffer_length` asks `mysqldump` to try to limit the size of a
    #   single line, to reduce memory consumption.
    #   NOTE(review): 10240 bytes is roughly 10 KiB, not 10 MB - confirm the
    #   intended limit.
    dump_args = [
        "--complete-insert",
        "--extended-insert",
        "--net_buffer_length=10240",
    ]

    # The hostname is expected to always be present in the parsed URL.
    dump_args += ["-h", url.hostname]

    port = url.port
    if port is not None:
        dump_args += ["-P", six.text_type(port)]

    username = url.username
    if username:
        dump_args += ["-u", username]

    # The password is handed over through the environment rather than on
    # the command line.
    password = url.password
    dump_env = {"MYSQL_PWD": password} if password else {}

    # The database name is the URL path without its leading slash.
    db_path = url.path
    if not (db_path.startswith("/") and len(db_path) >= 2):
        raise ValueError("Name of the database is missing from the URL")
    dump_args.append(db_path[1:])

    return dump_args, dump_env
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
MYSQL_NULL_PATTERN = re.compile(r"^NULL$", re.IGNORECASE)
MYSQL_BOOLEAN_PATTERN = re.compile(r"^(TRUE|FALSE)$", re.IGNORECASE)
MYSQL_FLOAT_PATTERN = re.compile(r"^[+-]?\d*\.\d+([eE][+-]?\d+)?$")
# Integers may carry an explicit sign (e.g. ``-5``), just like the float
# pattern above allows; without it negative integers could not be decoded.
MYSQL_INT_PATTERN = re.compile(r"^[+-]?\d+$")
MYSQL_STRING_PATTERN = re.compile(r"'(?:[^']|''|\\')*(?<![\\])'")


def decode_mysql_literal(text):
    """
    Attempts to decode given MySQL literal into Python value.

    The literal is tried, in order, as NULL, boolean, float, integer
    (optionally signed) and quoted string.

    :param text: Value to be decoded, as MySQL literal.
    :type text: str

    :return: Python version of the given MySQL literal.
    :rtype: any

    :raises ValueError: If the text matches none of the supported literal
                        forms.
    """
    if MYSQL_NULL_PATTERN.match(text):
        return None

    if MYSQL_BOOLEAN_PATTERN.match(text):
        return text.lower() == "true"

    if MYSQL_FLOAT_PATTERN.match(text):
        return float(text)

    if MYSQL_INT_PATTERN.match(text):
        return int(text)

    if MYSQL_STRING_PATTERN.match(text):
        return decode_mysql_string_literal(text)

    raise ValueError("Unable to decode given value: %r" % (text,))
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Matches a backslash followed by a single character, i.e. one escape
# sequence inside a MySQL string literal.
MYSQL_STRING_ESCAPE_SEQUENCE_PATTERN = re.compile(r"\\(.)")
# Escape sequences that decode into a specific control character. Keys are
# the two-character sequence (backslash + letter), values the decoded
# character.
MYSQL_STRING_ESCAPE_SEQUENCE_MAPPING = {
    "\\0": "\000",  # NUL
    "\\b": "\b",  # backspace
    "\\n": "\n",  # newline
    "\\r": "\r",  # carriage return
    "\\t": "\t",  # tab
    "\\Z": "\032",  # octal 032 (ASCII 26)
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def decode_mysql_string_literal(text):
    """
    Strip the surrounding single quotes from a MySQL string literal and
    decode the backslash escape sequences found inside it.

    :param text: MySQL string literal, with the quotes still included.
    :type text: str

    :return: Contents of the literal, unquoted and with escape sequences
             decoded.
    :rtype: str
    """
    assert text.startswith("'")
    assert text.endswith("'")

    # Drop the opening and closing quote, then decode each escape sequence.
    unquoted = text[1:-1]
    decoded = MYSQL_STRING_ESCAPE_SEQUENCE_PATTERN.sub(
        unescape_single_character, unquoted
    )
    return decoded
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def unescape_single_character(match):
    """
    Decode one escape sequence found in a MySQL string literal, following
    the rules described at:
    https://dev.mysql.com/doc/refman/5.6/en/string-literals.html#character-escape-sequences

    :param match: Regular expression match object.

    :return: Unescaped version of given escape sequence.
    :rtype: str
    """
    sequence = match.group(0)
    assert sequence.startswith("\\")

    # Known sequences decode to their mapped character; for anything else
    # the backslash is simply dropped.
    replacement = MYSQL_STRING_ESCAPE_SEQUENCE_MAPPING.get(sequence)
    if replacement:
        return replacement
    return sequence[1:]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def encode_mysql_literal(value):
    """
    Converts given Python value into MySQL literal, suitable to be used inside
    `INSERT INTO` statement.

    The conversion is delegated to `pymysql.converters.escape_item`, called
    with the "utf-8" charset.

    :param value: Value to convert into MySQL literal.
    :type value: any

    :return: Given value encoded into MySQL literal.
    :rtype: str
    """
    return pymysql.converters.escape_item(value, "utf-8")
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Contains utilities for working with Postgres `COPY` command, mainly encoding
|
|
4
|
+
and decoding values in the custom format used by Postgres.
|
|
5
|
+
|
|
6
|
+
Documentation about copy command and the text format used by it can be found
|
|
7
|
+
from:
|
|
8
|
+
https://www.postgresql.org/docs/9.2/static/sql-copy.html
|
|
9
|
+
|
|
10
|
+
For decoding we use a regular expression to find the escape sequences
|
|
11
|
+
and invoke `unescape_single_character` function for each occurrence.
|
|
12
|
+
Allowed escape sequences are precalculated into `DECODE_MAP` to make the
|
|
13
|
+
lookups faster.
|
|
14
|
+
|
|
15
|
+
For encoding we use a string translation table `ENCODE_TRANSLATE_TABLE`,
|
|
16
|
+
which maps the "forbidden" characters to escape sequences. This is used
|
|
17
|
+
with `str.translate`, which is very fast way to escape characters.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import unicode_literals
|
|
21
|
+
|
|
22
|
+
import itertools
|
|
23
|
+
import re
|
|
24
|
+
|
|
25
|
+
import six
|
|
26
|
+
|
|
27
|
+
#: Representation of NULL value in Postgres COPY statement.
POSTGRES_COPY_NULL_VALUE = "\\N"

#: Characters that get escaped when encoding, mapped to the backslash
#: escape sequence that replaces them.
ENCODE_MAP = {
    '\\': '\\\\',
    '\b': '\\b',
    '\f': '\\f',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    '\v': '\\v',
}

#: Translation table for code points 0-255: each entry is either the
#: escape sequence from `ENCODE_MAP` or the character itself.
ENCODE_TRANSLATE_TABLE = [
    ENCODE_MAP.get(six.unichr(n), six.unichr(n))
    for n in range(256)
]

#: Finds escape sequences in an encoded value: a backslash followed by
#: octal digits, `x` plus hexadecimal digits, any single character, or the
#: end of the string (i.e. a dangling backslash).
DECODE_REGEX = re.compile(r"""
\\ # a backslash
(?: # followed by one of these (in non-capturing parenthesis):
[0-7]{1,3} # 1, 2 or 3 octal digits
| # or
x[0-9a-fA-F]{1,2} # 'x' followed by 1 or 2 hexadecimal digits
| # or
. # any character
| # or
\Z # end of string
)
""", re.VERBOSE)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def decode_copy_value(value):
    """
    Decode a single value read from a Postgres `COPY` statement.

    :param value: Value to decode.
    :type value: str

    :return: None when the value is the NULL marker, otherwise the value
             with its escape sequences decoded.
    :rtype: str|None
    """
    # The NULL marker takes precedence over any escape handling.
    if value == POSTGRES_COPY_NULL_VALUE:
        return None

    # Only run the regex substitution when a backslash is present. Most
    # values contain none, and a plain substring search is cheaper than
    # the regex pass.
    if '\\' in value:
        return DECODE_REGEX.sub(unescape_single_character, value)

    return value
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def unescape_single_character(match):
    """
    Decode one escape sequence located by the decoding regular expression.

    :param match: Regular expression match object
    :rtype: str
    :raises: ValueError if the escape sequence is invalid
    """
    sequence = match.group(0)
    try:
        return DECODE_MAP[sequence]
    except KeyError:
        # A lone backslash means the value ended in the middle of an
        # escape sequence; anything else is an unknown sequence.
        if sequence == '\\':
            message = "Unterminated escape sequence encountered"
        else:
            message = "Unrecognized escape sequence encountered: {}".format(
                sequence)
        raise ValueError(message)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def encode_copy_value(value):
    """
    Encode a value into the text form expected by a Postgres `COPY`
    statement.

    :param value: Value to encode.
    :type value: str|None

    :return: The NULL marker for None, otherwise the value with the
             characters listed in the translation table escaped.
    :rtype: str
    """
    if value is not None:
        return value.translate(ENCODE_TRANSLATE_TABLE)

    return POSTGRES_COPY_NULL_VALUE
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _generate_decode_map():
    """
    Build the lookup table from escape sequence to decoded character.

    The table holds the inverse of `ENCODE_MAP` plus every octal sequence
    of 1-3 digits and every hexadecimal sequence of 1-2 digits.

    :rtype: dict[str,str]
    """
    # Start from the inverted encode map.
    mapping = dict(
        (escaped, raw) for (raw, escaped) in ENCODE_MAP.items()
    )

    # (radix, prefix, allowed digit counts, digit alphabet)
    numeric_forms = (
        (8, '\\', (1, 2, 3), '01234567'),
        (16, '\\x', (1, 2), '0123456789abcdefABCDEF'),
    )
    for (radix, lead, widths, alphabet) in numeric_forms:
        for width in widths:
            for combo in itertools.product(alphabet, repeat=width):
                text = ''.join(combo)
                mapping[lead + text] = six.unichr(int(text, radix))

    return mapping
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
#: Escape sequence -> decoded character table, computed once at import time.
DECODE_MAP = _generate_decode_map()
|
|
File without changes
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from faker import Faker
|
|
4
|
+
|
|
5
|
+
# Shared Faker instance, created with the "en-GB" locale, used by every
# sanitizer below.
fake = Faker("en-GB")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def sanitize_name(value):
    """Return a generated full name; the original ``value`` is ignored."""
    return fake.name()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def sanitize_first_name(value):
    """Return a generated first name; the original ``value`` is ignored."""
    return fake.first_name()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def sanitize_last_name(value):
    """Return a generated last name; the original ``value`` is ignored."""
    return fake.last_name()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def sanitize_email(value):
    """
    Return a generated email address drawn through ``fake.unique``; the
    original ``value`` is ignored.
    """
    return fake.unique.email()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def sanitize_company_name(value):
    """
    Return a generated company name drawn through ``fake.unique``; the
    original ``value`` is ignored.
    """
    return fake.unique.company()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def sanitize_phone_number(value):
    """
    Return a generated phone number starting with "+44"; the original
    ``value`` is ignored.
    """
    # The first three characters of the generated MSISDN are swapped for
    # the "+44" prefix.
    return "+44" + fake.msisdn()[3:]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def sanitize_address(value):
    """
    Return a generated address on a single line; the original ``value`` is
    ignored.
    """
    # Line breaks in the generated address are replaced with ", ".
    return fake.address().replace("\n", ", ")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def sanitize_website(value):
    """
    Return a generated domain name, requested from Faker with the argument
    2; the original ``value`` is ignored.
    """
    return fake.domain_name(2)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def sanitize_text(value):
    """
    Replace non-empty text with a generated five-sentence paragraph.

    Falsy values (``None``, empty string, ...) are returned unchanged.
    """
    if value:
        return fake.paragraph(nb_sentences=5)
    return value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def sanitize_short_text(value):
    """
    Replace non-empty text with generated text of at most 100 characters.

    Falsy values (``None``, empty string, ...) are returned unchanged.
    """
    if value:
        return fake.text(max_nb_chars=100)
    return value
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def sanitize_street_address(value):
    """Return a generated street address; the original ``value`` is ignored."""
    return fake.street_address()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def sanitize_city(value):
    """Return a generated city name; the original ``value`` is ignored."""
    return fake.city()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def sanitize_postcode(value):
    """Return a generated postcode; the original ``value`` is ignored."""
    return fake.postcode()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def sanitize_eori_number(value):
    """
    Return a generated EORI number: "GB" followed by exactly 12 digits.

    The original ``value`` is ignored.
    """
    # random_number() may yield fewer than 12 digits, so left-pad with
    # zeros to keep the identifier at a fixed length.
    return "GB" + str(fake.random_number(digits=12)).zfill(12)
|
|
66
|
+
|
|
67
|
+
def sanitize_ni_eori_number(value):
    """
    Return a generated EORI number: "XI" followed by exactly 12 digits.

    The original ``value`` is ignored.
    """
    # random_number() may yield fewer than 12 digits, so left-pad with
    # zeros to keep the identifier at a fixed length.
    return "XI" + str(fake.random_number(digits=12)).zfill(12)
|
|
69
|
+
|
|
70
|
+
def sanitize_eu_eori_number(value):
    """
    Return a generated EU EORI number: one of a fixed set of country codes
    followed by a random number of up to five digits.

    The original ``value`` is ignored.
    """
    eu_countries = ["LU", "BE", "FI", "FR", "CZ"]
    country_code = fake.random_element(eu_countries)
    number = fake.random_number(digits=5)
    return country_code + str(number)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def sanitize_sic_number(value):
    """
    Return a generated SIC number of exactly five digits.

    The original ``value`` is ignored.
    """
    # random_number() may yield fewer than 5 digits, so left-pad with zeros
    # to keep the code at a fixed length.
    return str(fake.random_number(digits=5)).zfill(5)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def sanitize_vat_number(value):
    """
    Return a generated VAT number: "GB" followed by exactly nine digits.

    The original ``value`` is ignored.
    """
    # random_number() may yield fewer than 9 digits, so left-pad with zeros
    # to keep the identifier at a fixed length.
    return "GB" + str(fake.random_number(digits=9)).zfill(9)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def sanitize_registration_number(value):
    """
    Return a generated registration number of exactly eight digits.

    The original ``value`` is ignored.
    """
    # random_number() may yield fewer than 8 digits, so left-pad with zeros
    # to keep the identifier at a fixed length.
    return str(fake.random_number(digits=8)).zfill(8)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def sanitize_filename(value):
    """
    Return a generated file name (no directory part) with one of a fixed
    set of extensions; the original ``value`` is ignored.
    """
    extensions = ["pdf", "jpg", "csv", "txt", "docx", "ods", "xlsx"]
    generated_path = fake.file_path(extension=extensions)
    # Keep only the final path component.
    return os.path.basename(generated_path)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
import boto3
|
|
5
|
+
from django.conf import settings
|
|
6
|
+
from django.core.management.base import BaseCommand, CommandError
|
|
7
|
+
|
|
8
|
+
from django_db_anonymiser.database_sanitizer.config import Configuration
|
|
9
|
+
from django_db_anonymiser.database_sanitizer.dump import run
|
|
10
|
+
|
|
11
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Command(BaseCommand):
    """
    Management command that writes an anonymised dump of the default
    database to a local file and uploads that file to S3.
    """

    def add_arguments(self, parser):
        """Register the ``--keep-local-dumpfile`` and ``--skip-s3-upload`` flags."""
        parser.add_argument(
            "--keep-local-dumpfile",
            action="store_true",
            help="Keep local dump file, rather than cleaning it up.",
        )
        parser.add_argument(
            "--skip-s3-upload",
            action="store_true",
            help="Skip uploading to S3.",
        )

    def configure(self):
        """
        Read the ``DB_ANONYMISER_*`` settings and create the S3 client.

        :raises CommandError: If ``DB_ANONYMISER_CONFIG_LOCATION`` is not
            set in the Django settings.
        """
        self.keep_local_dumpfile = False
        self.skip_s3_upload = False
        self.dump_file_name = settings.DB_ANONYMISER_DUMP_FILE_NAME
        # Defaults to a file under /tmp named after the dump file.
        self.temporary_dump_location = getattr(
            settings,
            "DB_ANONYMISER_TEMPORARY_DUMP_LOCATION",
            f"/tmp/{self.dump_file_name}",
        )
        try:
            self.config_location = settings.DB_ANONYMISER_CONFIG_LOCATION
        except AttributeError:
            raise CommandError(
                "DB_ANONYMISER_CONFIG_LOCATION must be set in django settings."
            )
        # The endpoint URL is optional; it is only passed to boto3 when set.
        additional_s3_params = {}
        aws_endpoint_url = getattr(settings, "DB_ANONYMISER_AWS_ENDPOINT_URL", None)
        if aws_endpoint_url:
            additional_s3_params["endpoint_url"] = aws_endpoint_url
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=settings.DB_ANONYMISER_AWS_ACCESS_KEY_ID,
            aws_secret_access_key=settings.DB_ANONYMISER_AWS_SECRET_ACCESS_KEY,
            region_name=settings.DB_ANONYMISER_AWS_REGION,
            **additional_s3_params,
        )
        self.s3_bucket_name = settings.DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME

    def handle(self, *args, **options):
        """
        Run the dump, upload it, and always clean up afterwards.

        :param options: Parsed command options; ``keep_local_dumpfile`` and
            ``skip_s3_upload`` are read.
        """
        logger.info("Starting DB dump and anonymiser")
        self.configure()

        if options["keep_local_dumpfile"]:
            self.keep_local_dumpfile = True

        if options["skip_s3_upload"]:
            self.skip_s3_upload = True

        try:
            self.dump_anonymised_db()
            self.write_to_s3()
            logger.info("DB dump and anonymiser was successful!")
        finally:
            # Runs on success and on failure alike.
            self.cleanup()

    def dump_anonymised_db(self):
        """Write the anonymised dump of the default database to the temporary location."""
        db_details = settings.DATABASES["default"]
        # NOTE(review): credentials are interpolated as-is; values containing
        # URL-reserved characters (e.g. "@" or "/") may need percent-encoding
        # - confirm how `run` consumes the URL before changing this.
        postgres_url = f"postgresql://{db_details['USER']}:{db_details['PASSWORD']}@{db_details['HOST']}:{db_details['PORT']}/{db_details['NAME']}"
        # Log the path actually written to, not just the dump file name.
        logger.info(
            "Writing anonymised dumpfile to temporary location %s",
            self.temporary_dump_location,
        )
        with open(self.temporary_dump_location, "w") as outfile:
            run(
                url=postgres_url,
                config=Configuration.from_file(self.config_location),
                output=outfile,
            )
        logger.info("Writing anonymised dumpfile complete")

    def write_to_s3(self):
        """Upload the local dump file to the configured bucket, unless skipped."""
        if self.skip_s3_upload:
            return
        logger.info("Writing file %s to S3", self.dump_file_name)
        # The dump file name is used as the object key.
        self.s3_client.upload_file(
            self.temporary_dump_location, self.s3_bucket_name, self.dump_file_name
        )
        logger.info("Writing file to S3 complete")

    def cleanup(self):
        """Remove the local dump file, unless it was requested to be kept."""
        if self.keep_local_dumpfile:
            return
        logger.info("Cleaning up temporary files")
        try:
            os.remove(self.temporary_dump_location)
        except FileNotFoundError:
            # Nothing to remove, e.g. the dump failed before the file existed.
            pass
        logger.info("Clean up complete")
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from unittest.mock import patch
|
|
3
|
+
|
|
4
|
+
from django.conf import settings
|
|
5
|
+
from django.contrib.auth.models import User
|
|
6
|
+
from django.core.management import call_command
|
|
7
|
+
from django.test import TransactionTestCase
|
|
8
|
+
|
|
9
|
+
import boto3
|
|
10
|
+
import pytest
|
|
11
|
+
from moto import mock_aws
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Only runs when the CIRCLECI environment variable is "true"; S3 access is
# mocked with moto's `mock_aws`.
@pytest.mark.skipif(os.getenv("CIRCLECI") != "true", reason="Skipped because test requires real postgres db.")
@mock_aws
class TestDumpAndAnonmyiseCommand(TransactionTestCase):
    """Tests for the ``dump_and_anonymise`` management command."""

    def setUp(self):
        """Create the bucket named in the settings before each test."""
        self.aws = boto3.client("s3", region_name=settings.DB_ANONYMISER_AWS_REGION)
        self.aws.create_bucket(
            Bucket=settings.DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME,
            CreateBucketConfiguration={
                "LocationConstraint": settings.DB_ANONYMISER_AWS_REGION
            },
        )

    @patch(
        "django_db_anonymiser.db_anonymiser.management.commands.dump_and_anonymise.run"
    )
    @patch(
        "django_db_anonymiser.db_anonymiser.management.commands.dump_and_anonymise.Configuration"
    )
    def test_dump_and_anonymise_calls_anonymiser(
        self, mocked_configuration, mocked_anonymiser_run
    ):
        """The command calls `run` with the database URL, config and output file."""
        call_command(
            "dump_and_anonymise", keep_local_dumpfile=False, skip_s3_upload=True
        )
        call_args, call_kwargs = mocked_anonymiser_run.call_args
        assert (
            call_kwargs["url"]
            == f"postgresql://{settings.DATABASES['default']['USER']}:{settings.DATABASES['default']['PASSWORD']}@{settings.DATABASES['default']['HOST']}:{settings.DATABASES['default']['PORT']}/{settings.DATABASES['default']['NAME']}"
        )
        assert call_kwargs["config"] == mocked_configuration.from_file.return_value
        # NOTE(review): hard-coded path, unlike the other tests which build
        # it from settings.DB_ANONYMISER_DUMP_FILE_NAME - confirm they agree.
        assert call_kwargs["output"].name == "/tmp/anonymised.sql"
        # Ensure skip_s3_upload was respected
        bucket_contents = self.aws.list_objects(
            Bucket=settings.DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME
        ).get("Contents", [])
        assert bucket_contents == []

    def test_dump_and_anonymise_writes_sql(self):
        """The dump keeps the username but not the name or email of a user."""
        user = User.objects.create(
            first_name="Bob",
            last_name="Benson",
            email="bob.benson@example.net",  # /PS-IGNORE
            username="bob.benson",
        )
        call_command(
            "dump_and_anonymise", keep_local_dumpfile=True, skip_s3_upload=True
        )
        dump_sql = ""
        with open(f"/tmp/{settings.DB_ANONYMISER_DUMP_FILE_NAME}") as f:
            dump_sql = f.read()
        assert user.username in dump_sql
        assert user.first_name not in dump_sql
        assert user.last_name not in dump_sql
        assert user.email not in dump_sql

    def test_dump_and_anonymise_writes_to_s3(self):
        """Without ``skip_s3_upload`` the dump ends up in the bucket under the dump file name."""
        call_command("dump_and_anonymise", keep_local_dumpfile=False)
        bucket_contents = self.aws.list_objects(
            Bucket=settings.DB_ANONYMISER_AWS_STORAGE_BUCKET_NAME
        ).get("Contents", [])
        assert bucket_contents[0]["Key"] == settings.DB_ANONYMISER_DUMP_FILE_NAME

    @patch(
        "django_db_anonymiser.db_anonymiser.management.commands.dump_and_anonymise.os.remove"
    )
    def test_dump_and_anonymise_clears_local_file(self, mocked_os_remove):
        """By default the command removes the local dump file."""
        call_command("dump_and_anonymise")
        mocked_os_remove.assert_called_with(
            f"/tmp/{settings.DB_ANONYMISER_DUMP_FILE_NAME}"
        )

    @patch(
        "django_db_anonymiser.db_anonymiser.management.commands.dump_and_anonymise.os.remove"
    )
    def test_dump_and_anonymise_keeps_local_file(self, mocked_os_remove):
        """With ``keep_local_dumpfile`` the local dump file is not removed."""
        call_command("dump_and_anonymise", keep_local_dumpfile=True)
        assert not mocked_os_remove.called
|