ndr_pseudonymise 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +64 -0
- data/Rakefile +14 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/ndr_pseudonymise/client.rb +115 -0
- data/lib/ndr_pseudonymise/demographics_only_pseudonymiser.rb +88 -0
- data/lib/ndr_pseudonymise/engine.rb +6 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/command_line.rb +194 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/encrypted_object.rb +124 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/remote_repository.rb +44 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/repository.rb +165 -0
- data/lib/ndr_pseudonymise/ndr_encrypt.rb +10 -0
- data/lib/ndr_pseudonymise/prescription_pseudonymiser.rb +71 -0
- data/lib/ndr_pseudonymise/progress_printer.rb +53 -0
- data/lib/ndr_pseudonymise/pseudonymisation_specification.rb +379 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_converter.rb +92 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_wrapper.rb +96 -0
- data/lib/ndr_pseudonymise/simple_pseudonymisation.rb +125 -0
- data/lib/ndr_pseudonymise/version.rb +3 -0
- data/lib/ndr_pseudonymise.rb +16 -0
- data/lib/rsa_aes_cbc.rb +114 -0
- data/ndr_pseudonymise.gemspec +36 -0
- data/script/ndr_encrypt/README.md +154 -0
- data/script/ndr_encrypt/ndr_encrypt +4 -0
- metadata +197 -0
@@ -0,0 +1,379 @@
|
|
1
|
+
require 'securerandom'
|
2
|
+
require 'json'
|
3
|
+
require 'csv'
|
4
|
+
require 'stringio'
|
5
|
+
require 'rsa_aes_cbc'
|
6
|
+
|
7
|
+
module NdrPseudonymise
|
8
|
+
# Pseudonymise CSV data for matching purposes
|
9
|
+
# Sample format spec:
|
10
|
+
# {:core_demographics => [[[0, ' ']],
|
11
|
+
# [[1, ' ', :upcase], [2, ' ', :upcase]]],
|
12
|
+
# :columns => [
|
13
|
+
# {:title => 'nhsnumber', :maxlength => 12, :format => '\A[0-9A-Z]*\Z',
|
14
|
+
# :format_msg => 'Must contain only numbers, or numbers and letters for old NHS numbers'},
|
15
|
+
# {:title => 'dob', :format => '\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z',
|
16
|
+
# :format_msg => 'Must have format YYYY-MM-DD, e.g. 2013-08-20',
|
17
|
+
# :canonical_title => 'birthdate'},
|
18
|
+
# {:title => 'postcode'},
|
19
|
+
# {:title => 'surname'},
|
20
|
+
# {:title => 'data1'},
|
21
|
+
# {:title => 'data2'},
|
22
|
+
# ],
|
23
|
+
# :demographics => [0, 1, 2, 3],
|
24
|
+
# }
|
25
|
+
# -- delete spaces, upcase, use columns 0+1, 0+2 as keys for core demographics
|
26
|
+
# -- treat columns 0, 1, 2, 3 as demographics
|
27
|
+
class PseudonymisationSpecification
|
28
|
+
KEY_BYTES = 32 # length of randomly generated keys (32 bytes = 256 bits)
|
29
|
+
PREAMBLE_V1_STRIPED = 'Pseudonymised matching data v1.0-striped'.freeze
|
30
|
+
HEADER_ROW_PREFIX = 'HEADER_ROW'.freeze
|
31
|
+
|
32
|
+
def initialize(format_spec, key_bundle)
|
33
|
+
@format_spec = format_spec
|
34
|
+
[:core_demographics, :columns, :demographics, :encrypt_clinical].each do |k|
|
35
|
+
unless @format_spec.key?(k)
|
36
|
+
raise(ArgumentError, "Expected format_spec to have a #{k.inspect} section")
|
37
|
+
end
|
38
|
+
end
|
39
|
+
@format_spec[:columns].each_with_index do |col, i|
|
40
|
+
raise(ArgumentError, "Expected format_spec to have a title for column #{i}") unless col.key?(:title)
|
41
|
+
end
|
42
|
+
unless [true, false].include?(@format_spec[:encrypt_clinical])
|
43
|
+
raise(ArgumentError, 'Expected encrypt_clinical to be true or false')
|
44
|
+
end
|
45
|
+
@salt1 = key_bundle[:salt1]
|
46
|
+
@salt2 = key_bundle[:salt2]
|
47
|
+
raise(ArgumentError, 'Invalid salt1') unless @salt1 =~ /\A[0-9a-f]*\Z/ && @salt1.size >= 64
|
48
|
+
raise(ArgumentError, 'Invalid salt2') unless @salt2 =~ /\A[0-9a-f]*\Z/ && @salt2.size >= 64
|
49
|
+
end
|
50
|
+
|
51
|
+
# Builds a pseudonymiser with the preferred pseudonymisation class of the given format spec
#
# format_spec - Hash; an optional :pseudonymisation_class entry names the class to
#               instantiate. Without it, PseudonymisationSpecification is used.
# key_bundle  - passed through unchanged to the selected class's constructor.
#
# Returns a new instance of the selected class.
# Raises ArgumentError if the named constant is not PseudonymisationSpecification
# or one of its subclasses.
def self.factory(format_spec, key_bundle)
  klass_name = format_spec[:pseudonymisation_class]
  if klass_name
    # Support existing format specifications.
    # (Pseudonymisation classes have now moved to NdrPseudonymise namespace.)
    # Non-destructive sub is required here: sub! returns nil when nothing is
    # replaced (i.e. the name already uses the new namespace), and it would
    # also modify the string held inside the caller's format_spec.
    klass_name = klass_name.sub(/\APseudonymisation::/, 'NdrPseudonymise::')
    klass = Object.const_get(klass_name)
    unless klass <= NdrPseudonymise::PseudonymisationSpecification
      raise(ArgumentError, "Invalid pseudonymisation_class #{klass_name}")
    end
  else
    klass = NdrPseudonymise::PseudonymisationSpecification
  end
  klass.new(format_spec, key_bundle)
end
|
67
|
+
|
68
|
+
def random_key
|
69
|
+
SecureRandom.hex(KEY_BYTES)
|
70
|
+
end
|
71
|
+
|
72
|
+
# Returns arrays of core demographics field values, each of the form
|
73
|
+
# e.g. [[['nhsnumber', '1234567881']],
|
74
|
+
# [['birthdate', '2010-08-21'], ['postcode', 'CB22 3AD']]]
|
75
|
+
# Column titles can be remapped using a :canonical_title entry, to ensure
|
76
|
+
# consistent pseudo_ids even when column titles are predefined.
|
77
|
+
def core_demographics(row)
|
78
|
+
@format_spec[:core_demographics].collect do |fields|
|
79
|
+
fields.collect do |col_num, delchar, modifier|
|
80
|
+
val = row[col_num].to_s
|
81
|
+
val = val.to_s.delete(delchar) if delchar
|
82
|
+
case modifier
|
83
|
+
when :upcase
|
84
|
+
val = val.upcase
|
85
|
+
when nil
|
86
|
+
else
|
87
|
+
raise "Unknown modifier #{modifier.inspect} for core_demographics"
|
88
|
+
end
|
89
|
+
row_spec = @format_spec[:columns][col_num]
|
90
|
+
[row_spec[:canonical_title] || row_spec[:title], val]
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# List of pseudonymised ids, based on this row's core demographics + salt1
|
96
|
+
def real_ids(row)
|
97
|
+
core_demographics(row).collect do |fields|
|
98
|
+
(fields.collect(&:first) +
|
99
|
+
fields.collect(&:last)).collect { |s| s.gsub('_', '__') }.join('_')
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# Convert a real id to a pseudonymised id
|
104
|
+
def pseudo_id(real_id)
|
105
|
+
data_hash(real_id, @salt1)
|
106
|
+
end
|
107
|
+
|
108
|
+
def data_hash(value, salt)
|
109
|
+
Digest::SHA2.hexdigest(value.to_s + salt.to_s)
|
110
|
+
end
|
111
|
+
|
112
|
+
def encrypt_data(data, pseudo_id, partial_crypt_key, salt)
|
113
|
+
if [pseudo_id, partial_crypt_key, salt].any? { |s| s.to_s.blank? }
|
114
|
+
raise(ArgumentError, 'Expected all key arguments to be non-blank')
|
115
|
+
end
|
116
|
+
key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
|
117
|
+
# unless key =~ /\A[0-9a-f]+\Z/
|
118
|
+
# raise(ArgumentError, 'Expected key to be all hex characters (0-9, a-f)')
|
119
|
+
# end
|
120
|
+
aes = OpenSSL::Cipher.new('AES-256-CBC')
|
121
|
+
aes.encrypt
|
122
|
+
aes.key = Digest::SHA256.digest(key)
|
123
|
+
Base64.strict_encode64(aes.update(data) + aes.final)
|
124
|
+
end
|
125
|
+
|
126
|
+
def decrypt_data(data, pseudo_id, partial_crypt_key, salt)
|
127
|
+
key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
|
128
|
+
aes = OpenSSL::Cipher.new('AES-256-CBC')
|
129
|
+
aes.decrypt
|
130
|
+
aes.key = Digest::SHA256.digest(key)
|
131
|
+
aes.update(Base64.strict_decode64(data)) + aes.final
|
132
|
+
end
|
133
|
+
|
134
|
+
# Read and decrypt a key bundle file.
#
# key_fname      - path of the file holding the base-64 encoded, encrypted bundle
# admin_password - password; its SHA-256 digest is used as the AES-256-CBC key
#
# Returns the decoded bundle (a Hash).
# Raises RuntimeError ('Wrong password or invalid bundle') if decryption,
# YAML parsing or the Hash check fails.
def self.get_key_bundle(key_fname, admin_password)
|
135
|
+
data = File.read(key_fname)
|
136
|
+
aes = OpenSSL::Cipher.new('AES-256-CBC')
|
137
|
+
aes.decrypt
|
138
|
+
# No IV is assigned here; only the key is set before decrypting.
aes.key = Digest::SHA256.digest(admin_password)
|
139
|
+
begin
|
140
|
+
# NOTE(review): YAML.load is applied to the decrypted file contents. On Psych
# versions where YAML.load is not safe by default it can instantiate arbitrary
# objects -- consider YAML.safe_load (permitting Symbol if the bundle uses
# symbol keys) should key files ever come from an untrusted source.
bundle = YAML.load(aes.update(Base64.decode64(data)) + aes.final)
|
141
|
+
# Check that the bundle decoded successfully
|
142
|
+
raise('Invalid bundle - not a hash') unless bundle.is_a?(Hash)
|
143
|
+
bundle
|
144
|
+
# Catch-all: any StandardError above (including the 'not a hash' error) is
# replaced by one generic message, so the underlying cause is not reported.
rescue # => e # Lint/UselessAssignment
|
145
|
+
raise('Wrong password or invalid bundle')
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def all_demographics(row)
|
150
|
+
# TODO: What about rows with missing fields?
|
151
|
+
result = []
|
152
|
+
demographics_cols = @format_spec[:demographics]
|
153
|
+
row.each_with_index do |x, i|
|
154
|
+
result << x if demographics_cols.include?(i)
|
155
|
+
end
|
156
|
+
result
|
157
|
+
end
|
158
|
+
|
159
|
+
def clinical_data(row)
|
160
|
+
# TODO: What about rows with missing fields?
|
161
|
+
result = []
|
162
|
+
demographics_cols = @format_spec[:demographics]
|
163
|
+
row.each_with_index do |x, i|
|
164
|
+
result << x unless demographics_cols.include?(i)
|
165
|
+
end
|
166
|
+
result
|
167
|
+
end
|
168
|
+
|
169
|
+
# Pseudonymise a row of data, returning 3 sets of rows:
|
170
|
+
# [index_rows, demographics_rows, clinical_rows]
|
171
|
+
def pseudonymise_row(row)
|
172
|
+
index_rows = []
|
173
|
+
demographics_rows = []
|
174
|
+
clinical_rows = []
|
175
|
+
real_ids(row).each do |real_id|
|
176
|
+
pseudo = pseudo_id(real_id)
|
177
|
+
row_key = random_key
|
178
|
+
partial_crypt_key1 = random_key # middle bit of crypto key
|
179
|
+
if @format_spec[:encrypt_clinical]
|
180
|
+
partial_crypt_key2 = random_key # middle bit of crypto key
|
181
|
+
index_rows << [pseudo, row_key, partial_crypt_key1, partial_crypt_key2]
|
182
|
+
else
|
183
|
+
index_rows << [pseudo, row_key, partial_crypt_key1]
|
184
|
+
end
|
185
|
+
# demographics and clinical files only have non-information-bearing keys
|
186
|
+
demographics_rows << [row_key,
|
187
|
+
encrypt_data(safe_json(all_demographics(row)),
|
188
|
+
pseudo, partial_crypt_key1, @salt2)]
|
189
|
+
safe_clinical = safe_json(clinical_data(row))
|
190
|
+
if @format_spec[:encrypt_clinical]
|
191
|
+
safe_clinical = encrypt_data(safe_clinical,
|
192
|
+
pseudo, partial_crypt_key2, @salt2)
|
193
|
+
end
|
194
|
+
clinical_rows << [row_key, safe_clinical]
|
195
|
+
end
|
196
|
+
[index_rows, demographics_rows, clinical_rows]
|
197
|
+
end
|
198
|
+
|
199
|
+
# Convert data to json, but raise exception if it won't safely deserialise
|
200
|
+
def safe_json(data)
|
201
|
+
result = data.to_json
|
202
|
+
unless data == JSON.load(result)
|
203
|
+
raise(ArgumentError, "Expected consistent JSON serialisation of #{data.inspect}")
|
204
|
+
end
|
205
|
+
result
|
206
|
+
end
|
207
|
+
|
208
|
+
# Return true if this row is a valid header row, according to the spec
#
# row - Array of cell values (Strings, or nil for blank CSV cells)
#
# Returns true when the downcased cells equal the spec's column titles in order,
# false when fewer than 3 cells match any expected title (i.e. a data row).
# Raises ArgumentError when the row looks like a header (3 or more matching
# titles) but is not exactly the expected list.
def header_row?(row)
  expected_keys = @format_spec[:columns].collect { |col| col[:title] }
  # Blank CSV cells arrive as nil, so coerce before downcasing; otherwise a
  # first data row containing an empty field would raise NoMethodError here.
  row_keys = row.collect { |cell| cell.to_s.downcase }
  return false if (row_keys & expected_keys).size < 3 # fewer than 3 common keys
  return true if row_keys == expected_keys # Only expected keys, in right order

  raise(ArgumentError, "Error: invalid header row; expected keys #{expected_keys.inspect}, actually #{row_keys.inspect}")
end
|
222
|
+
|
223
|
+
# Return false if this row is a valid data row, otherwise a String of errors
#
# row - Array of cell values for one data row
#
# Checks, in order:
# 1. the row is long enough to contain every core demographics column;
# 2. there are no non-blank cells beyond the columns declared in the spec;
# 3. each column with :maxlength / :format metadata satisfies it.
# Field-level errors (step 3) are joined with ', '.
def row_errors(row)
  # Unpack column checking meta-data once; the compiled regexps are reused per row
  @check_cols ||= @format_spec[:columns].each_with_index.each_with_object([]) do |(col, i), checks|
    next unless col[:maxlength] || col[:format]

    checks << [col, i, col[:maxlength], col[:format] && Regexp.new(col[:format])]
  end
  # Highest (0-based) column index referenced by the core demographics
  @dmax ||= @format_spec[:core_demographics].flatten(1).collect(&:first).max

  # Index @dmax must exist, so at least @dmax + 1 columns are needed. (The
  # previous test, row.size <= @dmax + 1, rejected rows of exactly that size.)
  if row.size <= @dmax
    return "Missing core demographics: at least #{@dmax + 1} columns expected"
  end
  if row[@format_spec[:columns].size..-1].to_a.any? { |s| !s.blank? }
    return "Too many columns (#{row.size}); expected #{@format_spec[:columns].size}"
  end

  # Check field formats
  errs = []
  @check_cols.each do |col, i, col_maxlength, col_format_re|
    val = row[i].to_s # Missing columns treated as blank
    if col_maxlength && val.size > col_maxlength
      errs << "Field #{col[:title]} (column #{i + 1}) is longer than maxlength #{col[:maxlength]}."
    end
    next if col_format_re.nil? || col_format_re.match(val)

    errs << if col[:format_msg]
              "Field #{col[:title]} (column #{i + 1}) #{col[:format_msg]} -- invalid value: #{val}"
            else
              "Field #{col[:title]} (column #{i + 1}) does not match format #{col[:format].inspect} -- invalid value: #{val}"
            end
  end
  errs.empty? ? false : errs.join(', ')
end
|
266
|
+
|
267
|
+
# Header row for CSV data
|
268
|
+
def csv_header_row
|
269
|
+
[PREAMBLE_V1_STRIPED]
|
270
|
+
end
|
271
|
+
|
272
|
+
# Append the output of pseudonymise_row to a CSV file
|
273
|
+
def emit_csv_rows(out_csv, pseudonymised_row)
|
274
|
+
(index_rows, demographics_rows, clinical_rows) = pseudonymised_row
|
275
|
+
unless index_rows.size == demographics_rows.size &&
|
276
|
+
index_rows.size == clinical_rows.size
|
277
|
+
raise(ArgumentError, <<-ERROR
|
278
|
+
Mismatch in number of index_rows (#{index_rows.size})
|
279
|
+
vs demographics_rows (#{demographics_rows.size})
|
280
|
+
vs clinical_rows (#{clinical_rows.size})
|
281
|
+
ERROR
|
282
|
+
)
|
283
|
+
end
|
284
|
+
|
285
|
+
index_rows.zip(demographics_rows).zip(clinical_rows).collect do |(index_row, demographics_row), clinical_row|
|
286
|
+
# Alternate each of 3 data types into 1 output file
|
287
|
+
out_csv << index_row
|
288
|
+
out_csv << demographics_row
|
289
|
+
out_csv << clinical_row
|
290
|
+
end
|
291
|
+
end
|
292
|
+
|
293
|
+
# csv_data can be an open IO object (a CSV file), or an array of data rows
|
294
|
+
# out_data can be an open IO object or a StringIO -- CSV data is output
|
295
|
+
# public_key_fname supports public key encryption of the output
|
296
|
+
# progress_monitor is an object for reporting progress, that responds to
|
297
|
+
# log_progress(start_time, time_now, csv_row, progress, total)
|
298
|
+
# where progress and total are in the same units, either bytes or rows
|
299
|
+
def pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil)
|
300
|
+
csv_lib = CSV
|
301
|
+
if csv_data.is_a?(IO) || csv_data.is_a?(StringIO)
|
302
|
+
csv = csv_lib.new(csv_data)
|
303
|
+
elsif csv_data.is_a?(Array)
|
304
|
+
csv = csv_data
|
305
|
+
else
|
306
|
+
raise(ArgumentError, 'Expected an IO or Array of rows, not a filename for csv_data')
|
307
|
+
end
|
308
|
+
|
309
|
+
if public_key_fname
|
310
|
+
unless File.exist?(public_key_fname)
|
311
|
+
raise(ArgumentError, "Missing public key file: #{public_key_fname}")
|
312
|
+
end
|
313
|
+
rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname), nil)
|
314
|
+
end
|
315
|
+
|
316
|
+
unless out_data.respond_to?('<<')
|
317
|
+
raise(ArgumentError, 'Expected an IO or writeable structure for out_data')
|
318
|
+
end
|
319
|
+
out_buff = StringIO.new
|
320
|
+
out_csv = csv_lib.new(out_buff)
|
321
|
+
out_csv << csv_header_row
|
322
|
+
out_buff.rewind
|
323
|
+
out_data <<
|
324
|
+
if public_key_fname
|
325
|
+
rsa_aes_cbc.encrypt(out_buff.read) + "\n"
|
326
|
+
else
|
327
|
+
out_buff.read
|
328
|
+
end
|
329
|
+
|
330
|
+
i = 0
|
331
|
+
t0 = Time.current
|
332
|
+
csv_size = progress_monitor && csv_data.size
|
333
|
+
csv.each do |row|
|
334
|
+
out_buff = StringIO.new
|
335
|
+
out_csv = csv_lib.new(out_buff)
|
336
|
+
i += 1
|
337
|
+
if i == 1 && header_row?(row)
|
338
|
+
# Preserve header row in output
|
339
|
+
out_csv << [HEADER_ROW_PREFIX] + row
|
340
|
+
else
|
341
|
+
errs = row_errors(row)
|
342
|
+
raise("Invalid row #{i}: #{errs}") if errs
|
343
|
+
begin
|
344
|
+
emit_csv_rows(out_csv, pseudonymise_row(row))
|
345
|
+
rescue ArgumentError, RuntimeError => e
|
346
|
+
raise(ArgumentError, "Invalid row #{i}: #{e}", e.backtrace)
|
347
|
+
end
|
348
|
+
end
|
349
|
+
out_buff.rewind
|
350
|
+
out_data <<
|
351
|
+
if public_key_fname
|
352
|
+
rsa_aes_cbc.encrypt(out_buff.read) + "\n"
|
353
|
+
else
|
354
|
+
out_buff.read
|
355
|
+
end
|
356
|
+
|
357
|
+
# Current runs at about 325 rows per second for prescription data 2016-05-09 ruby 2.3.1
|
358
|
+
# so try to log progress about every 15 seconds
|
359
|
+
if (i % 5000) == 0 && progress_monitor
|
360
|
+
progress_monitor.log_progress(t0, Time.current, i, csv.is_a?(Array) ? i : csv.pos, csv_size)
|
361
|
+
end
|
362
|
+
end
|
363
|
+
if (i % 5000) != 0 && progress_monitor
|
364
|
+
progress_monitor.log_progress(t0, Time.current, i, csv_size, csv_size)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
# Decrypt public key encrypted data to a CSV file
|
369
|
+
# encrypted_data can be an open IO object (a file), or an array of data rows
|
370
|
+
# out_data can be an open IO object or a StringIO -- CSV data is output
|
371
|
+
def decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname)
|
372
|
+
rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname),
|
373
|
+
File.read(private_key_fname))
|
374
|
+
encrypted_data.each do |crypto_data|
|
375
|
+
out_data << rsa_aes_cbc.decrypt(crypto_data)
|
376
|
+
end
|
377
|
+
end
|
378
|
+
end
|
379
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'logger'
|
3
|
+
require_relative 'pseudonymised_file_wrapper'
|
4
|
+
|
5
|
+
# This is primarily a CLI to pseudonymised_file_wrapper.rb, with a few additional bells
|
6
|
+
# and whistles. For details about the output format of individual files, see the comments
|
7
|
+
# in the wrapper.
|
8
|
+
#
|
9
|
+
# run: bundle exec ruby pseudonymised_file_converter.rb <filename>
|
10
|
+
logger = Logger.new(STDOUT)
|
11
|
+
options = { mode: :pretty_write,
|
12
|
+
direction: :horizontal,
|
13
|
+
include_name: true,
|
14
|
+
comparison_mode: false }
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
opts.banner = 'Usage; pseudonymised_file_converter <filenames> [options]'
|
17
|
+
opts.on('-f',
|
18
|
+
'--fields',
|
19
|
+
'Report available fields') { options[:mode] = :report_fields }
|
20
|
+
opts.on('-v',
|
21
|
+
'--vertical',
|
22
|
+
'Report available fields vertically') { options[:direction] = :vertical }
|
23
|
+
opts.on('-n',
|
24
|
+
'--no-name',
|
25
|
+
'Exclude filename in horizontal printing') { options[:include_name] = false }
|
26
|
+
# Handy for inspecting numerous files from one provider with different field sets.
|
27
|
+
# This option figures out which fields are common to all the provided files, then
|
28
|
+
# groups files by the sets of fields which distinguish them
|
29
|
+
opts.on('-c',
|
30
|
+
'--compare-fields',
|
31
|
+
'Figure out available filds') { options[:comparison_mode] = true }
|
32
|
+
opts.on('-b', '--batch x y z', Array, 'Not yet implemented!') do |list|
|
33
|
+
options[:files] = list
|
34
|
+
end
|
35
|
+
end.parse!
|
36
|
+
|
37
|
+
# ARGV is always an Array (truthy even when empty), so test for emptiness.
# Comparison mode is exempt because it also reads filenames from STDIN.
raise 'No filename provided' if ARGV.empty? && !options[:comparison_mode]
|
38
|
+
|
39
|
+
if options[:comparison_mode]
|
40
|
+
results = {}
|
41
|
+
(ARGV + STDIN.readlines.map(&:strip)).each do |file|
|
42
|
+
fw = PseudonymisedFileWrapper.new(file)
|
43
|
+
fw.process
|
44
|
+
results[file] = fw.available_fields
|
45
|
+
end
|
46
|
+
|
47
|
+
common_fields = results.map { |_k, v| v }.inject(:&)
|
48
|
+
logger.debug 'Common fields: '
|
49
|
+
common_fields.each do |field|
|
50
|
+
logger.debug "\t#{field}"
|
51
|
+
end
|
52
|
+
|
53
|
+
files_and_fields = results.map { |k, v| [k, v - common_fields] }
|
54
|
+
|
55
|
+
files_and_fields.chunk { |_k, v| v } .each do |_k, v|
|
56
|
+
logger.debug '********* Field Chunk *********'
|
57
|
+
if v[0][0]
|
58
|
+
v[0][1].each do |field|
|
59
|
+
logger.debug "\t#{field}"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
logger.debug ''
|
64
|
+
v.each do |file, _fields|
|
65
|
+
logger.debug "\t#{file}"
|
66
|
+
end
|
67
|
+
logger.debug ''
|
68
|
+
end
|
69
|
+
else
|
70
|
+
ARGV.each do |file|
|
71
|
+
logger.debug file
|
72
|
+
logger.debug file.class
|
73
|
+
fw = PseudonymisedFileWrapper.new(file)
|
74
|
+
fw.process
|
75
|
+
case options[:mode]
|
76
|
+
when :pretty_write
|
77
|
+
fw.pretty_write
|
78
|
+
when :report_fields
|
79
|
+
case options[:direction]
|
80
|
+
when :horizontal
|
81
|
+
logger.debug "#{file if options[:include_name]}: #{fw.available_fields.sort}"
|
82
|
+
when :vertical
|
83
|
+
logger.debug "#{file}: "
|
84
|
+
fw.available_fields.sort.each do |field|
|
85
|
+
logger.debug "\t#{field}"
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# *************** Read in the file, parsing and recording fields in each line **************
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'csv'
|
3
|
+
# require 'pry'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
# To convert files from the command line, see pseudonymised_file_converter.rb, which has a
|
7
|
+
# CLI set up. To use this wrapper to convert files from within a ruby program:
|
8
|
+
#
|
9
|
+
# wrapper = PseudonymisedFileWrapper.new(<filename>)
|
10
|
+
# wrapper.process
|
11
|
+
# wrapper.pretty_write
|
12
|
+
#
|
13
|
+
# This will create an excel-readable copy of the file in the same location as the original.
|
14
|
+
# The new file will be named the same as the original, with .pseudo converted to _pretty.csv
|
15
|
+
# There is a column for every field present in any record, and the column name is prefixed
|
16
|
+
# by 'mapped' or 'raw' according to which column it was in in the .pseudo version.
|
17
|
+
# As this is only intended for human viewing, the values of encrypted fields are not output.
|
18
|
+
# This conveniently has the effect of making the csv files notably smaller than their
|
19
|
+
# .pseudo counterparts
|
20
|
+
#
|
21
|
+
|
22
|
+
# Provide the ability to extract fieldnames and create CSV output from .pseudo files
|
23
|
+
class PseudonymisedFileWrapper
  # Captures everything before a .csv, .zip.pseudo, .xls.pseudo or .xlsx.pseudo
  # suffix (case-insensitive); used to name the "pretty" output file.
  BASE_NAME_PATTERN = /(?<base_name>.*)\.(?:csv|(?:zip|xlsx?)\.pseudo)/i

  # filename - path of the pseudonymised file to read
  def initialize(filename)
    @filename = filename
    @logger = Logger.new(STDOUT)
  end

  # Sorted, de-duplicated list of every field name collected by #process from
  # either JSON column. #process must be called first.
  def available_fields
    (@all_fields1 + @all_fields2).sort.uniq
  end

  # Read in the source file, accumulating all the field names used in any row
  #
  # Rows with 1 column are treated as headers and ignored; rows with 7 columns
  # have columns 4 and 6 parsed as JSON objects and columns 0, 1 and 2 kept
  # as-is. Any other row size is logged and skipped.
  def process
    line_counter = 1
    processed_lines = []
    all_fields1 = []
    all_fields2 = []
    CSV.foreach(@filename) do |row|
      case row.size
      when 1
        # Header; do nothing
      when 7
        cur = { map1: JSON.parse(row[4]),
                map2: JSON.parse(row[6]),
                id1: row[0],
                id2: row[1],
                keys: row[2] }
        processed_lines.push(cur)
        all_fields1.push(*cur[:map1].keys).uniq!
        all_fields2.push(*cur[:map2].keys).uniq!
      else
        @logger.debug "Line #{line_counter} contained unexpected number of fields: #{row.size}"
      end
      line_counter += 1
    end
    # NOTE(review): line_counter starts at 1, so this is rows read + 1 -- confirm
    # whether any consumer of @lines expects the plain row count.
    @lines = line_counter
    @all_fields1 = all_fields1
    @all_fields2 = all_fields2
    @processed_lines = processed_lines
  end

  # Create an excel-readable CSV file, in the same location as the original
  #
  # Raises ArgumentError if @filename does not have a recognised suffix, since
  # no output name can be derived from it.
  def pretty_write
    # Use the MatchData directly: $LAST_MATCH_INFO is only defined after
    # require 'English', which this file does not load.
    match = BASE_NAME_PATTERN.match(@filename)
    raise(ArgumentError, "Cannot derive output filename from #{@filename.inspect}") unless match

    target_filename = "#{match[:base_name]}_pretty.csv"
    @logger.debug "Writing output to #{target_filename}"
    CSV.open(target_filename, 'w') { |file| write_pretty_rows(file) }
  end

  # Same content as #pretty_write, returned as a CSV String instead of written to disk
  def pretty_data
    CSV.generate { |csv| write_pretty_rows(csv) }
  end

  private

  # Column titles: one per mapped field, one per raw field, then the three id/key columns
  def pretty_headers
    @all_fields1.map { |name| "mapped:#{name}" } +
      @all_fields2.map { |name| "raw:#{name}" } +
      %w(pseudo_id1 pseudo_id2 key_bundle)
  end

  # Append the header row and one row per processed line to csv (anything responding to <<)
  def write_pretty_rows(csv)
    csv << pretty_headers
    @processed_lines.each do |line|
      output_fields = @all_fields1.map { |field| line[:map1][field] } +
                      @all_fields2.map { |field| line[:map2][field] }
      output_fields.push(line[:id1], line[:id2], line[:keys])
      csv << output_fields
    end
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'securerandom'
|
3
|
+
require 'base64'
|
4
|
+
|
5
|
+
module NdrPseudonymise
|
6
|
+
# Simple pseudonymisation library, for efficient pseudonymisation of
|
7
|
+
# identifiable data, suitable for fuzzy matching
|
8
|
+
#
|
9
|
+
# Sample usage:
|
10
|
+
# Set up clinical data and demographics
|
11
|
+
# clinical_data = ... load pdf file ...
|
12
|
+
# all_demographics = {'nhsnumber' => '1234567881', 'postcode' => 'CB22 3AD',
|
13
|
+
# 'birthdate' => '1975-10-22', 'surname' => 'SMITH', 'forenames' => 'JOHN ROBERT'}
|
14
|
+
#
|
15
|
+
# # Generate pseudonymised identifiers and encryption keys
|
16
|
+
# (pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key) =
|
17
|
+
# NdrPseudonymise::SimplePseudonymisation.generate_keys(salt_id, salt_demog, salt_clinical,
|
18
|
+
# all_demographics['nhsnumber'], all_demographics['postcode'], all_demographics['birthdate'])
|
19
|
+
#
|
20
|
+
# # Emit first 4 values as index demographics
|
21
|
+
# emit_index_demographics(pseudo_id1, pseudo_id2, key_bundle, rowid)
|
22
|
+
#
|
23
|
+
# # Encrypt all demographics with demog_key
|
24
|
+
# emit_encrypted_demographics(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, all_demographics.to_json))
|
25
|
+
#
|
26
|
+
# # Encrypt all clinical data with clinical_key
|
27
|
+
# emit_encrypted_clinical_data(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data(clinical_key, clinical_data))
|
28
|
+
#
|
29
|
+
class SimplePseudonymisation
|
30
|
+
# Generate pseudonymised identifiers and pseudonymisation keys
|
31
|
+
# Returns an array of 6 strings:
|
32
|
+
# [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
|
33
|
+
def self.generate_keys(salt_id, salt_demog, salt_clinical, nhsnumber, current_postcode, birthdate)
|
34
|
+
unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
|
35
|
+
raise 'Invalid NHS number'
|
36
|
+
end
|
37
|
+
unless current_postcode.is_a?(String) && current_postcode =~ /\A[A-Z0-9 ]*\Z/
|
38
|
+
raise 'Invalid postcode'
|
39
|
+
end
|
40
|
+
unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z/
|
41
|
+
raise 'Invalid birthdate'
|
42
|
+
end
|
43
|
+
real_id1 = 'nhsnumber_' + nhsnumber
|
44
|
+
# Delete spaces from postcode
|
45
|
+
real_id2 = 'birthdate_postcode_' + birthdate + '_' + current_postcode.split(' ').join('')
|
46
|
+
|
47
|
+
pseudo_id1 = data_hash(real_id1, salt_id)
|
48
|
+
pseudo_id2 = data_hash(real_id2, salt_id)
|
49
|
+
demog_key = random_key
|
50
|
+
clinical_key = random_key
|
51
|
+
keys = []
|
52
|
+
if nhsnumber.length > 0
|
53
|
+
keys += [encrypt_data64(real_id1 + salt_demog, demog_key),
|
54
|
+
encrypt_data64(real_id1 + salt_clinical, clinical_key)]
|
55
|
+
end
|
56
|
+
if current_postcode.length > 0 && birthdate.length > 0
|
57
|
+
keys += [encrypt_data64(real_id2 + salt_demog, demog_key),
|
58
|
+
encrypt_data64(real_id2 + salt_clinical, clinical_key)]
|
59
|
+
end
|
60
|
+
# TODO: Consider whether it's worth storing something, if keys would otherwise be empty.
|
61
|
+
key_bundle = keys.join(' ')
|
62
|
+
rowid = random_key
|
63
|
+
[pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
|
64
|
+
end
|
65
|
+
|
66
|
+
# Generate pseudonymised identifiers and pseudonymisation keys
|
67
|
+
# for data with only an NHS number (missing patient postcode or DOB), where
|
68
|
+
# only the demographics need to be pseudonymised (e.g. prescription data).
|
69
|
+
# Returns an array of 3 strings:
|
70
|
+
# [pseudo_id1, key_bundle, demog_key]
|
71
|
+
def self.generate_keys_nhsnumber_demog_only(salt_id, salt_demog, nhsnumber)
|
72
|
+
unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
|
73
|
+
raise 'Invalid NHS number'
|
74
|
+
end
|
75
|
+
real_id1 = 'nhsnumber_' + nhsnumber
|
76
|
+
|
77
|
+
pseudo_id1 = data_hash(real_id1, salt_id)
|
78
|
+
demog_key = random_key
|
79
|
+
key_bundle = if nhsnumber.length > 0
|
80
|
+
encrypt_data64(real_id1 + salt_demog, demog_key)
|
81
|
+
else
|
82
|
+
''
|
83
|
+
end
|
84
|
+
[pseudo_id1, key_bundle, demog_key]
|
85
|
+
end
|
86
|
+
|
87
|
+
def self.data_hash(value, salt)
|
88
|
+
Digest::SHA2.hexdigest(value.to_s + salt.to_s)
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.random_key
|
92
|
+
SecureRandom.hex(32) # 32 bytes = 256 bits
|
93
|
+
end
|
94
|
+
|
95
|
+
# returns a base-64 encoded string
|
96
|
+
def self.encrypt_data64(key, data)
|
97
|
+
Base64.strict_encode64(encrypt_data(key, data))
|
98
|
+
end
|
99
|
+
|
100
|
+
# returns a binary string
|
101
|
+
def self.encrypt_data(key, data)
|
102
|
+
unless key =~ /[0-9a-f]{32}/
|
103
|
+
raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
|
104
|
+
end
|
105
|
+
aes = OpenSSL::Cipher.new('AES-256-CBC')
|
106
|
+
aes.encrypt
|
107
|
+
aes.key = Digest::SHA256.digest(key)
|
108
|
+
aes.update(data) + aes.final
|
109
|
+
end
|
110
|
+
|
111
|
+
def self.decrypt_data64(key, data)
|
112
|
+
decrypt_data(key, Base64.strict_decode64(data))
|
113
|
+
end
|
114
|
+
|
115
|
+
# Decrypt a binary string with AES-256-CBC, using the SHA-256 digest of the
# (chomped) key as the cipher key. No IV is assigned here.
#
# key  - String that must contain a run of 32 characters from 0-9 / a-f
# data - binary ciphertext
#
# Returns the decrypted binary string.
# Raises ArgumentError if key fails the hex check.
def self.decrypt_data(key, data)
|
116
|
+
# The pattern is unanchored, so it only requires 32 consecutive hex characters
# somewhere in key. NOTE(review): 32 hex characters are 128 bits, although the
# message below says 256 -- confirm which is intended.
unless key =~ /[0-9a-f]{32}/
|
117
|
+
raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
|
118
|
+
end
|
119
|
+
aes = OpenSSL::Cipher.new('AES-256-CBC')
|
120
|
+
aes.decrypt
|
121
|
+
# NOTE(review): the key is chomped here, but encrypt_data in this class hashes
# the key without chomp, so a key ending in a newline would not round-trip.
aes.key = Digest::SHA256.digest(key.chomp)
|
122
|
+
(aes.update(data) + aes.final)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|