ndr_pseudonymise 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +64 -0
- data/Rakefile +14 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/ndr_pseudonymise/client.rb +115 -0
- data/lib/ndr_pseudonymise/demographics_only_pseudonymiser.rb +88 -0
- data/lib/ndr_pseudonymise/engine.rb +6 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/command_line.rb +194 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/encrypted_object.rb +124 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/remote_repository.rb +44 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/repository.rb +165 -0
- data/lib/ndr_pseudonymise/ndr_encrypt.rb +10 -0
- data/lib/ndr_pseudonymise/prescription_pseudonymiser.rb +71 -0
- data/lib/ndr_pseudonymise/progress_printer.rb +53 -0
- data/lib/ndr_pseudonymise/pseudonymisation_specification.rb +379 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_converter.rb +92 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_wrapper.rb +96 -0
- data/lib/ndr_pseudonymise/simple_pseudonymisation.rb +125 -0
- data/lib/ndr_pseudonymise/version.rb +3 -0
- data/lib/ndr_pseudonymise.rb +16 -0
- data/lib/rsa_aes_cbc.rb +114 -0
- data/ndr_pseudonymise.gemspec +36 -0
- data/script/ndr_encrypt/README.md +154 -0
- data/script/ndr_encrypt/ndr_encrypt +4 -0
- metadata +197 -0
data/lib/ndr_pseudonymise/pseudonymisation_specification.rb
@@ -0,0 +1,379 @@
require 'securerandom'
require 'json'
require 'csv'
require 'stringio'
require 'rsa_aes_cbc'

module NdrPseudonymise
  # Pseudonymise CSV data for matching purposes
  # Sample format spec:
  # {:core_demographics => [[[0, ' ']],
  #                         [[1, ' ', :upcase], [2, ' ', :upcase]]],
  #  :columns => [
  #    {:title => 'nhsnumber', :maxlength => 12, :format => '\A[0-9A-Z]*\Z',
  #     :format_msg => 'Must contain only numbers, or numbers and letters for old NHS numbers'},
  #    {:title => 'dob', :format => '\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z',
  #     :format_msg => 'Must have format YYYY-MM-DD, e.g. 2013-08-20',
  #     :canonical_title => 'birthdate'},
  #    {:title => 'postcode'},
  #    {:title => 'surname'},
  #    {:title => 'data1'},
  #    {:title => 'data2'},
  #  ],
  #  :demographics => [0, 1, 2, 3],
  # }
  # -- delete spaces, upcase, use columns 0+1, 0+2 as keys for core demographics
  # -- treat columns 0, 1, 2, 3 as demographics
  class PseudonymisationSpecification
    KEY_BYTES = 32 # length of randomly generated keys (32 bytes = 256 bits)
    PREAMBLE_V1_STRIPED = 'Pseudonymised matching data v1.0-striped'.freeze
    HEADER_ROW_PREFIX = 'HEADER_ROW'.freeze

    def initialize(format_spec, key_bundle)
      @format_spec = format_spec
      [:core_demographics, :columns, :demographics, :encrypt_clinical].each do |k|
        unless @format_spec.key?(k)
          raise(ArgumentError, "Expected format_spec to have a #{k.inspect} section")
        end
      end
      @format_spec[:columns].each_with_index do |col, i|
        raise(ArgumentError, "Expected format_spec to have a title for column #{i}") unless col.key?(:title)
      end
      unless [true, false].include?(@format_spec[:encrypt_clinical])
        raise(ArgumentError, 'Expected encrypt_clinical to be true or false')
      end
      @salt1 = key_bundle[:salt1]
      @salt2 = key_bundle[:salt2]
      raise(ArgumentError, 'Invalid salt1') unless @salt1 =~ /\A[0-9a-f]*\Z/ && @salt1.size >= 64
      raise(ArgumentError, 'Invalid salt2') unless @salt2 =~ /\A[0-9a-f]*\Z/ && @salt2.size >= 64
    end

    # Builds a pseudonymiser with the preferred pseudonymisation class of the given format spec
    def self.factory(format_spec, key_bundle)
      klass_name = format_spec[:pseudonymisation_class]
      if klass_name
        # Support existing format specifications.
        # (Pseudonymisation classes have now moved to NdrPseudonymise namespace.)
        klass_name = klass_name.sub!(/^Pseudonymisation::/, 'NdrPseudonymise::')
        klass = Object.const_get(klass_name)
        unless klass <= NdrPseudonymise::PseudonymisationSpecification
          raise(ArgumentError, "Invalid pseudonymisation_class #{klass_name}")
        end
      else
        klass = NdrPseudonymise::PseudonymisationSpecification
      end
      klass.new(format_spec, key_bundle)
    end

    def random_key
      SecureRandom.hex(KEY_BYTES)
    end

    # Returns arrays of core demographics field values, each of the form
    # e.g. [[['nhsnumber', '1234567881']],
    #       [['birthdate', '2010-08-21'], ['postcode', 'CB22 3AD']]]
    # Column titles can be remapped using a :canonical_title entry, to ensure
    # consistent pseudo_ids even when column titles are predefined.
    def core_demographics(row)
      @format_spec[:core_demographics].collect do |fields|
        fields.collect do |col_num, delchar, modifier|
          val = row[col_num].to_s
          val = val.to_s.delete(delchar) if delchar
          case modifier
          when :upcase
            val = val.upcase
          when nil
          else
            raise "Unknown modifier #{modifier.inspect} for core_demographics"
          end
          row_spec = @format_spec[:columns][col_num]
          [row_spec[:canonical_title] || row_spec[:title], val]
        end
      end
    end

    # List of pseudonymised ids, based on this row's core demographics + salt1
    def real_ids(row)
      core_demographics(row).collect do |fields|
        (fields.collect(&:first) +
         fields.collect(&:last)).collect { |s| s.gsub('_', '__') }.join('_')
      end
    end

    # Convert a real id to a pseudonymised id
    def pseudo_id(real_id)
      data_hash(real_id, @salt1)
    end

    def data_hash(value, salt)
      Digest::SHA2.hexdigest(value.to_s + salt.to_s)
    end

    def encrypt_data(data, pseudo_id, partial_crypt_key, salt)
      if [pseudo_id, partial_crypt_key, salt].any? { |s| s.to_s.blank? }
        raise(ArgumentError, 'Expected all key arguments to be non-blank')
      end
      key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
      # unless key =~ /\A[0-9a-f]+\Z/
      #   raise(ArgumentError, 'Expected key to be all hex characters (0-9, a-f)')
      # end
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.encrypt
      aes.key = Digest::SHA256.digest(key)
      Base64.strict_encode64(aes.update(data) + aes.final)
    end

    def decrypt_data(data, pseudo_id, partial_crypt_key, salt)
      key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(key)
      aes.update(Base64.strict_decode64(data)) + aes.final
    end

    def self.get_key_bundle(key_fname, admin_password)
      data = File.read(key_fname)
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(admin_password)
      begin
        bundle = YAML.load(aes.update(Base64.decode64(data)) + aes.final)
        # Check that the bundle decoded successfully
        raise('Invalid bundle - not a hash') unless bundle.is_a?(Hash)
        bundle
      rescue # => e # Lint/UselessAssignment
        raise('Wrong password or invalid bundle')
      end
    end

    def all_demographics(row)
      # TODO: What about rows with missing fields?
      result = []
      demographics_cols = @format_spec[:demographics]
      row.each_with_index do |x, i|
        result << x if demographics_cols.include?(i)
      end
      result
    end

    def clinical_data(row)
      # TODO: What about rows with missing fields?
      result = []
      demographics_cols = @format_spec[:demographics]
      row.each_with_index do |x, i|
        result << x unless demographics_cols.include?(i)
      end
      result
    end

    # Pseudonymise a row of data, returning 3 sets of rows:
    # [index_rows, demographics_rows, clinical_rows]
    def pseudonymise_row(row)
      index_rows = []
      demographics_rows = []
      clinical_rows = []
      real_ids(row).each do |real_id|
        pseudo = pseudo_id(real_id)
        row_key = random_key
        partial_crypt_key1 = random_key # middle bit of crypto key
        if @format_spec[:encrypt_clinical]
          partial_crypt_key2 = random_key # middle bit of crypto key
          index_rows << [pseudo, row_key, partial_crypt_key1, partial_crypt_key2]
        else
          index_rows << [pseudo, row_key, partial_crypt_key1]
        end
        # demographics and clinical files only have non-information-bearing keys
        demographics_rows << [row_key,
                              encrypt_data(safe_json(all_demographics(row)),
                                           pseudo, partial_crypt_key1, @salt2)]
        safe_clinical = safe_json(clinical_data(row))
        if @format_spec[:encrypt_clinical]
          safe_clinical = encrypt_data(safe_clinical,
                                       pseudo, partial_crypt_key2, @salt2)
        end
        clinical_rows << [row_key, safe_clinical]
      end
      [index_rows, demographics_rows, clinical_rows]
    end

    # Convert data to json, but raise exception if it won't safely deserialise
    def safe_json(data)
      result = data.to_json
      unless data == JSON.load(result)
        raise(ArgumentError, "Expected consistent JSON serialisation of #{data.inspect}")
      end
      result
    end

    # Return true if this row is a valid header row, according to the spec
    def header_row?(row)
      expected_keys = @format_spec[:columns].collect { |col| col[:title] }
      row_keys = row.collect(&:downcase)
      if (row_keys & expected_keys).size >= 3 # at least 3 common keys
        if row_keys == expected_keys
          true # Only expected keys, in right order
        else
          raise(ArgumentError, "Error: invalid header row; expected keys #{expected_keys.inspect}, actually #{row_keys.inspect}")
        end
      else
        false
      end
    end

    # Return false if this row is a valid data row, otherwise a list of errors
    def row_errors(row)
      @check_cols ||= begin
        check_cols = []
        @format_spec[:columns].each_with_index do |col, i|
          # Unpack column checking meta-data proactively
          if col[:maxlength] || col[:format]
            check_cols << [col, i, col[:maxlength],
                           col[:format] && Regexp.new(col[:format])]
          end
        end
        check_cols
      end
      @dmax ||= @format_spec[:core_demographics].flatten(1).collect(&:first).max
      if row.size <= @dmax + 1
        "Missing core demographics: at least #{@dmax} columns expected"
      elsif row[@format_spec[:columns].size..-1].to_a.any? { |s| !s.blank? }
        "Too many columns (#{row.size}); expected #{@format_spec[:columns].size}"
      else
        # Check field formats
        errs = []
        @check_cols.each do |col, i, col_maxlength, col_format_re|
          val = row[i].to_s # Missing columns treated as blank
          if col_maxlength && val.size > col_maxlength
            errs << "Field #{col[:title]} (column #{i + 1}) is longer than maxlength #{col[:maxlength]}."
          end
          if col_format_re
            unless col_format_re.match(val)
              if col[:format_msg]
                errs << "Field #{col[:title]} (column #{i + 1}) #{col[:format_msg]} -- invalid value: #{val}"
              else
                errs << "Field #{col[:title]} (column #{i + 1}) does not match format #{col[:format].inspect} -- invalid value: #{val}"
              end
            end
          end
        end
        if errs.empty?
          false
        else
          errs.join(', ')
        end
      end
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V1_STRIPED]
    end

    # Append the output of pseudonymise_row to a CSV file
    def emit_csv_rows(out_csv, pseudonymised_row)
      (index_rows, demographics_rows, clinical_rows) = pseudonymised_row
      unless index_rows.size == demographics_rows.size &&
             index_rows.size == clinical_rows.size
        raise(ArgumentError, <<-ERROR
          Mismatch in number of index_rows (#{index_rows.size})
          vs demographics_rows (#{demographics_rows.size})
          vs clinical_rows (#{clinical_rows.size})
        ERROR
             )
      end

      index_rows.zip(demographics_rows).zip(clinical_rows).collect do |(index_row, demographics_row), clinical_row|
        # Alternate each of 3 data types into 1 output file
        out_csv << index_row
        out_csv << demographics_row
        out_csv << clinical_row
      end
    end

    # csv_data can be an open IO object (a CSV file), or an array of data rows
    # out_data can be an open IO object or a StringIO -- CSV data is output
    # public_key_fname supports public key encryption of the output
    # progress_monitor is an object for reporting progress, that responds to
    # log_progress(start_time, time_now, csv_row, progress, total)
    # where progress and total are in the same units, either bytes or rows
    def pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil)
      csv_lib = CSV
      if csv_data.is_a?(IO) || csv_data.is_a?(StringIO)
        csv = csv_lib.new(csv_data)
      elsif csv_data.is_a?(Array)
        csv = csv_data
      else
        raise(ArgumentError, 'Expected an IO or Array of rows, not a filename for csv_data')
      end

      if public_key_fname
        unless File.exist?(public_key_fname)
          raise(ArgumentError, "Missing public key file: #{public_key_fname}")
        end
        rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname), nil)
      end

      unless out_data.respond_to?('<<')
        raise(ArgumentError, 'Expected an IO or writeable structure for out_data')
      end
      out_buff = StringIO.new
      out_csv = csv_lib.new(out_buff)
      out_csv << csv_header_row
      out_buff.rewind
      out_data <<
        if public_key_fname
          rsa_aes_cbc.encrypt(out_buff.read) + "\n"
        else
          out_buff.read
        end

      i = 0
      t0 = Time.current
      csv_size = progress_monitor && csv_data.size
      csv.each do |row|
        out_buff = StringIO.new
        out_csv = csv_lib.new(out_buff)
        i += 1
        if i == 1 && header_row?(row)
          # Preserve header row in output
          out_csv << [HEADER_ROW_PREFIX] + row
        else
          errs = row_errors(row)
          raise("Invalid row #{i}: #{errs}") if errs
          begin
            emit_csv_rows(out_csv, pseudonymise_row(row))
          rescue ArgumentError, RuntimeError => e
            raise(ArgumentError, "Invalid row #{i}: #{e}", e.backtrace)
          end
        end
        out_buff.rewind
        out_data <<
          if public_key_fname
            rsa_aes_cbc.encrypt(out_buff.read) + "\n"
          else
            out_buff.read
          end

        # Current runs at about 325 rows per second for prescription data 2016-05-09 ruby 2.3.1
        # so try to log progress about every 15 seconds
        if (i % 5000) == 0 && progress_monitor
          progress_monitor.log_progress(t0, Time.current, i, csv.is_a?(Array) ? i : csv.pos, csv_size)
        end
      end
      if (i % 5000) != 0 && progress_monitor
        progress_monitor.log_progress(t0, Time.current, i, csv_size, csv_size)
      end
    end

    # Decrypt public key encrypted data to a CSV file
    # encrypted_data can be an open IO object (a file), or an array of data rows
    # out_data can be an open IO object or a StringIO -- CSV data is output
    def decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname)
      rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname),
                                    File.read(private_key_fname))
      encrypted_data.each do |crypto_data|
        out_data << rsa_aes_cbc.decrypt(crypto_data)
      end
    end
  end
end
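For orientation, a minimal usage sketch of the class above (illustrative only, not part of the packaged files). It builds a format spec like the one in the class comment, plus the :encrypt_clinical flag that #initialize checks, and runs an in-memory CSV through #pseudonymise_csv. The salts, columns and sample row are invented, and it assumes the gem's runtime dependencies (OpenSSL, Base64, and the ActiveSupport extensions Time.current / String#blank? that the code calls) are available on the load path.

require 'securerandom'
require 'stringio'
require 'openssl'
require 'base64'
require 'digest/sha2'
require 'active_support/all' # the spec code relies on Time.current and String#blank?
require 'ndr_pseudonymise/pseudonymisation_specification'

# Illustrative format spec: NHS number, and birthdate+postcode, as core demographics.
format_spec = {
  core_demographics: [[[0, ' ']], [[1, ' ', :upcase], [2, ' ', :upcase]]],
  columns: [{ title: 'nhsnumber' }, { title: 'dob', canonical_title: 'birthdate' },
            { title: 'postcode' }, { title: 'surname' }, { title: 'data1' }],
  demographics: [0, 1, 2, 3],
  encrypt_clinical: false
}
# salt1/salt2 must be hex strings of at least 64 characters.
key_bundle = { salt1: SecureRandom.hex(32), salt2: SecureRandom.hex(32) }

spec   = NdrPseudonymise::PseudonymisationSpecification.factory(format_spec, key_bundle)
input  = StringIO.new("1234567881,2010-08-21,CB22 3AD,SMITH,some clinical value\n")
output = StringIO.new
spec.pseudonymise_csv(input, output)
puts output.string # preamble row, then index / demographics / clinical rows per input row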
data/lib/ndr_pseudonymise/pseudonymised_file_converter.rb
@@ -0,0 +1,92 @@
require 'optparse'
require 'logger'
require_relative 'pseudonymised_file_wrapper'

# This is primarily a CLI to pseudonymised_file_wrapper.rb, with a few additional bells
# and whistles. For details about the output format of individual files, see the comments
# in the wrapper.
#
# run: bundle exec ruby pseydonymised_file_converter.rb <filename>
logger = Logger.new(STDOUT)
options = { mode: :pretty_write,
            direction: :horizontal,
            include_name: true,
            comparison_mode: false }
OptionParser.new do |opts|
  opts.banner = 'Usage; pseudonymised_file_converter <filenames> [options]'
  opts.on('-f',
          '--fields',
          'Report available fields') { options[:mode] = :report_fields }
  opts.on('-v',
          '--vertical',
          'Report available fields vertically') { options[:direction] = :vertical }
  opts.on('-n',
          '--no-name',
          'Exclude filename in horizontal printing') { options[:include_name] = false }
  # Handy for inspecting numerous files form one provider with different field sets.
  # This option figures out which fields are common to all the provided files, then
  # groups files by the sets of fields which distinguish them
  opts.on('-c',
          '--compare-fields',
          'Figure out available filds') { options[:comparison_mode] = true }
  opts.on('-b', '--batch x y z', Array, 'Not yet implemented!') do |list|
    options[:files] = list
  end
end.parse!

raise 'No filename provided' unless ARGV

if options[:comparison_mode]
  results = {}
  (ARGV + STDIN.readlines.map(&:strip)).each do |file|
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    results[file] = fw.available_fields
  end

  common_fields = results.map { |_k, v| v }.inject(:&)
  logger.debug 'Common fields: '
  common_fields.each do |field|
    logger.debug "\t#{field}"
  end

  files_and_fields = results.map { |k, v| [k, v - common_fields] }

  files_and_fields.chunk { |_k, v| v } .each do |_k, v|
    logger.debug '********* Field Chunk *********'
    if v[0][0]
      v[0][1].each do |field|
        logger.debug "\t#{field}"
      end
    end

    logger.debug ''
    v.each do |file, _fields|
      logger.debug "\t#{file}"
    end
    logger.debug ''
  end
else
  ARGV.each do |file|
    logger.debug file
    logger.debug file.class
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    case options[:mode]
    when :pretty_write
      fw.pretty_write
    when :report_fields
      case options[:direction]
      when :horizontal
        logger.debug "#{file if options[:include_name]}: #{fw.available_fields.sort}"
      when :vertical
        logger.debug "#{file}: "
        fw.available_fields.sort.each do |field|
          logger.debug "\t#{field}"
        end
      end
    end
  end
end

# *************** Read in the file, parsing and recording fields in each line **************
data/lib/ndr_pseudonymise/pseudonymised_file_wrapper.rb
@@ -0,0 +1,96 @@
require 'json'
require 'csv'
# require 'pry'
require 'logger'

# To convert files from the command line, see pseudonymised_file_converter.rb, which has a
# CLI set up. To use this wrapper to convert files from within a ruby program:
#
# wrapper = PseudonymisedFileWrapper.new(<filename>)
# wrapper.process
# wrapper.pretty_write
#
# This will create an excel-readable copy of the file in the same location as the original.
# The new file will be named the same as the original, with .pseudo converted to _pretty.csv
# There is a column for every field present in any record, and the column name is prefixed
# by 'mapped' or 'raw' according to which column it was in in the .pseudo version.
# As this is only intended for human viewing, the values of encrypted fields are not output.
# This conveniently has the effect of making the csv files notable smaller than their
# .pseudo counterparts
#

# Provide the ability to extract fieldnames and create CSV output from .pseudo files
class PseudonymisedFileWrapper
  def initialize(filename)
    @filename = filename
    @logger = Logger.new(STDOUT)
  end

  def available_fields
    (@all_fields1 + @all_fields2).sort.uniq
  end

  # Read in the source file, accumulating all the field names used in any row
  def process
    line_counter = 1
    processed_lines = []
    all_fields1 = []
    all_fields2 = []
    CSV.foreach(@filename) do |row|
      if row.size == 1
        # Header; do nothing
      elsif row.size == 7
        cur = { map1: JSON.parse(row[4]),
                map2: JSON.parse(row[6]),
                id1: row[0],
                id2: row[1],
                keys: row[2] }
        processed_lines.push(cur)
        all_fields1.push(*cur[:map1].keys).uniq!
        all_fields2.push(*cur[:map2].keys).uniq!
      else
        @logger.debug"Line #{line_counter} contained unexpected number of fields: #{row.size}"
      end
      line_counter += 1
    end
    @lines = line_counter
    @all_fields1 = all_fields1
    @all_fields2 = all_fields2
    @processed_lines = processed_lines
  end

  # Create an excel-readable CSV file, in the same location as the original
  def pretty_write
    /(?<base_name>.*)\.(?:csv|(?:zip|xlsx?)\.pseudo)/i.match(@filename)
    target_filename = "#{$LAST_MATCH_INFO[:base_name]}_pretty.csv"
    @logger.debug "Writing output to #{target_filename}"
    CSV.open(target_filename, 'w') do |file|
      headers = (@all_fields1.map { |name| "mapped:#{name}" } +
                 @all_fields2.map { |name| "raw:#{name}" } +
                 %w(pseudo_id1 pseudo_id2 key_bundle))
      file << headers
      @processed_lines.each do |line|
        output_fields = @all_fields1.map { |field| line[:map1][field] } +
                        @all_fields2.map { |field| line[:map2][field] }
        output_fields.push(line[:id1], line[:id2], line[:keys])
        file << output_fields
      end
    end
  end

  def pretty_data
    csv_string = CSV.generate do |csv|
      headers = (@all_fields1.map { |name| "mapped:#{name}" } +
                 @all_fields2.map { |name| "raw:#{name}" } +
                 %w(pseudo_id1 pseudo_id2 key_bundle))
      csv << headers
      @processed_lines.each do |line|
        output_fields = @all_fields1.map { |field| line[:map1][field] } +
                        @all_fields2.map { |field| line[:map2][field] }
        output_fields.push(line[:id1], line[:id2], line[:keys])
        csv << output_fields
      end
    end
    csv_string
  end
end
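The header comment above covers pretty_write; the sketch below (illustrative only, not part of the packaged files, with a hypothetical input filename) shows the string-returning pretty_data variant instead of writing a _pretty.csv file next to the original.

require 'ndr_pseudonymise/pseudonymised_file_wrapper'

# 'example.xlsx.pseudo' is a placeholder for an existing pseudonymised file.
wrapper = PseudonymisedFileWrapper.new('example.xlsx.pseudo')
wrapper.process                  # read the file, collecting mapped/raw field names
p wrapper.available_fields       # sorted, de-duplicated union of both field sets
File.write('example_fields.csv', wrapper.pretty_data)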
data/lib/ndr_pseudonymise/simple_pseudonymisation.rb
@@ -0,0 +1,125 @@
require 'digest/sha1'
require 'securerandom'
require 'base64'

module NdrPseudonymise
  # Simple pseudonymisation library, for efficient pseudonymisation of
  # identifiable data, suitable for fuzzy matching
  #
  # Sample usage:
  # Set up clinical data and demographics
  # clinical_data = ... load pdf file ...
  # all_demographics = {'nhsnumber' => '1234567881', 'postcode' => 'CB22 3AD',
  #                     'birthdate' => '1975-10-22', 'surname' => 'SMITH', 'forenames' => 'JOHN ROBERT'}
  #
  # # Generate pseudonymised identifiers and encryption keys
  # (pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key) =
  #   NdrPseudonymise::SimplePseudonymisation.generate_keys(salt_id, salt_demog, salt_clinical,
  #     all_demographics['nhsnumber'], all_demographics['postcode'], all_demographics['birthdate'])
  #
  # # Emit first 4 values as index demographics
  # emit_index_demographics(pseudo_id1, pseudo_id2, key_bundle, rowid)
  #
  # # Encrypt all demographics with demog_key
  # emit_encrypted_demographics(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, all_demographics.to_json))
  #
  # # Encrypt all clinical data with clinical_key
  # emit_encrypted_clinical_data(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data(clinical_key, clinical_data))
  #
  class SimplePseudonymisation
    # Generate pseudonymised identifiers and pseudonymisation keys
    # Returns an array of 6 strings:
    # [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
    def self.generate_keys(salt_id, salt_demog, salt_clinical, nhsnumber, current_postcode, birthdate)
      unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
        raise 'Invalid NHS number'
      end
      unless current_postcode.is_a?(String) && current_postcode =~ /\A[A-Z0-9 ]*\Z/
        raise 'Invalid postcode'
      end
      unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z/
        raise 'Invalid birthdate'
      end
      real_id1 = 'nhsnumber_' + nhsnumber
      # Delete spaces from postcode
      real_id2 = 'birthdate_postcode_' + birthdate + '_' + current_postcode.split(' ').join('')

      pseudo_id1 = data_hash(real_id1, salt_id)
      pseudo_id2 = data_hash(real_id2, salt_id)
      demog_key = random_key
      clinical_key = random_key
      keys = []
      if nhsnumber.length > 0
        keys += [encrypt_data64(real_id1 + salt_demog, demog_key),
                 encrypt_data64(real_id1 + salt_clinical, clinical_key)]
      end
      if current_postcode.length > 0 && birthdate.length > 0
        keys += [encrypt_data64(real_id2 + salt_demog, demog_key),
                 encrypt_data64(real_id2 + salt_clinical, clinical_key)]
      end
      # TODO: Consider whether it's worth storing something, if keys would otherwise be empty.
      key_bundle = keys.join(' ')
      rowid = random_key
      [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
    end

    # Generate pseudonymised identifiers and pseudonymisation keys
    # for data with only an NHS number (missing patient postcode or DOB), where
    # only the demographics need to be pseudonymised (e.g. prescription data).
    # Returns an array of 3 strings:
    # [pseudo_id1, key_bundle, demog_key]
    def self.generate_keys_nhsnumber_demog_only(salt_id, salt_demog, nhsnumber)
      unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
        raise 'Invalid NHS number'
      end
      real_id1 = 'nhsnumber_' + nhsnumber

      pseudo_id1 = data_hash(real_id1, salt_id)
      demog_key = random_key
      key_bundle = if nhsnumber.length > 0
                     encrypt_data64(real_id1 + salt_demog, demog_key)
                   else
                     ''
                   end
      [pseudo_id1, key_bundle, demog_key]
    end

    def self.data_hash(value, salt)
      Digest::SHA2.hexdigest(value.to_s + salt.to_s)
    end

    def self.random_key
      SecureRandom.hex(32) # 32 bytes = 256 bits
    end

    # returns a base-64 encoded string
    def self.encrypt_data64(key, data)
      Base64.strict_encode64(encrypt_data(key, data))
    end

    # returns a binary string
    def self.encrypt_data(key, data)
      unless key =~ /[0-9a-f]{32}/
        raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
      end
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.encrypt
      aes.key = Digest::SHA256.digest(key)
      aes.update(data) + aes.final
    end

    def self.decrypt_data64(key, data)
      decrypt_data(key, Base64.strict_decode64(data))
    end

    def self.decrypt_data(key, data)
      unless key =~ /[0-9a-f]{32}/
        raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
      end
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(key.chomp)
      (aes.update(data) + aes.final)
    end
  end
end
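A short round-trip sketch of the class above (illustrative only, not part of the packaged files). The salts are invented hex values, and the NHS number, postcode and birthdate are the sample values from the usage comment; it shows that data encrypted under demog_key decrypts again, and that demog_key can be recovered from the key bundle by anyone able to reconstruct real_id1 + salt_demog.

require 'json'
require 'openssl'
require 'securerandom'
require 'ndr_pseudonymise/simple_pseudonymisation'

# Illustrative salts only; real salts are secret, pre-agreed values. Hex salts satisfy
# the run-of-32-hex-characters check made by encrypt_data/decrypt_data.
salt_id    = SecureRandom.hex(32)
salt_demog = SecureRandom.hex(32)
salt_clin  = SecureRandom.hex(32)

pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key =
  NdrPseudonymise::SimplePseudonymisation.generate_keys(
    salt_id, salt_demog, salt_clin, '1234567881', 'CB22 3AD', '1975-10-22')

# Round trip: encrypt demographics under demog_key, then decrypt them again.
demographics_json = { 'nhsnumber' => '1234567881', 'postcode' => 'CB22 3AD' }.to_json
encrypted = NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, demographics_json)
NdrPseudonymise::SimplePseudonymisation.decrypt_data64(demog_key, encrypted) == demographics_json # => true

# Recover demog_key from the first key bundle entry using real_id1 + salt_demog:
recovered = NdrPseudonymise::SimplePseudonymisation.decrypt_data64(
  'nhsnumber_1234567881' + salt_demog, key_bundle.split(' ').first)
recovered == demog_key # => true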