ndr_pseudonymise 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,379 @@
1
require 'securerandom'
require 'json'
require 'csv'
require 'stringio'
require 'digest'
require 'openssl'
require 'base64'
require 'yaml'
require 'rsa_aes_cbc'
6
+
7
module NdrPseudonymise
  # Pseudonymise CSV data for matching purposes
  # Sample format spec:
  # {:core_demographics => [[[0, ' ']],
  #                         [[1, ' ', :upcase], [2, ' ', :upcase]]],
  #  :columns => [
  #    {:title => 'nhsnumber', :maxlength => 12, :format => '\A[0-9A-Z]*\Z',
  #     :format_msg => 'Must contain only numbers, or numbers and letters for old NHS numbers'},
  #    {:title => 'dob', :format => '\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z',
  #     :format_msg => 'Must have format YYYY-MM-DD, e.g. 2013-08-20',
  #     :canonical_title => 'birthdate'},
  #    {:title => 'postcode'},
  #    {:title => 'surname'},
  #    {:title => 'data1'},
  #    {:title => 'data2'},
  #  ],
  #  :demographics => [0, 1, 2, 3],
  #  :encrypt_clinical => false,
  # }
  # -- delete spaces from column 0 and use it as the first core demographics key;
  #    delete spaces from and upcase columns 1+2 and use them as the second key
  # -- treat columns 0, 1, 2, 3 as demographics
  # -- :encrypt_clinical is mandatory and must be true or false
  class PseudonymisationSpecification
    KEY_BYTES = 32 # length of randomly generated keys (32 bytes = 256 bits)
    PREAMBLE_V1_STRIPED = 'Pseudonymised matching data v1.0-striped'.freeze
    HEADER_ROW_PREFIX = 'HEADER_ROW'.freeze

    # format_spec: Hash with :core_demographics, :columns, :demographics and
    #   :encrypt_clinical sections (see the sample above).
    # key_bundle: Hash with :salt1 and :salt2, each a lower-case hex string of
    #   at least 64 characters.
    # Raises ArgumentError if a section, a column title or a salt is missing/invalid.
    def initialize(format_spec, key_bundle)
      @format_spec = format_spec
      %i[core_demographics columns demographics encrypt_clinical].each do |k|
        unless @format_spec.key?(k)
          raise(ArgumentError, "Expected format_spec to have a #{k.inspect} section")
        end
      end
      @format_spec[:columns].each_with_index do |col, i|
        raise(ArgumentError, "Expected format_spec to have a title for column #{i}") unless col.key?(:title)
      end
      unless [true, false].include?(@format_spec[:encrypt_clinical])
        raise(ArgumentError, 'Expected encrypt_clinical to be true or false')
      end
      @salt1 = key_bundle[:salt1]
      @salt2 = key_bundle[:salt2]
      raise(ArgumentError, 'Invalid salt1') unless valid_salt?(@salt1)
      raise(ArgumentError, 'Invalid salt2') unless valid_salt?(@salt2)
    end

    # Builds a pseudonymiser with the preferred pseudonymisation class of the given format spec.
    # Falls back to PseudonymisationSpecification when no :pseudonymisation_class is given.
    # Raises ArgumentError if the named class is not a PseudonymisationSpecification.
    def self.factory(format_spec, key_bundle)
      klass = NdrPseudonymise::PseudonymisationSpecification
      klass_name = format_spec[:pseudonymisation_class]
      if klass_name
        # Support existing format specifications.
        # (Pseudonymisation classes have now moved to NdrPseudonymise namespace.)
        # Non-destructive sub: sub! returns nil when nothing is replaced, and
        # would also modify (or fail on a frozen) string owned by the caller.
        klass_name = klass_name.sub(/\APseudonymisation::/, 'NdrPseudonymise::')
        klass = Object.const_get(klass_name)
        unless klass <= NdrPseudonymise::PseudonymisationSpecification
          raise(ArgumentError, "Invalid pseudonymisation_class #{klass_name}")
        end
      end
      klass.new(format_spec, key_bundle)
    end

    # Returns a fresh random key as a hex string of 2 * KEY_BYTES characters
    def random_key
      SecureRandom.hex(KEY_BYTES)
    end

    # Returns arrays of core demographics field values, each of the form
    # e.g. [[['nhsnumber', '1234567881']],
    #       [['birthdate', '2010-08-21'], ['postcode', 'CB22 3AD']]]
    # Column titles can be remapped using a :canonical_title entry, to ensure
    # consistent pseudo_ids even when column titles are predefined.
    # Raises RuntimeError for a modifier other than :upcase or nil.
    def core_demographics(row)
      @format_spec[:core_demographics].map do |fields|
        fields.map do |col_num, delchar, modifier|
          val = row[col_num].to_s # missing cells are treated as blank
          val = val.delete(delchar) if delchar
          case modifier
          when :upcase
            val = val.upcase
          when nil
            # no transformation requested
          else
            raise "Unknown modifier #{modifier.inspect} for core_demographics"
          end
          col_spec = @format_spec[:columns][col_num]
          [col_spec[:canonical_title] || col_spec[:title], val]
        end
      end
    end

    # List of real (identifiable) ids, one per core demographics group, built
    # from the group's titles followed by its values, joined with '_'.
    # Underscores inside a title or value are doubled so the join is unambiguous.
    def real_ids(row)
      core_demographics(row).map do |fields|
        (fields.map(&:first) + fields.map(&:last)).map { |s| s.gsub('_', '__') }.join('_')
      end
    end

    # Convert a real id to a pseudonymised id (hash of real id + salt1)
    def pseudo_id(real_id)
      data_hash(real_id, @salt1)
    end

    # Hex digest (Digest::SHA2) of value followed by salt
    def data_hash(value, salt)
      Digest::SHA2.hexdigest(value.to_s + salt.to_s)
    end

    # Encrypts data with AES-256-CBC, using SHA-256 of pseudo_id +
    # partial_crypt_key + salt as the key. Returns a strict base-64 string.
    # Raises ArgumentError if any of the three key parts is blank.
    # NOTE(review): no IV is set on the cipher; output is only protected by
    # the per-row random partial key -- confirm before reusing elsewhere.
    def encrypt_data(data, pseudo_id, partial_crypt_key, salt)
      if [pseudo_id, partial_crypt_key, salt].any? { |s| blank_value?(s) }
        raise(ArgumentError, 'Expected all key arguments to be non-blank')
      end
      key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.encrypt
      aes.key = Digest::SHA256.digest(key)
      Base64.strict_encode64(aes.update(data) + aes.final)
    end

    # Inverse of encrypt_data: takes the strict base-64 string and the same
    # three key parts, returns the decrypted data.
    def decrypt_data(data, pseudo_id, partial_crypt_key, salt)
      key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(key)
      aes.update(Base64.strict_decode64(data)) + aes.final
    end

    # Reads the base-64, AES-256-CBC encrypted YAML key bundle in key_fname,
    # decrypting it with SHA-256 of admin_password. Returns the bundle Hash.
    # Raises RuntimeError 'Wrong password or invalid bundle' on any failure
    # to decrypt or parse.
    def self.get_key_bundle(key_fname, admin_password)
      data = File.read(key_fname)
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(admin_password)
      begin
        # NOTE(review): YAML.load can instantiate arbitrary objects on older
        # Psych versions; consider YAML.safe_load with Symbol permitted.
        bundle = YAML.load(aes.update(Base64.decode64(data)) + aes.final)
        # Check that the bundle decoded successfully
        raise('Invalid bundle - not a hash') unless bundle.is_a?(Hash)
        bundle
      rescue
        raise('Wrong password or invalid bundle')
      end
    end

    # Values of the columns listed in the :demographics section, in row order
    def all_demographics(row)
      # TODO: What about rows with missing fields?
      demographics_cols = @format_spec[:demographics]
      row.each_with_index.select { |_x, i| demographics_cols.include?(i) }.map(&:first)
    end

    # Values of the columns NOT listed in the :demographics section, in row order
    def clinical_data(row)
      # TODO: What about rows with missing fields?
      demographics_cols = @format_spec[:demographics]
      row.each_with_index.reject { |_x, i| demographics_cols.include?(i) }.map(&:first)
    end

    # Pseudonymise a row of data, returning 3 sets of rows:
    # [index_rows, demographics_rows, clinical_rows]
    # One entry is produced in each set per real id of the row.
    def pseudonymise_row(row)
      index_rows = []
      demographics_rows = []
      clinical_rows = []
      encrypt_clinical = @format_spec[:encrypt_clinical]
      real_ids(row).each do |real_id|
        pseudo = pseudo_id(real_id)
        row_key = random_key
        partial_crypt_key1 = random_key # middle bit of crypto key
        index_row = [pseudo, row_key, partial_crypt_key1]
        # demographics and clinical files only have non-information-bearing keys
        demographics_rows << [row_key,
                              encrypt_data(safe_json(all_demographics(row)),
                                           pseudo, partial_crypt_key1, @salt2)]
        safe_clinical = safe_json(clinical_data(row))
        if encrypt_clinical
          partial_crypt_key2 = random_key # middle bit of crypto key
          index_row << partial_crypt_key2
          safe_clinical = encrypt_data(safe_clinical,
                                       pseudo, partial_crypt_key2, @salt2)
        end
        index_rows << index_row
        clinical_rows << [row_key, safe_clinical]
      end
      [index_rows, demographics_rows, clinical_rows]
    end

    # Convert data to json, but raise ArgumentError if it won't safely deserialise
    def safe_json(data)
      result = data.to_json
      unless data == JSON.load(result)
        raise(ArgumentError, "Expected consistent JSON serialisation of #{data.inspect}")
      end
      result
    end

    # Return true if this row is a valid header row, according to the spec.
    # Returns false when fewer than 3 cells match expected titles; raises
    # ArgumentError when at least 3 match but the row is not exactly the
    # expected titles in order. Empty (nil) cells are treated as blank.
    def header_row?(row)
      expected_keys = @format_spec[:columns].map { |col| col[:title] }
      row_keys = row.map { |cell| cell.to_s.downcase }
      return false if (row_keys & expected_keys).size < 3 # fewer than 3 common keys
      return true if row_keys == expected_keys # Only expected keys, in right order

      raise(ArgumentError, "Error: invalid header row; expected keys #{expected_keys.inspect}, actually #{row_keys.inspect}")
    end

    # Return false if this row is a valid data row, otherwise a string
    # describing the errors
    def row_errors(row)
      @check_cols ||= column_checks
      # Highest column index used by any core demographics key
      @dmax ||= @format_spec[:core_demographics].flatten(1).map(&:first).max
      if row.size <= @dmax
        # Index @dmax must exist, i.e. the row needs at least @dmax + 1 columns
        "Missing core demographics: at least #{@dmax + 1} columns expected"
      elsif row[@format_spec[:columns].size..-1].to_a.any? { |s| !blank_value?(s) }
        "Too many columns (#{row.size}); expected #{@format_spec[:columns].size}"
      else
        # Check field formats
        errs = []
        @check_cols.each do |col, i, col_maxlength, col_format_re|
          val = row[i].to_s # Missing columns treated as blank
          if col_maxlength && val.size > col_maxlength
            errs << "Field #{col[:title]} (column #{i + 1}) is longer than maxlength #{col[:maxlength]}."
          end
          next unless col_format_re && !col_format_re.match(val)

          errs << if col[:format_msg]
                    "Field #{col[:title]} (column #{i + 1}) #{col[:format_msg]} -- invalid value: #{val}"
                  else
                    "Field #{col[:title]} (column #{i + 1}) does not match format #{col[:format].inspect} -- invalid value: #{val}"
                  end
        end
        errs.empty? ? false : errs.join(', ')
      end
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V1_STRIPED]
    end

    # Append the output of pseudonymise_row to a CSV file.
    # Raises ArgumentError if the three sets of rows differ in length.
    def emit_csv_rows(out_csv, pseudonymised_row)
      (index_rows, demographics_rows, clinical_rows) = pseudonymised_row
      unless index_rows.size == demographics_rows.size &&
             index_rows.size == clinical_rows.size
        raise(ArgumentError, <<-ERROR)
          Mismatch in number of index_rows (#{index_rows.size})
          vs demographics_rows (#{demographics_rows.size})
          vs clinical_rows (#{clinical_rows.size})
        ERROR
      end

      index_rows.zip(demographics_rows, clinical_rows).each do |index_row, demographics_row, clinical_row|
        # Alternate each of 3 data types into 1 output file
        out_csv << index_row
        out_csv << demographics_row
        out_csv << clinical_row
      end
    end

    # csv_data can be an open IO object (a CSV file), or an array of data rows
    # out_data can be an open IO object or a StringIO -- CSV data is output
    # public_key_fname supports public key encryption of the output
    # progress_monitor is an object for reporting progress, that responds to
    #   log_progress(start_time, time_now, csv_row, progress, total)
    # where progress and total are in the same units, either bytes or rows
    # Raises ArgumentError for unsupported arguments or a row that fails to
    # pseudonymise, and RuntimeError for a row that fails validation.
    def pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil)
      csv = csv_reader_for(csv_data)
      rsa_aes_cbc = public_key_cipher(public_key_fname)
      unless out_data.respond_to?('<<')
        raise(ArgumentError, 'Expected an IO or writeable structure for out_data')
      end

      write_csv_chunk(out_data, rsa_aes_cbc) { |out_csv| out_csv << csv_header_row }

      i = 0
      t0 = current_time
      csv_size = progress_monitor && (csv_data.respond_to?(:size) ? csv_data.size : nil)
      csv.each do |row|
        i += 1
        write_csv_chunk(out_data, rsa_aes_cbc) do |out_csv|
          if i == 1 && header_row?(row)
            # Preserve header row in output
            out_csv << [HEADER_ROW_PREFIX] + row
          else
            errs = row_errors(row)
            raise("Invalid row #{i}: #{errs}") if errs
            begin
              emit_csv_rows(out_csv, pseudonymise_row(row))
            rescue ArgumentError, RuntimeError => e
              raise(ArgumentError, "Invalid row #{i}: #{e}", e.backtrace)
            end
          end
        end

        # Current runs at about 325 rows per second for prescription data 2016-05-09 ruby 2.3.1
        # so try to log progress about every 15 seconds
        if (i % 5000) == 0 && progress_monitor
          progress_monitor.log_progress(t0, current_time, i, csv.is_a?(Array) ? i : csv.pos, csv_size)
        end
      end
      # Final report, unless the last row already triggered one
      if (i % 5000) != 0 && progress_monitor
        progress_monitor.log_progress(t0, current_time, i, csv_size, csv_size)
      end
    end

    # Decrypt public key encrypted data to a CSV file
    # encrypted_data can be an open IO object (a file), or an array of data rows
    # out_data can be an open IO object or a StringIO -- CSV data is output
    def decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname)
      rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname),
                                    File.read(private_key_fname))
      encrypted_data.each do |crypto_data|
        out_data << rsa_aes_cbc.decrypt(crypto_data)
      end
    end

    private

    # True if salt is a hex string of at least 64 characters
    def valid_salt?(salt)
      salt.is_a?(String) && salt =~ /\A[0-9a-f]*\Z/ && salt.size >= 64
    end

    # True if value is nil, empty or whitespace only (no ActiveSupport needed)
    def blank_value?(value)
      value.to_s !~ /[^[:space:]]/
    end

    # Time.current when ActiveSupport is loaded, otherwise Time.now
    def current_time
      Time.respond_to?(:current) ? Time.current : Time.now
    end

    # Unpack column checking meta-data: [col, index, maxlength, format regexp]
    # for every column that declares a :maxlength or a :format
    def column_checks
      checks = []
      @format_spec[:columns].each_with_index do |col, i|
        next unless col[:maxlength] || col[:format]

        checks << [col, i, col[:maxlength], col[:format] && Regexp.new(col[:format])]
      end
      checks
    end

    # Returns something responding to #each that yields rows of csv_data
    def csv_reader_for(csv_data)
      if csv_data.is_a?(IO) || csv_data.is_a?(StringIO)
        CSV.new(csv_data)
      elsif csv_data.is_a?(Array)
        csv_data
      else
        raise(ArgumentError, 'Expected an IO or Array of rows, not a filename for csv_data')
      end
    end

    # Returns an RSA_AES_CBC encryptor for the public key file, or nil when
    # no public key file name was given
    def public_key_cipher(public_key_fname)
      return nil unless public_key_fname
      unless File.exist?(public_key_fname)
        raise(ArgumentError, "Missing public key file: #{public_key_fname}")
      end

      RSA_AES_CBC.new(File.read(public_key_fname), nil)
    end

    # Yields a CSV writer on a scratch buffer, then appends the buffer to
    # out_data: as-is, or encrypted plus a newline when rsa_aes_cbc is set
    def write_csv_chunk(out_data, rsa_aes_cbc)
      out_buff = StringIO.new
      yield CSV.new(out_buff)
      out_buff.rewind
      chunk = out_buff.read
      out_data << (rsa_aes_cbc ? rsa_aes_cbc.encrypt(chunk) + "\n" : chunk)
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require 'optparse'
2
+ require 'logger'
3
+ require_relative 'pseudonymised_file_wrapper'
4
+
5
+ # This is primarily a CLI to pseudonymised_file_wrapper.rb, with a few additional bells
6
+ # and whistles. For details about the output format of individual files, see the comments
7
+ # in the wrapper.
8
+ #
9
+ # run: bundle exec ruby pseudonymised_file_converter.rb <filename>
10
# Command line entry point: parse options, then either compare the field sets
# of several .pseudo files or convert / report on each file in turn.
logger = Logger.new(STDOUT)
options = { mode: :pretty_write,
            direction: :horizontal,
            include_name: true,
            comparison_mode: false }
OptionParser.new do |opts|
  opts.banner = 'Usage: pseudonymised_file_converter <filenames> [options]'
  opts.on('-f',
          '--fields',
          'Report available fields') { options[:mode] = :report_fields }
  opts.on('-v',
          '--vertical',
          'Report available fields vertically') { options[:direction] = :vertical }
  opts.on('-n',
          '--no-name',
          'Exclude filename in horizontal printing') { options[:include_name] = false }
  # Handy for inspecting numerous files from one provider with different field sets.
  # This option figures out which fields are common to all the provided files, then
  # groups files by the sets of fields which distinguish them
  opts.on('-c',
          '--compare-fields',
          'Figure out available fields') { options[:comparison_mode] = true }
  opts.on('-b', '--batch x y z', Array, 'Not yet implemented!') do |list|
    options[:files] = list
  end
end.parse!

if options[:comparison_mode]
  # File names come from the command line and, one per line, from STDIN
  files = ARGV + STDIN.readlines.map(&:strip).reject(&:empty?)
  raise 'No filename provided' if files.empty?

  results = {}
  files.each do |file|
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    results[file] = fw.available_fields
  end

  common_fields = results.values.inject(:&)
  logger.debug 'Common fields: '
  common_fields.each do |field|
    logger.debug "\t#{field}"
  end

  # group_by (rather than chunk) so that files sharing a set of extra fields
  # are reported together even when they were not listed next to each other
  results.group_by { |_file, fields| fields - common_fields }.each do |extra_fields, group|
    logger.debug '********* Field Chunk *********'
    extra_fields.each do |field|
      logger.debug "\t#{field}"
    end

    logger.debug ''
    group.each do |file, _fields|
      logger.debug "\t#{file}"
    end
    logger.debug ''
  end
else
  # ARGV is always an Array (truthy), so emptiness has to be checked explicitly
  raise 'No filename provided' if ARGV.empty?

  ARGV.each do |file|
    logger.debug file
    logger.debug file.class
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    case options[:mode]
    when :pretty_write
      fw.pretty_write
    when :report_fields
      case options[:direction]
      when :horizontal
        logger.debug "#{file if options[:include_name]}: #{fw.available_fields.sort}"
      when :vertical
        logger.debug "#{file}: "
        fw.available_fields.sort.each do |field|
          logger.debug "\t#{field}"
        end
      end
    end
  end
end
91
+
92
+ # *************** Read in the file, parsing and recording fields in each line **************
@@ -0,0 +1,96 @@
1
+ require 'json'
2
+ require 'csv'
3
+ # require 'pry'
4
+ require 'logger'
5
+
6
+ # To convert files from the command line, see pseudonymised_file_converter.rb, which has a
7
+ # CLI set up. To use this wrapper to convert files from within a ruby program:
8
+ #
9
+ # wrapper = PseudonymisedFileWrapper.new(<filename>)
10
+ # wrapper.process
11
+ # wrapper.pretty_write
12
+ #
13
+ # This will create an excel-readable copy of the file in the same location as the original.
14
+ # The new file will be named the same as the original, with .pseudo converted to _pretty.csv
15
+ # There is a column for every field present in any record, and the column name is prefixed
16
+ # by 'mapped' or 'raw' according to which column it was in in the .pseudo version.
17
+ # As this is only intended for human viewing, the values of encrypted fields are not output.
18
+ # This conveniently has the effect of making the csv files notably smaller than their
19
+ # .pseudo counterparts
20
+ #
21
+
22
# Provide the ability to extract fieldnames and create CSV output from .pseudo files
#
# Rows of the source file with exactly 7 columns are treated as records:
# columns 0 and 1 are the two pseudo ids, column 2 the key bundle, and
# columns 4 and 6 JSON objects ("mapped" and "raw" fields respectively).
# Single-column rows are treated as headers and skipped.
class PseudonymisedFileWrapper
  # Source file names that an output name can be derived from:
  # <base>.csv, <base>.zip.pseudo, <base>.xls.pseudo or <base>.xlsx.pseudo
  SOURCE_NAME_PATTERN = /(?<base_name>.*)\.(?:csv|(?:zip|xlsx?)\.pseudo)/i
  # Trailing columns of the pretty output, after the mapped and raw fields
  ID_HEADERS = %w(pseudo_id1 pseudo_id2 key_bundle).freeze

  # filename: path of the .pseudo (CSV) file to read
  def initialize(filename)
    @filename = filename
    @logger = Logger.new(STDOUT)
    # Start empty so the readers below are safe to call before #process
    @lines = 0
    @all_fields1 = []
    @all_fields2 = []
    @processed_lines = []
  end

  # Sorted, de-duplicated names of every mapped or raw field seen by #process
  def available_fields
    (@all_fields1 + @all_fields2).sort.uniq
  end

  # Read in the source file, accumulating all the field names used in any row.
  # Returns the array of parsed records.
  def process
    row_count = 0
    processed_lines = []
    all_fields1 = []
    all_fields2 = []
    CSV.foreach(@filename) do |row|
      row_count += 1
      if row.size == 1
        # Header; do nothing
      elsif row.size == 7
        cur = { map1: JSON.parse(row[4]),
                map2: JSON.parse(row[6]),
                id1: row[0],
                id2: row[1],
                keys: row[2] }
        processed_lines.push(cur)
        # Array union keeps first-seen order without re-scanning on every row
        all_fields1 |= cur[:map1].keys
        all_fields2 |= cur[:map2].keys
      else
        @logger.debug "Line #{row_count} contained unexpected number of fields: #{row.size}"
      end
    end
    @lines = row_count
    @all_fields1 = all_fields1
    @all_fields2 = all_fields2
    @processed_lines = processed_lines
  end

  # Create an excel-readable CSV file, in the same location as the original.
  # Raises ArgumentError if the source file name does not have one of the
  # recognised extensions, as no output name can be derived from it.
  def pretty_write
    match = SOURCE_NAME_PATTERN.match(@filename)
    unless match
      raise(ArgumentError, "Cannot derive an output filename from #{@filename.inspect}")
    end

    target_filename = "#{match[:base_name]}_pretty.csv"
    @logger.debug "Writing output to #{target_filename}"
    CSV.open(target_filename, 'w') do |file|
      file << pretty_headers
      @processed_lines.each { |line| file << pretty_row(line) }
    end
  end

  # Same content as #pretty_write, returned as a CSV string instead of
  # being written to a file
  def pretty_data
    CSV.generate do |csv|
      csv << pretty_headers
      @processed_lines.each { |line| csv << pretty_row(line) }
    end
  end

  private

  # Header row: 'mapped:' fields, then 'raw:' fields, then the id columns
  def pretty_headers
    @all_fields1.map { |name| "mapped:#{name}" } +
      @all_fields2.map { |name| "raw:#{name}" } +
      ID_HEADERS
  end

  # Output row for one parsed record, in the column order of pretty_headers;
  # fields absent from the record are left empty
  def pretty_row(line)
    @all_fields1.map { |field| line[:map1][field] } +
      @all_fields2.map { |field| line[:map2][field] } +
      [line[:id1], line[:id2], line[:keys]]
  end
end
@@ -0,0 +1,125 @@
1
+ require 'digest/sha1'
2
+ require 'securerandom'
3
+ require 'base64'
4
+
5
module NdrPseudonymise
  # Simple pseudonymisation library, for efficient pseudonymisation of
  # identifiable data, suitable for fuzzy matching
  #
  # Sample usage:
  # Set up clinical data and demographics
  #   clinical_data = ... load pdf file ...
  #   all_demographics = {'nhsnumber' => '1234567881', 'postcode' => 'CB22 3AD',
  #     'birthdate' => '1975-10-22', 'surname' => 'SMITH', 'forenames' => 'JOHN ROBERT'}
  #
  #   # Generate pseudonymised identifiers and encryption keys
  #   (pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key) =
  #     NdrPseudonymise::SimplePseudonymisation.generate_keys(salt_id, salt_demog, salt_clinical,
  #       all_demographics['nhsnumber'], all_demographics['postcode'], all_demographics['birthdate'])
  #
  #   # Emit first 4 values as index demographics
  #   emit_index_demographics(pseudo_id1, pseudo_id2, key_bundle, rowid)
  #
  #   # Encrypt all demographics with demog_key
  #   emit_encrypted_demographics(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, all_demographics.to_json))
  #
  #   # Encrypt all clinical data with clinical_key
  #   emit_encrypted_clinical_data(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data(clinical_key, clinical_data))
  #
  class SimplePseudonymisation
    # Generate pseudonymised identifiers and pseudonymisation keys
    # Returns an array of 6 strings:
    # [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
    # key_bundle holds the demographics and clinical keys, each encrypted
    # under every available real id, separated by spaces.
    # Raises RuntimeError if nhsnumber, current_postcode or birthdate is not a
    # String of the expected format (a trailing newline is not accepted, as it
    # would be hashed into a pseudo id that can never match).
    def self.generate_keys(salt_id, salt_demog, salt_clinical, nhsnumber, current_postcode, birthdate)
      check_nhsnumber!(nhsnumber)
      unless current_postcode.is_a?(String) && current_postcode =~ /\A[A-Z0-9 ]*\z/
        raise 'Invalid postcode'
      end
      unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\z/
        raise 'Invalid birthdate'
      end
      real_id1 = 'nhsnumber_' + nhsnumber
      # Delete spaces from postcode
      real_id2 = 'birthdate_postcode_' + birthdate + '_' + current_postcode.delete(' ')

      pseudo_id1 = data_hash(real_id1, salt_id)
      pseudo_id2 = data_hash(real_id2, salt_id)
      demog_key = random_key
      clinical_key = random_key
      keys = []
      unless nhsnumber.empty?
        keys += [encrypt_data64(real_id1 + salt_demog, demog_key),
                 encrypt_data64(real_id1 + salt_clinical, clinical_key)]
      end
      unless current_postcode.empty? || birthdate.empty?
        keys += [encrypt_data64(real_id2 + salt_demog, demog_key),
                 encrypt_data64(real_id2 + salt_clinical, clinical_key)]
      end
      # TODO: Consider whether it's worth storing something, if keys would otherwise be empty.
      key_bundle = keys.join(' ')
      rowid = random_key
      [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
    end

    # Generate pseudonymised identifiers and pseudonymisation keys
    # for data with only an NHS number (missing patient postcode or DOB), where
    # only the demographics need to be pseudonymised (e.g. prescription data).
    # Returns an array of 3 strings:
    # [pseudo_id1, key_bundle, demog_key]
    # key_bundle is empty when nhsnumber is empty.
    # Raises RuntimeError if nhsnumber is not a valid String.
    def self.generate_keys_nhsnumber_demog_only(salt_id, salt_demog, nhsnumber)
      check_nhsnumber!(nhsnumber)
      real_id1 = 'nhsnumber_' + nhsnumber

      pseudo_id1 = data_hash(real_id1, salt_id)
      demog_key = random_key
      key_bundle = nhsnumber.empty? ? '' : encrypt_data64(real_id1 + salt_demog, demog_key)
      [pseudo_id1, key_bundle, demog_key]
    end

    # Hex digest (Digest::SHA2) of value followed by salt
    def self.data_hash(value, salt)
      Digest::SHA2.hexdigest(value.to_s + salt.to_s)
    end

    # Random key as a 64 character hex string
    def self.random_key
      SecureRandom.hex(32) # 32 bytes = 256 bits
    end

    # returns a base-64 encoded string
    def self.encrypt_data64(key, data)
      Base64.strict_encode64(encrypt_data(key, data))
    end

    # Encrypts data with AES-256-CBC, keyed on SHA-256 of key.
    # returns a binary string
    # Raises ArgumentError if key fails the hex-run check.
    def self.encrypt_data(key, data)
      check_key!(key)
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.encrypt
      aes.key = Digest::SHA256.digest(key)
      aes.update(data) + aes.final
    end

    # Inverse of encrypt_data64: takes a strict base-64 encoded string
    def self.decrypt_data64(key, data)
      decrypt_data(key, Base64.strict_decode64(data))
    end

    # Inverse of encrypt_data: takes and returns binary strings.
    # Raises ArgumentError if key fails the hex-run check.
    # NOTE(review): the key is chomped here but not in encrypt_data, so data
    # encrypted under a key with a trailing newline cannot be decrypted by
    # passing that same key -- confirm which behaviour is intended.
    def self.decrypt_data(key, data)
      check_key!(key)
      aes = OpenSSL::Cipher.new('AES-256-CBC')
      aes.decrypt
      aes.key = Digest::SHA256.digest(key.chomp)
      (aes.update(data) + aes.final)
    end

    # Raises RuntimeError unless nhsnumber is a String that is either empty
    # or exactly 10 digits
    def self.check_nhsnumber!(nhsnumber)
      return if nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/

      raise 'Invalid NHS number'
    end
    private_class_method :check_nhsnumber!

    # Raises ArgumentError unless key contains a run of 32 hex characters.
    # NOTE(review): 32 hex characters is 128 bits, although the message says
    # 256 bits; tightening to 64 could reject keys accepted today -- confirm.
    def self.check_key!(key)
      return if key =~ /[0-9a-f]{32}/

      raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
    end
    private_class_method :check_key!
  end
end
@@ -0,0 +1,3 @@
1
# Version of the ndr_pseudonymise gem
module NdrPseudonymise
  VERSION = '0.4.1'.freeze
end