ndr_pseudonymise 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
1
+ require 'securerandom'
2
+ require 'json'
3
+ require 'csv'
4
+ require 'stringio'
5
+ require 'rsa_aes_cbc'
6
+
7
+ module NdrPseudonymise
8
+ # Pseudonymise CSV data for matching purposes
9
+ # Sample format spec:
10
+ # {:core_demographics => [[[0, ' ']],
11
+ # [[1, ' ', :upcase], [2, ' ', :upcase]]],
12
+ # :columns => [
13
+ # {:title => 'nhsnumber', :maxlength => 12, :format => '\A[0-9A-Z]*\Z',
14
+ # :format_msg => 'Must contain only numbers, or numbers and letters for old NHS numbers'},
15
+ # {:title => 'dob', :format => '\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z',
16
+ # :format_msg => 'Must have format YYYY-MM-DD, e.g. 2013-08-20',
17
+ # :canonical_title => 'birthdate'},
18
+ # {:title => 'postcode'},
19
+ # {:title => 'surname'},
20
+ # {:title => 'data1'},
21
+ # {:title => 'data2'},
22
+ # ],
23
+ # :demographics => [0, 1, 2, 3],
24
+ # }
25
+   # -- delete spaces (upcasing columns 1 and 2), use column 0 and columns 1+2 as the two keys for core demographics
26
+   # -- treat columns 0, 1, 2, 3 as demographics; a real spec must also set :encrypt_clinical (true or false)
27
+ class PseudonymisationSpecification
28
+ KEY_BYTES = 32 # length of randomly generated keys (32 bytes = 256 bits)
29
+ PREAMBLE_V1_STRIPED = 'Pseudonymised matching data v1.0-striped'.freeze
30
+ HEADER_ROW_PREFIX = 'HEADER_ROW'.freeze
31
+
32
+ def initialize(format_spec, key_bundle)
33
+ @format_spec = format_spec
34
+ [:core_demographics, :columns, :demographics, :encrypt_clinical].each do |k|
35
+ unless @format_spec.key?(k)
36
+ raise(ArgumentError, "Expected format_spec to have a #{k.inspect} section")
37
+ end
38
+ end
39
+ @format_spec[:columns].each_with_index do |col, i|
40
+ raise(ArgumentError, "Expected format_spec to have a title for column #{i}") unless col.key?(:title)
41
+ end
42
+ unless [true, false].include?(@format_spec[:encrypt_clinical])
43
+ raise(ArgumentError, 'Expected encrypt_clinical to be true or false')
44
+ end
45
+ @salt1 = key_bundle[:salt1]
46
+ @salt2 = key_bundle[:salt2]
47
+ raise(ArgumentError, 'Invalid salt1') unless @salt1 =~ /\A[0-9a-f]*\Z/ && @salt1.size >= 64
48
+ raise(ArgumentError, 'Invalid salt2') unless @salt2 =~ /\A[0-9a-f]*\Z/ && @salt2.size >= 64
49
+ end
50
+
51
+ # Builds a pseudonymiser with the preferred pseudonymisation class of the given format spec
52
+ def self.factory(format_spec, key_bundle)
53
+ klass_name = format_spec[:pseudonymisation_class]
54
+ if klass_name
55
+ # Support existing format specifications.
56
+ # (Pseudonymisation classes have now moved to NdrPseudonymise namespace.)
57
+ klass_name = klass_name.sub!(/^Pseudonymisation::/, 'NdrPseudonymise::')
58
+ klass = Object.const_get(klass_name)
59
+ unless klass <= NdrPseudonymise::PseudonymisationSpecification
60
+ raise(ArgumentError, "Invalid pseudonymisation_class #{klass_name}")
61
+ end
62
+ else
63
+ klass = NdrPseudonymise::PseudonymisationSpecification
64
+ end
65
+ klass.new(format_spec, key_bundle)
66
+ end
67
+
68
+ def random_key
69
+ SecureRandom.hex(KEY_BYTES)
70
+ end
71
+
72
+ # Returns arrays of core demographics field values, each of the form
73
+ # e.g. [[['nhsnumber', '1234567881']],
74
+ # [['birthdate', '2010-08-21'], ['postcode', 'CB22 3AD']]]
75
+ # Column titles can be remapped using a :canonical_title entry, to ensure
76
+ # consistent pseudo_ids even when column titles are predefined.
77
+ def core_demographics(row)
78
+ @format_spec[:core_demographics].collect do |fields|
79
+ fields.collect do |col_num, delchar, modifier|
80
+ val = row[col_num].to_s
81
+ val = val.to_s.delete(delchar) if delchar
82
+ case modifier
83
+ when :upcase
84
+ val = val.upcase
85
+ when nil
86
+ else
87
+ raise "Unknown modifier #{modifier.inspect} for core_demographics"
88
+ end
89
+ row_spec = @format_spec[:columns][col_num]
90
+ [row_spec[:canonical_title] || row_spec[:title], val]
91
+ end
92
+ end
93
+ end
94
+
95
+ # List of pseudonymised ids, based on this row's core demographics + salt1
96
+ def real_ids(row)
97
+ core_demographics(row).collect do |fields|
98
+ (fields.collect(&:first) +
99
+ fields.collect(&:last)).collect { |s| s.gsub('_', '__') }.join('_')
100
+ end
101
+ end
102
+
103
+ # Convert a real id to a pseudonymised id
104
+ def pseudo_id(real_id)
105
+ data_hash(real_id, @salt1)
106
+ end
107
+
108
+ def data_hash(value, salt)
109
+ Digest::SHA2.hexdigest(value.to_s + salt.to_s)
110
+ end
111
+
112
+ def encrypt_data(data, pseudo_id, partial_crypt_key, salt)
113
+ if [pseudo_id, partial_crypt_key, salt].any? { |s| s.to_s.blank? }
114
+ raise(ArgumentError, 'Expected all key arguments to be non-blank')
115
+ end
116
+ key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
117
+ # unless key =~ /\A[0-9a-f]+\Z/
118
+ # raise(ArgumentError, 'Expected key to be all hex characters (0-9, a-f)')
119
+ # end
120
+ aes = OpenSSL::Cipher.new('AES-256-CBC')
121
+ aes.encrypt
122
+ aes.key = Digest::SHA256.digest(key)
123
+ Base64.strict_encode64(aes.update(data) + aes.final)
124
+ end
125
+
126
+ def decrypt_data(data, pseudo_id, partial_crypt_key, salt)
127
+ key = "#{pseudo_id}#{partial_crypt_key}#{salt}"
128
+ aes = OpenSSL::Cipher.new('AES-256-CBC')
129
+ aes.decrypt
130
+ aes.key = Digest::SHA256.digest(key)
131
+ aes.update(Base64.strict_decode64(data)) + aes.final
132
+ end
133
+
134
+ def self.get_key_bundle(key_fname, admin_password)
135
+ data = File.read(key_fname)
136
+ aes = OpenSSL::Cipher.new('AES-256-CBC')
137
+ aes.decrypt
138
+ aes.key = Digest::SHA256.digest(admin_password)
139
+ begin
140
+ bundle = YAML.load(aes.update(Base64.decode64(data)) + aes.final)
141
+ # Check that the bundle decoded successfully
142
+ raise('Invalid bundle - not a hash') unless bundle.is_a?(Hash)
143
+ bundle
144
+ rescue # => e # Lint/UselessAssignment
145
+ raise('Wrong password or invalid bundle')
146
+ end
147
+ end
148
+
149
+ def all_demographics(row)
150
+ # TODO: What about rows with missing fields?
151
+ result = []
152
+ demographics_cols = @format_spec[:demographics]
153
+ row.each_with_index do |x, i|
154
+ result << x if demographics_cols.include?(i)
155
+ end
156
+ result
157
+ end
158
+
159
+ def clinical_data(row)
160
+ # TODO: What about rows with missing fields?
161
+ result = []
162
+ demographics_cols = @format_spec[:demographics]
163
+ row.each_with_index do |x, i|
164
+ result << x unless demographics_cols.include?(i)
165
+ end
166
+ result
167
+ end
168
+
169
+ # Pseudonymise a row of data, returning 3 sets of rows:
170
+ # [index_rows, demographics_rows, clinical_rows]
171
+ def pseudonymise_row(row)
172
+ index_rows = []
173
+ demographics_rows = []
174
+ clinical_rows = []
175
+ real_ids(row).each do |real_id|
176
+ pseudo = pseudo_id(real_id)
177
+ row_key = random_key
178
+ partial_crypt_key1 = random_key # middle bit of crypto key
179
+ if @format_spec[:encrypt_clinical]
180
+ partial_crypt_key2 = random_key # middle bit of crypto key
181
+ index_rows << [pseudo, row_key, partial_crypt_key1, partial_crypt_key2]
182
+ else
183
+ index_rows << [pseudo, row_key, partial_crypt_key1]
184
+ end
185
+ # demographics and clinical files only have non-information-bearing keys
186
+ demographics_rows << [row_key,
187
+ encrypt_data(safe_json(all_demographics(row)),
188
+ pseudo, partial_crypt_key1, @salt2)]
189
+ safe_clinical = safe_json(clinical_data(row))
190
+ if @format_spec[:encrypt_clinical]
191
+ safe_clinical = encrypt_data(safe_clinical,
192
+ pseudo, partial_crypt_key2, @salt2)
193
+ end
194
+ clinical_rows << [row_key, safe_clinical]
195
+ end
196
+ [index_rows, demographics_rows, clinical_rows]
197
+ end
198
+
199
+ # Convert data to json, but raise exception if it won't safely deserialise
200
+ def safe_json(data)
201
+ result = data.to_json
202
+ unless data == JSON.load(result)
203
+ raise(ArgumentError, "Expected consistent JSON serialisation of #{data.inspect}")
204
+ end
205
+ result
206
+ end
207
+
208
+ # Return true if this row is a valid header row, according to the spec
209
+ def header_row?(row)
210
+ expected_keys = @format_spec[:columns].collect { |col| col[:title] }
211
+ row_keys = row.collect(&:downcase)
212
+ if (row_keys & expected_keys).size >= 3 # at least 3 common keys
213
+ if row_keys == expected_keys
214
+ true # Only expected keys, in right order
215
+ else
216
+ raise(ArgumentError, "Error: invalid header row; expected keys #{expected_keys.inspect}, actually #{row_keys.inspect}")
217
+ end
218
+ else
219
+ false
220
+ end
221
+ end
222
+
223
+ # Return false if this row is a valid data row, otherwise a list of errors
224
+ def row_errors(row)
225
+ @check_cols ||= begin
226
+ check_cols = []
227
+ @format_spec[:columns].each_with_index do |col, i|
228
+ # Unpack column checking meta-data proactively
229
+ if col[:maxlength] || col[:format]
230
+ check_cols << [col, i, col[:maxlength],
231
+ col[:format] && Regexp.new(col[:format])]
232
+ end
233
+ end
234
+ check_cols
235
+ end
236
+ @dmax ||= @format_spec[:core_demographics].flatten(1).collect(&:first).max
237
+ if row.size <= @dmax + 1
238
+ "Missing core demographics: at least #{@dmax} columns expected"
239
+ elsif row[@format_spec[:columns].size..-1].to_a.any? { |s| !s.blank? }
240
+ "Too many columns (#{row.size}); expected #{@format_spec[:columns].size}"
241
+ else
242
+ # Check field formats
243
+ errs = []
244
+ @check_cols.each do |col, i, col_maxlength, col_format_re|
245
+ val = row[i].to_s # Missing columns treated as blank
246
+ if col_maxlength && val.size > col_maxlength
247
+ errs << "Field #{col[:title]} (column #{i + 1}) is longer than maxlength #{col[:maxlength]}."
248
+ end
249
+ if col_format_re
250
+ unless col_format_re.match(val)
251
+ if col[:format_msg]
252
+ errs << "Field #{col[:title]} (column #{i + 1}) #{col[:format_msg]} -- invalid value: #{val}"
253
+ else
254
+ errs << "Field #{col[:title]} (column #{i + 1}) does not match format #{col[:format].inspect} -- invalid value: #{val}"
255
+ end
256
+ end
257
+ end
258
+ end
259
+ if errs.empty?
260
+ false
261
+ else
262
+ errs.join(', ')
263
+ end
264
+ end
265
+ end
266
+
267
+ # Header row for CSV data
268
+ def csv_header_row
269
+ [PREAMBLE_V1_STRIPED]
270
+ end
271
+
272
+ # Append the output of pseudonymise_row to a CSV file
273
+ def emit_csv_rows(out_csv, pseudonymised_row)
274
+ (index_rows, demographics_rows, clinical_rows) = pseudonymised_row
275
+ unless index_rows.size == demographics_rows.size &&
276
+ index_rows.size == clinical_rows.size
277
+ raise(ArgumentError, <<-ERROR
278
+ Mismatch in number of index_rows (#{index_rows.size})
279
+ vs demographics_rows (#{demographics_rows.size})
280
+ vs clinical_rows (#{clinical_rows.size})
281
+ ERROR
282
+ )
283
+ end
284
+
285
+ index_rows.zip(demographics_rows).zip(clinical_rows).collect do |(index_row, demographics_row), clinical_row|
286
+ # Alternate each of 3 data types into 1 output file
287
+ out_csv << index_row
288
+ out_csv << demographics_row
289
+ out_csv << clinical_row
290
+ end
291
+ end
292
+
293
+ # csv_data can be an open IO object (a CSV file), or an array of data rows
294
+ # out_data can be an open IO object or a StringIO -- CSV data is output
295
+ # public_key_fname supports public key encryption of the output
296
+ # progress_monitor is an object for reporting progress, that responds to
297
+ # log_progress(start_time, time_now, csv_row, progress, total)
298
+ # where progress and total are in the same units, either bytes or rows
299
+ def pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil)
300
+ csv_lib = CSV
301
+ if csv_data.is_a?(IO) || csv_data.is_a?(StringIO)
302
+ csv = csv_lib.new(csv_data)
303
+ elsif csv_data.is_a?(Array)
304
+ csv = csv_data
305
+ else
306
+ raise(ArgumentError, 'Expected an IO or Array of rows, not a filename for csv_data')
307
+ end
308
+
309
+ if public_key_fname
310
+ unless File.exist?(public_key_fname)
311
+ raise(ArgumentError, "Missing public key file: #{public_key_fname}")
312
+ end
313
+ rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname), nil)
314
+ end
315
+
316
+ unless out_data.respond_to?('<<')
317
+ raise(ArgumentError, 'Expected an IO or writeable structure for out_data')
318
+ end
319
+ out_buff = StringIO.new
320
+ out_csv = csv_lib.new(out_buff)
321
+ out_csv << csv_header_row
322
+ out_buff.rewind
323
+ out_data <<
324
+ if public_key_fname
325
+ rsa_aes_cbc.encrypt(out_buff.read) + "\n"
326
+ else
327
+ out_buff.read
328
+ end
329
+
330
+ i = 0
331
+ t0 = Time.current
332
+ csv_size = progress_monitor && csv_data.size
333
+ csv.each do |row|
334
+ out_buff = StringIO.new
335
+ out_csv = csv_lib.new(out_buff)
336
+ i += 1
337
+ if i == 1 && header_row?(row)
338
+ # Preserve header row in output
339
+ out_csv << [HEADER_ROW_PREFIX] + row
340
+ else
341
+ errs = row_errors(row)
342
+ raise("Invalid row #{i}: #{errs}") if errs
343
+ begin
344
+ emit_csv_rows(out_csv, pseudonymise_row(row))
345
+ rescue ArgumentError, RuntimeError => e
346
+ raise(ArgumentError, "Invalid row #{i}: #{e}", e.backtrace)
347
+ end
348
+ end
349
+ out_buff.rewind
350
+ out_data <<
351
+ if public_key_fname
352
+ rsa_aes_cbc.encrypt(out_buff.read) + "\n"
353
+ else
354
+ out_buff.read
355
+ end
356
+
357
+ # Current runs at about 325 rows per second for prescription data 2016-05-09 ruby 2.3.1
358
+ # so try to log progress about every 15 seconds
359
+ if (i % 5000) == 0 && progress_monitor
360
+ progress_monitor.log_progress(t0, Time.current, i, csv.is_a?(Array) ? i : csv.pos, csv_size)
361
+ end
362
+ end
363
+ if (i % 5000) != 0 && progress_monitor
364
+ progress_monitor.log_progress(t0, Time.current, i, csv_size, csv_size)
365
+ end
366
+ end
367
+
368
+ # Decrypt public key encrypted data to a CSV file
369
+ # encrypted_data can be an open IO object (a file), or an array of data rows
370
+ # out_data can be an open IO object or a StringIO -- CSV data is output
371
+ def decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname)
372
+ rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname),
373
+ File.read(private_key_fname))
374
+ encrypted_data.each do |crypto_data|
375
+ out_data << rsa_aes_cbc.decrypt(crypto_data)
376
+ end
377
+ end
378
+ end
379
+ end
@@ -0,0 +1,92 @@
1
+ require 'optparse'
2
+ require 'logger'
3
+ require_relative 'pseudonymised_file_wrapper'
4
+
5
+ # This is primarily a CLI to pseudonymised_file_wrapper.rb, with a few additional bells
6
+ # and whistles. For details about the output format of individual files, see the comments
7
+ # in the wrapper.
8
+ #
9
+ # run: bundle exec ruby pseudonymised_file_converter.rb <filename>
10
logger = Logger.new(STDOUT)
options = { mode: :pretty_write,
            direction: :horizontal,
            include_name: true,
            comparison_mode: false }
OptionParser.new do |opts|
  opts.banner = 'Usage: pseudonymised_file_converter <filenames> [options]'
  opts.on('-f',
          '--fields',
          'Report available fields') { options[:mode] = :report_fields }
  opts.on('-v',
          '--vertical',
          'Report available fields vertically') { options[:direction] = :vertical }
  opts.on('-n',
          '--no-name',
          'Exclude filename in horizontal printing') { options[:include_name] = false }
  # Handy for inspecting numerous files from one provider with different field sets.
  # This option figures out which fields are common to all the provided files, then
  # groups files by the sets of fields which distinguish them
  opts.on('-c',
          '--compare-fields',
          'Figure out available fields') { options[:comparison_mode] = true }
  opts.on('-b', '--batch x y z', Array, 'Not yet implemented!') do |list|
    options[:files] = list
  end
end.parse!

if options[:comparison_mode]
  # Filenames come from the command line and, one per line, from STDIN.
  # Blank STDIN lines are ignored rather than treated as filenames.
  files = (ARGV + STDIN.readlines.map(&:strip)).reject(&:empty?)
  # ARGV is always an Array, so test for emptiness rather than truthiness
  raise 'No filename provided' if files.empty?

  results = {}
  files.each do |file|
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    results[file] = fw.available_fields
  end

  common_fields = results.values.inject(:&)
  logger.debug 'Common fields: '
  common_fields.each do |field|
    logger.debug "\t#{field}"
  end

  # group_by (not chunk) so that files with the same distinguishing fields
  # are reported together even when they were not listed next to each other
  results.map { |file, fields| [file, fields - common_fields] }
         .group_by { |_file, fields| fields }
         .each do |distinct_fields, group|
    logger.debug '********* Field Chunk *********'
    distinct_fields.each do |field|
      logger.debug "\t#{field}"
    end

    logger.debug ''
    group.each do |file, _fields|
      logger.debug "\t#{file}"
    end
    logger.debug ''
  end
else
  raise 'No filename provided' if ARGV.empty?

  ARGV.each do |file|
    logger.debug file
    logger.debug file.class
    fw = PseudonymisedFileWrapper.new(file)
    fw.process
    case options[:mode]
    when :pretty_write
      fw.pretty_write
    when :report_fields
      case options[:direction]
      when :horizontal
        logger.debug "#{file if options[:include_name]}: #{fw.available_fields.sort}"
      when :vertical
        logger.debug "#{file}: "
        fw.available_fields.sort.each do |field|
          logger.debug "\t#{field}"
        end
      end
    end
  end
end
91
+
92
+ # *************** Read in the file, parsing and recording fields in each line **************
@@ -0,0 +1,96 @@
1
+ require 'json'
2
+ require 'csv'
3
+ # require 'pry'
4
+ require 'logger'
5
+
6
+ # To convert files from the command line, see pseudonymised_file_converter.rb, which has a
7
+ # CLI set up. To use this wrapper to convert files from within a ruby program:
8
+ #
9
+ # wrapper = PseudonymisedFileWrapper.new(<filename>)
10
+ # wrapper.process
11
+ # wrapper.pretty_write
12
+ #
13
+ # This will create an excel-readable copy of the file in the same location as the original.
14
+ # The new file will be named the same as the original, with .pseudo converted to _pretty.csv
15
+ # There is a column for every field present in any record, and the column name is prefixed
16
+ # by 'mapped' or 'raw' according to which column it was in in the .pseudo version.
17
+ # As this is only intended for human viewing, the values of encrypted fields are not output.
18
+ # This conveniently has the effect of making the csv files notably smaller than their
19
+ # .pseudo counterparts
20
+ #
21
+
22
# Provide the ability to extract fieldnames and create CSV output from .pseudo files
class PseudonymisedFileWrapper
  # Columns that follow the mapped / raw field columns in the pretty output
  ID_HEADERS = %w(pseudo_id1 pseudo_id2 key_bundle).freeze
  # Recognised source filenames; base_name is everything before the extension
  SOURCE_FILENAME = /(?<base_name>.*)\.(?:csv|(?:zip|xlsx?)\.pseudo)/i

  # filename - path of the pseudonymised file to read.
  # Nothing is read until #process is called; until then the wrapper behaves
  # as if the file had no records.
  def initialize(filename)
    @filename = filename
    @logger = Logger.new(STDOUT)
    @lines = 0
    @all_fields1 = []
    @all_fields2 = []
    @processed_lines = []
  end

  # Sorted, de-duplicated names of every field seen in either JSON column.
  def available_fields
    (@all_fields1 + @all_fields2).sort.uniq
  end

  # Read in the source file, accumulating all the field names used in any row.
  #
  # Rows with a single cell are treated as headers and skipped. Rows with 7
  # cells are records: cells 0 and 1 are the pseudo ids, cell 2 the key
  # bundle, and cells 4 and 6 are JSON objects of mapped and raw fields.
  # Rows of any other size are logged and skipped.
  # Returns the list of processed records.
  def process
    lines_read = 0
    processed_lines = []
    all_fields1 = []
    all_fields2 = []
    CSV.foreach(@filename) do |row|
      lines_read += 1
      if row.size == 1
        # Header; do nothing
      elsif row.size == 7
        cur = { map1: JSON.parse(row[4]),
                map2: JSON.parse(row[6]),
                id1: row[0],
                id2: row[1],
                keys: row[2] }
        processed_lines.push(cur)
        # Union keeps first-seen order while dropping duplicates
        all_fields1 |= cur[:map1].keys
        all_fields2 |= cur[:map2].keys
      else
        @logger.debug "Line #{lines_read} contained unexpected number of fields: #{row.size}"
      end
    end
    @lines = lines_read # number of rows read (was previously one too many)
    @all_fields1 = all_fields1
    @all_fields2 = all_fields2
    @processed_lines = processed_lines
  end

  # Create an excel-readable CSV file, in the same location as the original.
  # The target is the source's base name with "_pretty.csv" appended.
  def pretty_write
    target_filename = "#{base_name}_pretty.csv"
    @logger.debug "Writing output to #{target_filename}"
    CSV.open(target_filename, 'w') do |file|
      each_pretty_row { |fields| file << fields }
    end
  end

  # Returns the same content as pretty_write, as a CSV string.
  def pretty_data
    CSV.generate do |csv|
      each_pretty_row { |fields| csv << fields }
    end
  end

  private

  # Source filename without its recognised extension. Filenames that do not
  # match SOURCE_FILENAME (e.g. a plain "data.pseudo") fall back to dropping
  # the final extension; this used to raise NoMethodError on a nil match.
  # The match result is used directly, not $LAST_MATCH_INFO, so this does not
  # depend on the English library having been loaded.
  def base_name
    name = @filename.to_s
    match = SOURCE_FILENAME.match(name)
    return match[:base_name] if match

    extension = File.extname(name)
    extension.empty? ? name : name[0...-extension.size]
  end

  # Yields the header row, then one row per processed record: the mapped
  # field values, the raw field values, then both pseudo ids and the key bundle.
  def each_pretty_row
    yield(@all_fields1.map { |name| "mapped:#{name}" } +
          @all_fields2.map { |name| "raw:#{name}" } +
          ID_HEADERS)
    @processed_lines.each do |line|
      output_fields = @all_fields1.map { |field| line[:map1][field] } +
                      @all_fields2.map { |field| line[:map2][field] }
      output_fields.push(line[:id1], line[:id2], line[:keys])
      yield output_fields
    end
  end
end
@@ -0,0 +1,125 @@
1
+ require 'digest/sha1'
2
+ require 'securerandom'
3
+ require 'base64'
4
+
5
+ module NdrPseudonymise
6
+ # Simple pseudonymisation library, for efficient pseudonymisation of
7
+ # identifiable data, suitable for fuzzy matching
8
+ #
9
+ # Sample usage:
10
+ # Set up clinical data and demographics
11
+ # clinical_data = ... load pdf file ...
12
+ # all_demographics = {'nhsnumber' => '1234567881', 'postcode' => 'CB22 3AD',
13
+ # 'birthdate' => '1975-10-22', 'surname' => 'SMITH', 'forenames' => 'JOHN ROBERT'}
14
+ #
15
+ # # Generate pseudonymised identifiers and encryption keys
16
+ # (pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key) =
17
+ # NdrPseudonymise::SimplePseudonymisation.generate_keys(salt_id, salt_demog, salt_clinical,
18
+ # all_demographics['nhsnumber'], all_demographics['postcode'], all_demographics['birthdate'])
19
+ #
20
+ # # Emit first 4 values as index demographics
21
+ # emit_index_demographics(pseudo_id1, pseudo_id2, key_bundle, rowid)
22
+ #
23
+ # # Encrypt all demographics with demog_key
24
+ # emit_encrypted_demographics(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, all_demographics.to_json))
25
+ #
26
+ # # Encrypt all clinical data with clinical_key
27
+ # emit_encrypted_clinical_data(rowid, NdrPseudonymise::SimplePseudonymisation.encrypt_data(clinical_key, clinical_data))
28
+ #
29
+ class SimplePseudonymisation
30
+ # Generate pseudonymised identifiers and pseudonymisation keys
31
+ # Returns an array of 6 strings:
32
+ # [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
33
+ def self.generate_keys(salt_id, salt_demog, salt_clinical, nhsnumber, current_postcode, birthdate)
34
+ unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
35
+ raise 'Invalid NHS number'
36
+ end
37
+ unless current_postcode.is_a?(String) && current_postcode =~ /\A[A-Z0-9 ]*\Z/
38
+ raise 'Invalid postcode'
39
+ end
40
+ unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z/
41
+ raise 'Invalid birthdate'
42
+ end
43
+ real_id1 = 'nhsnumber_' + nhsnumber
44
+ # Delete spaces from postcode
45
+ real_id2 = 'birthdate_postcode_' + birthdate + '_' + current_postcode.split(' ').join('')
46
+
47
+ pseudo_id1 = data_hash(real_id1, salt_id)
48
+ pseudo_id2 = data_hash(real_id2, salt_id)
49
+ demog_key = random_key
50
+ clinical_key = random_key
51
+ keys = []
52
+ if nhsnumber.length > 0
53
+ keys += [encrypt_data64(real_id1 + salt_demog, demog_key),
54
+ encrypt_data64(real_id1 + salt_clinical, clinical_key)]
55
+ end
56
+ if current_postcode.length > 0 && birthdate.length > 0
57
+ keys += [encrypt_data64(real_id2 + salt_demog, demog_key),
58
+ encrypt_data64(real_id2 + salt_clinical, clinical_key)]
59
+ end
60
+ # TODO: Consider whether it's worth storing something, if keys would otherwise be empty.
61
+ key_bundle = keys.join(' ')
62
+ rowid = random_key
63
+ [pseudo_id1, pseudo_id2, key_bundle, rowid, demog_key, clinical_key]
64
+ end
65
+
66
+ # Generate pseudonymised identifiers and pseudonymisation keys
67
+ # for data with only an NHS number (missing patient postcode or DOB), where
68
+ # only the demographics need to be pseudonymised (e.g. prescription data).
69
+ # Returns an array of 3 strings:
70
+ # [pseudo_id1, key_bundle, demog_key]
71
+ def self.generate_keys_nhsnumber_demog_only(salt_id, salt_demog, nhsnumber)
72
+ unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\Z/
73
+ raise 'Invalid NHS number'
74
+ end
75
+ real_id1 = 'nhsnumber_' + nhsnumber
76
+
77
+ pseudo_id1 = data_hash(real_id1, salt_id)
78
+ demog_key = random_key
79
+ key_bundle = if nhsnumber.length > 0
80
+ encrypt_data64(real_id1 + salt_demog, demog_key)
81
+ else
82
+ ''
83
+ end
84
+ [pseudo_id1, key_bundle, demog_key]
85
+ end
86
+
87
+ def self.data_hash(value, salt)
88
+ Digest::SHA2.hexdigest(value.to_s + salt.to_s)
89
+ end
90
+
91
+ def self.random_key
92
+ SecureRandom.hex(32) # 32 bytes = 256 bits
93
+ end
94
+
95
+ # returns a base-64 encoded string
96
+ def self.encrypt_data64(key, data)
97
+ Base64.strict_encode64(encrypt_data(key, data))
98
+ end
99
+
100
+ # returns a binary string
101
+ def self.encrypt_data(key, data)
102
+ unless key =~ /[0-9a-f]{32}/
103
+ raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
104
+ end
105
+ aes = OpenSSL::Cipher.new('AES-256-CBC')
106
+ aes.encrypt
107
+ aes.key = Digest::SHA256.digest(key)
108
+ aes.update(data) + aes.final
109
+ end
110
+
111
+ def self.decrypt_data64(key, data)
112
+ decrypt_data(key, Base64.strict_decode64(data))
113
+ end
114
+
115
+ def self.decrypt_data(key, data)
116
+ unless key =~ /[0-9a-f]{32}/
117
+ raise(ArgumentError, 'Expected key to contain at least 256 bits of hex characters (0-9, a-f)')
118
+ end
119
+ aes = OpenSSL::Cipher.new('AES-256-CBC')
120
+ aes.decrypt
121
+ aes.key = Digest::SHA256.digest(key.chomp)
122
+ (aes.update(data) + aes.final)
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,3 @@
1
# Namespace for the ndr_pseudonymise library
module NdrPseudonymise
  # Released version of this library
  VERSION = '0.4.1'.freeze
end