ndr_pseudonymise 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,124 @@
1
+ require 'digest'
2
+ require 'io/console'
3
+ require 'openssl'
4
+ require 'zlib'
5
+
6
module NdrPseudonymise
  module NdrEncrypt
    # Defines utility methods for encrypting / decrypting objects
    #
    # Data is wrapped as "blob <size>\0<data>", identified by the SHA-256 hex
    # digest of that wrapper, zlib-compressed, and encrypted with a random
    # AES-256-CBC key which is itself RSA-encrypted with the public key.
    module EncryptedObject
      # rubocop:disable Style/SlicingWithRange

      # Wrap data in a "blob <size>\0<data>" envelope.
      # The size is the length in bytes (not characters), so that the header
      # still matches once the blob has been through the binary compress /
      # decompress steps, even if data was multi-byte text.
      def self.blob(data)
        "blob #{data.bytesize}\0#{data}"
      end

      # Reverse of .blob: check the envelope and return the wrapped data.
      # Raises ArgumentError if the header is malformed, the NUL separator is
      # missing, or the recorded size does not match the data's byte length.
      def self.unpack_blob(blob)
        prefix, data = blob.split("\x00", 2)
        # data is nil when there is no NUL separator at all
        raise(ArgumentError, 'Invalid blob format') unless data && /\Ablob [0-9]+\z/ =~ prefix

        size = prefix[5..-1].to_i
        raise(ArgumentError, 'Incorrect blob size') unless size == data.bytesize

        data
      end

      # SHA-256 hex digest of the given (already wrapped) blob
      def self.digest(blob)
        Digest::SHA256.hexdigest(blob)
      end

      # Create zlib-compressed version of the content
      def self.compress(blob)
        Zlib::Deflate.deflate(blob)
      end

      # Unpack zlib-compressed content
      def self.decompress(contents)
        Zlib::Inflate.inflate(contents)
      end

      # Derive the identifier under which an object is stored, from its
      # git_blobid and the name of the key it is encrypted with.
      # Raises ArgumentError if key_name is missing.
      def self.encrypted_id(git_blobid, key_name: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name

        temp_id = "ndr_encrypt #{git_blobid} #{key_name}"
        # Deliberately keeps the historical character-count header (rather than
        # calling .blob) so ids of already-stored objects never change, even for
        # key names containing multi-byte characters.
        digest("blob #{temp_id.size}\0#{temp_id}")
      end

      # Encrypt sensitive secret data, given the filename of a public key
      # Returns the encrypted output data, laid out as:
      #   RSA-encrypted AES key + AES IV + AES-256-CBC ciphertext
      # Returns nil if secret_data is nil.
      # The result can be decrypted using the decrypt method on this module.
      # TODO: write equivalent command-line method using only openssl and shell scripts
      def self.encrypt(secret_data, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        return nil unless secret_data

        public_key_data = File.read(pub_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.encrypt
        # A fresh random key and IV are generated for every call
        cipher.key = random_key = cipher.random_key
        cipher.iv = random_iv = cipher.random_iv
        rawdata = cipher.update(secret_data)
        rawdata << cipher.final
        public_key = OpenSSL::PKey::RSA.new(public_key_data)
        public_key.public_encrypt(random_key) + random_iv + rawdata
      end

      # Decrypt sensitive secret data, given the filename of a private key and
      # a passin option describing where to get its passphrase (see get_passphrase)
      # Returns the decrypted output data, or nil if rawdata is nil.
      # TODO: write equivalent command-line method using only openssl and shell scripts
      # TODO: Refactor with code from era UnifiedSources::ApiRetrieval::Extractor
      def self.decrypt(rawdata, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        return nil unless rawdata

        password = get_passphrase(private_key: private_key, passin: passin)
        private_key_data = File.read(private_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.decrypt
        rsa_key = OpenSSL::PKey::RSA.new(private_key_data, password)
        # The RSA-encrypted AES key occupies exactly one modulus length
        key_size = rsa_key.n.num_bytes
        cipher.key = rsa_key.private_decrypt(rawdata[0..key_size - 1])
        # The next 16 bytes are the IV; everything after that is ciphertext
        cipher.iv = rawdata[key_size..key_size + 15]
        decrypted_data = cipher.update(rawdata[key_size + 16..-1])
        decrypted_data << cipher.final
      end

      # Obtain the passphrase for a private key file.
      # passin supports a subset of the openssl -passin options:
      #   nil / ''    prompt on the console without echo
      #   pass:VALUE  use VALUE
      #   env:NAME    use the value of environment variable NAME
      #   stdin       read one line from standard input
      # The result is cached per private_key filename; on a cache hit the
      # passin argument is not consulted again.
      # Raises ArgumentError if the key file does not exist or passin is unsupported.
      def self.get_passphrase(private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        @passphrase_cache ||= {}
        return @passphrase_cache[private_key] if @passphrase_cache.key?(private_key)

        raise(ArgumentError, 'Missing private key file') unless File.exist?(private_key)

        # Implement a subset of the openssl -passin options in
        # https://www.openssl.org/docs/man3.0/man1/openssl-passphrase-options.html
        result = case passin
                 when nil, ''
                   msg = "Enter passphrase for #{private_key}: "
                   if IO.console.respond_to?(:getpass)
                     IO.console.getpass msg
                   else
                     # Fallback for rubies whose io/console lacks #getpass
                     $stdout.print msg
                     password = $stdin.noecho(&:gets).chomp
                     puts
                     password
                   end
                 when /\Apass:/
                   passin[5..-1]
                 when /\Aenv:/
                   ENV[passin[4..-1]]
                 when 'stdin'
                   $stdin.readline.chomp
                 else
                   raise(ArgumentError, 'Unsupported passin option')
                 end
        @passphrase_cache[private_key] = result
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module NdrPseudonymise
  module NdrEncrypt
    # Retrieves objects from an ndr_encrypt encrypted store published under a
    # base URL, fetching them with Net::HTTP
    class RemoteRepository
      # rubocop:disable Style/SlicingWithRange

      # base_url is the URL under which the encrypted objects are laid out as
      # <first 2 characters of encrypted_id>/<rest of encrypted_id>.
      # Raises ArgumentError if base_url is missing.
      def initialize(base_url: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :base_url') unless base_url

        @base_url = base_url
      end

      # Retrieve remote file(s) based on git_blobid
      # Downloads the encrypted object, decrypts it with the given private key
      # file (passin as for EncryptedObject.get_passphrase), decompresses it and
      # returns the unwrapped data.
      # Raises ArgumentError if a keyword is missing or the download fails.
      def cat_remote(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        rawdata = retrieve_remote_url(encrypted_id)
        contents = NdrEncrypt::EncryptedObject.decrypt(rawdata, private_key: private_key,
                                                                passin: passin)
        blob = NdrEncrypt::EncryptedObject.decompress(contents)
        NdrEncrypt::EncryptedObject.unpack_blob(blob)
      end

      private

      # Build the URI of an encrypted object below the base URL.
      # URI.join resolves relative references, so a base URL without a trailing
      # slash would lose its last path segment; treat the base as a directory.
      def object_uri(encrypted_id)
        base = @base_url.to_s
        base += '/' unless base.end_with?('/')
        URI.join(base, "#{encrypted_id[0..1]}/#{encrypted_id[2..-1]}")
      end

      # Retrieve remote encrypted file(s) based on encrypted_id
      # Returns the response body; raises ArgumentError unless the response is
      # a Net::HTTPSuccess (redirects are not followed).
      def retrieve_remote_url(encrypted_id)
        res = Net::HTTP.get_response(object_uri(encrypted_id))
        # TODO: More finegrained error messages
        raise(ArgumentError, 'Could not retrieve URL') unless res.is_a?(Net::HTTPSuccess)

        res.body
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,165 @@
1
+ require 'csv'
2
+ require 'fileutils'
3
+ require 'set'
4
+ require 'stringio'
5
+
6
module NdrPseudonymise
  module NdrEncrypt
    # Defines a local ndr_encrypt working copy
    #
    # Below repo_dir the store keeps ndr_encrypted/index.csv (git_blobid, path
    # rows) and encrypted objects in
    # ndr_encrypted/objects/<first 2 chars of encrypted_id>/<rest of encrypted_id>
    class Repository
      # rubocop:disable Style/SlicingWithRange
      CSV_COLUMNS = %w[git_blobid path].freeze
      ENCRYPTED_DIR = 'ndr_encrypted/'.freeze

      # Raises ArgumentError if repo_dir is missing.
      def initialize(repo_dir: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :repo_dir') unless repo_dir

        @repo_dir = repo_dir
      end

      # Create directory structure
      # Returns false if the store already existed, true if a new (header-only)
      # index was written.
      def init
        FileUtils.mkdir_p(object_dir)
        return false if valid_repository?

        CSV.open(index_filename, 'wb') { |csv| csv << CSV_COLUMNS }
        true
      end

      # Add file contents to the encrypted store and index
      # Each path is encrypted with the public key file pub_key, and a
      # [git_blobid, path] row is appended to the index.
      def add(paths, key_name: nil, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        paths.each do |path|
          git_blobid, _encrypted_id = hash_object(path,
                                                  key_name: key_name, pub_key: pub_key, write: true)
          File.open(index_filename, 'ab') { |f| f << [git_blobid, path].to_csv }
        end
      end

      # Cleanup unnecessary index entries and optimize the encrypted store
      # Rewrites the index sorted and without duplicate rows, reporting progress
      # on output_stream. Raises ArgumentError if the store or index is invalid.
      def gc(output_stream: StringIO.new)
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        output_stream.print('Reading index: ')
        csv_data = CSV.read(index_filename)
        header = csv_data.shift
        raise(ArgumentError, 'Invalid header in index file') unless CSV_COLUMNS == header

        count0 = csv_data.size
        output_stream.print("#{count0} entries.\nRemoving duplicates: ")
        csv_data.each.with_index do |row, i|
          unless row.size == 2 && row[0] =~ /\A[0-9a-f]+\z/
            raise(ArgumentError, "Invalid index entry on data row #{i + 1}")
          end
        end
        # Sort on stringified cells: a row with an empty path parses as [id, nil],
        # and Array#<=> cannot compare nil with a String.
        csv_data = csv_data.uniq.sort_by { |row| row.map(&:to_s) }
        count1 = csv_data.size
        output_stream.print("#{count1} entries remaining.\nWriting objects: ")
        rewrite_index(header, csv_data)
        output_stream.puts("100% (#{count1}/#{count1}), done.\n")
        output_stream.puts("Total #{count1} (delta #{count0 - count1})")
      end

      # Retrieve local file(s) based on CSV entry
      # Decrypts the first index entry for each requested path and writes the
      # data to that path. Raises ArgumentError if any path is not in the index.
      def get(paths, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        path_set = Set.new(paths)
        paths = path_set.to_a # Keep only unique entries
        found = Set.new # index may have duplicate objects if not garbage collected
        CSV.foreach(index_filename, headers: true) do |row|
          # Only keep first matching entry for each path
          if path_set.include?(row['path'])
            found << row
            path_set.delete(row['path'])
            break if path_set.empty?
          end
        end
        raise(ArgumentError, 'Cannot find some files') unless found.size == paths.size

        found.each do |row|
          data = cat_file(row['git_blobid'], key_name: key_name, private_key: private_key,
                                             passin: passin)
          File.open(row['path'], 'wb') { |f| f << data }
        end
      end

      # Compute object IDs and optionally creates an encrypted object from a file
      # Returns [git_blobid, encrypted_id]
      # With write set, the compressed and encrypted blob is stored unless an
      # object with the same encrypted_id already exists.
      def hash_object(path, key_name: nil, pub_key: nil, write: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key

        data = File.binread(path)
        blob = NdrEncrypt::EncryptedObject.blob(data)
        git_blobid = NdrEncrypt::EncryptedObject.digest(blob)
        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        if write
          encrypted_dir = File.join(object_dir, encrypted_id[0..1])
          encrypted_filename = File.join(encrypted_dir, encrypted_id[2..-1])
          unless File.exist?(encrypted_filename) # Don't override existing file
            contents = NdrEncrypt::EncryptedObject.compress(blob)
            encrypted_contents = NdrEncrypt::EncryptedObject.encrypt(contents, pub_key: pub_key)
            FileUtils.mkdir_p(encrypted_dir)
            File.open(encrypted_filename, 'wb') { |f| f << encrypted_contents }
          end
        end
        [git_blobid, encrypted_id]
      end

      # Retrieve local file(s) based on git_blobid
      # Returns the decrypted, decompressed and unwrapped data.
      # Raises ArgumentError if the object is not in the encrypted storage.
      def cat_file(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        encrypted_filename = File.join(object_dir, encrypted_id[0..1], encrypted_id[2..-1])
        unless File.exist?(encrypted_filename)
          raise(ArgumentError, 'File does not exist in encrypted storage')
        end

        rawdata = File.binread(encrypted_filename)
        contents = NdrEncrypt::EncryptedObject.decrypt(rawdata, private_key: private_key,
                                                                passin: passin)
        blob = NdrEncrypt::EncryptedObject.decompress(contents)
        NdrEncrypt::EncryptedObject.unpack_blob(blob)
      end

      private

      # Replace the index file with the given header and rows.
      # If writing fails, the original index is put back, so that the store is
      # not left without an index.
      def rewrite_index(header, rows)
        # Move aside index file temporarily to reduce race conditions
        # Note: should use a proper lock file for all index interactions
        orig_filename = "#{index_filename}.orig"
        temp_filename = "#{index_filename}.new"
        FileUtils.mv(index_filename, orig_filename)
        begin
          CSV.open(temp_filename, 'wb') do |csv|
            csv << header
            rows.each { |row| csv << row }
          end
          FileUtils.mv(temp_filename, index_filename)
        rescue StandardError
          FileUtils.rm_f(temp_filename)
          FileUtils.mv(orig_filename, index_filename) unless File.exist?(index_filename)
          raise
        end
        FileUtils.rm(orig_filename)
      end

      # Does the repository have a valid structure
      def valid_repository?
        Dir.exist?(object_dir) && File.exist?(index_filename)
      end

      # Directory holding the encrypted objects
      def object_dir
        File.join(@repo_dir, ENCRYPTED_DIR, 'objects')
      end

      # CSV index mapping git_blobid to path
      def index_filename
        File.join(@repo_dir, ENCRYPTED_DIR, 'index.csv')
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'ndr_pseudonymise/ndr_encrypt/command_line'
2
+ require 'ndr_pseudonymise/ndr_encrypt/encrypted_object'
3
+ require 'ndr_pseudonymise/ndr_encrypt/remote_repository'
4
+ require 'ndr_pseudonymise/ndr_encrypt/repository'
5
+
6
+ module NdrPseudonymise
7
+ # Namespace for ndr_encrypt: utilities and a command line tool for an encrypted
+ # object store. This file only declares the module; the require lines above load
+ # its command_line, encrypted_object, remote_repository and repository parts.
8
+ module NdrEncrypt
9
+ end
10
+ end
@@ -0,0 +1,71 @@
1
+ # Fast, simple pseudonymisation of prescription data with a very controlled
2
+ # format.
3
+ # Only the first 2 fields are potentially identifiable: nhs number and date of
4
+ # birth.
5
+
6
+ require 'ndr_pseudonymise/simple_pseudonymisation'
7
+ require 'ndr_pseudonymise/pseudonymisation_specification'
8
+
9
+ require 'json'
10
+
11
module NdrPseudonymise
  # Pseudonymise prescription data
  class PrescriptionPseudonymiser < PseudonymisationSpecification
    PREAMBLE_V2_DEMOG_ONLY = 'Pseudonymised matching data v2.0-demog-only'.freeze

    # Raises RuntimeError unless the format specification lists columns 0 and 1
    # (nhsnumber and birthdate) as the demographics.
    # NOTE(review): @format_spec is presumably set by the superclass initializer.
    def initialize(format_spec, key_bundle)
      super
      return if @format_spec[:demographics] == [0, 1]

      raise 'Invalid specification: expected nhsnumber and birthdate in first 2 columns'
    end

    # Validate a row of prescription data
    # Returns nil if the first two columns are acceptable; otherwise raises a
    # RuntimeError describing the first problem found.
    # The NHS number must be a String of exactly 10 digits; the birthdate must
    # be a String that is either empty or in the form YYYY-MM-DD.
    def row_errors2(row)
      # Not significantly faster than optimised general #row_errors method
      nhsnumber, birthdate = row[0..1]
      # \z (not \Z) so that a value with a trailing newline is rejected
      unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/
        raise 'Invalid NHS number'
      end
      raise 'Missing NHS number' if nhsnumber.size < 10
      unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\z/
        raise 'Invalid birthdate'
      end
    end

    # Pseudonymise a row of prescription data, returning an array of a single row:
    # [[packed_pseudoid_and_demographics, clinical_data1, ...]]
    # Where packed_pseudoid_and_demographics consists of
    # "pseudo_id1 (key_bundle) encrypted_demographics"
    def pseudonymise_row(row)
      @key_cache ||= {} # Cache pseudonymisation keys for more compact import
      demographics_json = { 'nhsnumber' => row[0], 'birthdate' => row[1] }.to_json
      if @key_cache.key?(demographics_json)
        pseudo_id1, key_bundle, demog_key = @key_cache[demographics_json]
      else
        pseudo_id1, key_bundle, demog_key =
          NdrPseudonymise::SimplePseudonymisation.generate_keys_nhsnumber_demog_only(
            @salt1, @salt2, row[0]
          )
        # Only cache rows where both demographic fields are present
        if !row[0].to_s.empty? && !row[1].to_s.empty? # && false to stop caching
          @key_cache = {} if @key_cache.size > 10000 # Limit cache size
          @key_cache[demographics_json] = [pseudo_id1, key_bundle, demog_key]
        end
      end
      encrypted_demographics =
        NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, demographics_json)
      packed_pseudoid_and_demographics = format('%s (%s) %s', pseudo_id1, key_bundle,
                                                encrypted_demographics)
      [[packed_pseudoid_and_demographics] + row[2..-1]]
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V2_DEMOG_ONLY]
    end

    # Append the output of pseudonymise_row to a CSV file
    def emit_csv_rows(out_csv, pseudonymised_row)
      out_csv << pseudonymised_row[0]
    end
  end
end
@@ -0,0 +1,53 @@
1
module NdrPseudonymise
  # Log percentage progress on pseudonymisation
  # Starts logging after 1 minute or 5%, then at 5% / 5 minute intervals
  class ProgressPrinter
    # Logs progress to the given stream (default $stdout)
    # If verbose = false, only log percentages on a single line
    # If verbose = true, log verbose output
    # If verbose = :dynamic, act like verbose = false, but if the total time is
    # more than 5 minutes, move into verbose = true mode
    def initialize(dest = $stdout, verbose = false)
      @dest = dest
      @verbose = verbose
      @last_percent = 0
      @last_log = current_time - (60 * 4) # First log entry after 1 minute
    end

    # Prints progress to the destination stream when a log entry is due, i.e.
    # when a new 5% band is reached, 5 minutes have passed since the last
    # entry, or progress is 100%.
    # start_time and time_now are the caller's start and current times;
    # progress and total are counts (a total of 0 is reported as 0%).
    # parameter _csv_row is not used.
    def log_progress(start_time, time_now, _csv_row, progress, total)
      current_percentage = total == 0 ? 0 : (progress * 100 / total).to_i
      now = current_time
      if (current_percentage / 5 > @last_percent / 5) || # Log at 5% / 5 minute intervals
         (now - @last_log >= 60 * 5) || current_percentage == 100
        if @verbose == :dynamic && (time_now - start_time >= 60 * 5)
          @verbose = true
          @dest << '...'
        end
        if @verbose == true
          # Estimate the finish time by assuming the remaining items take as
          # long per item as those processed so far
          tfin = if progress > 0
                   time_now + (time_now - start_time) * (total - progress) / progress
                 end
          completion = tfin ? ", expected completion #{tfin}" : ''
          @dest << format("Completed %s%% in %.1f minutes%s\n",
                          current_percentage, (now - start_time) / 60.0, completion)
        else
          @dest << "#{'...' if @last_percent > 0}#{current_percentage}%"
          @dest << "\n" if current_percentage == 100
        end
        # if current_percentage == 100 # Uncomment for performance debugging
        #   @dest << "Finished %s rows in %.3f secs\n" % [csv_row, time_now - start_time]
        # end
        @dest.flush
        @last_percent = current_percentage
        @last_log = now
      end
    end

    private

    # Time.current is only available when ActiveSupport's Time extensions are
    # loaded; fall back to Time.now so the class also works without them.
    def current_time
      Time.respond_to?(:current) ? Time.current : Time.now
    end
  end
end