ndr_pseudonymise 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ require 'digest'
2
+ require 'io/console'
3
+ require 'openssl'
4
+ require 'zlib'
5
+
6
module NdrPseudonymise
  module NdrEncrypt
    # Defines utility methods for encrypting / decrypting objects.
    #
    # Data is wrapped in a "blob <size>\0<data>" envelope, identified by the
    # SHA-256 hex digest of that envelope, zlib-compressed, and encrypted with a
    # random AES-256-CBC key which is itself encrypted with an RSA public key.
    module EncryptedObject
      # rubocop:disable Style/SlicingWithRange

      # Wrap data in a blob envelope: "blob <size>\0<data>"
      def self.blob(data)
        "blob #{data.size}\0#{data}"
      end

      # Extract the data from an envelope produced by the blob method.
      # Raises ArgumentError if the header is malformed, the NUL separator is
      # missing, or the declared size does not match the data size.
      def self.unpack_blob(blob)
        prefix, data = blob.split("\x00", 2)
        raise(ArgumentError, 'Invalid blob format') unless /\Ablob [0-9]+\z/ =~ prefix
        # A valid-looking header without a NUL separator (e.g. truncated input)
        # leaves data as nil: report a format error instead of a NoMethodError
        raise(ArgumentError, 'Invalid blob format') if data.nil?

        size = prefix[5..-1].to_i # Skip the leading "blob "
        raise(ArgumentError, 'Incorrect blob size') unless size == data.size

        data
      end

      # SHA-256 hex digest of a blob
      def self.digest(blob)
        Digest::SHA256.hexdigest(blob)
      end

      # Create zlib-compressed version of the content
      def self.compress(blob)
        Zlib::Deflate.deflate(blob)
      end

      # Unpack zlib-compressed content
      def self.decompress(contents)
        Zlib::Inflate.inflate(contents)
      end

      # Derive the identifier under which an encrypted object is stored:
      # the digest of the blob "ndr_encrypt <git_blobid> <key_name>"
      def self.encrypted_id(git_blobid, key_name: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name

        temp_id = "ndr_encrypt #{git_blobid} #{key_name}"
        digest(blob(temp_id))
      end

      # Encrypt sensitive secret data, given the filename of a public key
      # Returns the encrypted output data (or nil if secret_data is nil), laid out as:
      #   RSA-encrypted AES key + AES IV + AES-256-CBC ciphertext
      # The result can be decrypted using the decrypt method on this module.
      # TODO: write equivalent command-line method using only openssl and shell scripts
      def self.encrypt(secret_data, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        return nil unless secret_data

        public_key_data = File.read(pub_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.encrypt
        # A fresh random key and IV are generated for every call
        random_key = cipher.random_key
        cipher.key = random_key
        random_iv = cipher.random_iv
        cipher.iv = random_iv
        rawdata = cipher.update(secret_data)
        rawdata << cipher.final
        public_key = OpenSSL::PKey::RSA.new(public_key_data)
        public_key.public_encrypt(random_key) + random_iv + rawdata
      end

      # Decrypt sensitive secret data, given the filename of a private key and
      # a passin option describing where to find its passphrase (see get_passphrase)
      # Returns the decrypted output data (or nil if rawdata is nil)
      # TODO: write equivalent command-line method using only openssl and shell scripts
      # TODO: Refactor with code from era UnifiedSources::ApiRetrieval::Extractor
      def self.decrypt(rawdata, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        return nil unless rawdata

        password = get_passphrase(private_key: private_key, passin: passin)
        private_key_data = File.read(private_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.decrypt
        rsa_key = OpenSSL::PKey::RSA.new(private_key_data, password)
        # The RSA-encrypted AES key occupies as many bytes as the RSA modulus,
        # followed by a 16 byte IV, then the ciphertext
        key_size = rsa_key.n.num_bytes
        cipher.key = rsa_key.private_decrypt(rawdata[0..key_size - 1])
        cipher.iv = rawdata[key_size..key_size + 15]
        decrypted_data = cipher.update(rawdata[key_size + 16..-1])
        decrypted_data << cipher.final
      end

      # Find the passphrase for a private key file, using a subset of the
      # openssl -passin options:
      #   nil or ''  - prompt for the passphrase interactively, without echo
      #   'pass:xyz' - use the literal passphrase "xyz"
      #   'env:VAR'  - use the value of environment variable VAR
      #   'stdin'    - read one line from standard input
      # The result is cached per private key filename, so later calls for the
      # same file return the first answer, whatever passin they supply.
      # Raises ArgumentError if the key file is missing or passin is unsupported.
      def self.get_passphrase(private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        @passphrase_cache ||= {}
        return @passphrase_cache[private_key] if @passphrase_cache.key?(private_key)

        raise(ArgumentError, 'Missing private key file') unless File.exist?(private_key)

        # Implement a subset of the openssl -passin options in
        # https://www.openssl.org/docs/man3.0/man1/openssl-passphrase-options.html
        result = case passin
                 when nil, ''
                   msg = "Enter passphrase for #{private_key}: "
                   if IO.console.respond_to?(:getpass)
                     IO.console.getpass msg
                   else
                     # Fallback when IO.console does not provide getpass
                     $stdout.print msg
                     password = $stdin.noecho(&:gets).chomp
                     puts
                     password
                   end
                 when /\Apass:/
                   passin[5..-1]
                 when /\Aenv:/
                   ENV[passin[4..-1]]
                 when 'stdin'
                   $stdin.readline.chomp
                 else
                   raise(ArgumentError, 'Unsupported passin option')
                 end
        @passphrase_cache[private_key] = result
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module NdrPseudonymise
  module NdrEncrypt
    # Read access to a remote ndr_encrypt encrypted object store, fetched over
    # HTTP(S) relative to a base URL
    class RemoteRepository
      # rubocop:disable Style/SlicingWithRange

      # base_url is the URL under which encrypted objects are published
      def initialize(base_url: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :base_url') unless base_url

        @base_url = base_url
      end

      # Retrieve remote file(s) based on git_blobid
      # Downloads the encrypted object, decrypts it with the given private key
      # file (see EncryptedObject.decrypt for passin), decompresses it and
      # returns the unpacked blob data.
      # Raises ArgumentError if a keyword is missing or the download fails.
      def cat_remote(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        rawdata = retrieve_remote_url(encrypted_id)
        contents = NdrEncrypt::EncryptedObject.decrypt(rawdata, private_key: private_key,
                                                                passin: passin)
        blob = NdrEncrypt::EncryptedObject.decompress(contents)
        NdrEncrypt::EncryptedObject.unpack_blob(blob)
      end

      private

      # Retrieve remote encrypted file(s) based on encrypted_id
      def retrieve_remote_url(encrypted_id)
        res = Net::HTTP.get_response(remote_uri(encrypted_id))
        # TODO: More finegrained error messages
        raise(ArgumentError, 'Could not retrieve URL') unless res.is_a?(Net::HTTPSuccess)

        res.body
      end

      # URI of an encrypted object: <base_url>/<first 2 characters of id>/<rest of id>
      def remote_uri(encrypted_id)
        base = URI(@base_url.to_s)
        # URI.join resolves relative to the last "/" of the base path, so without
        # a trailing slash the final path segment of base_url would be dropped
        base.path = "#{base.path}/" unless base.path.end_with?('/')
        URI.join(base, "#{encrypted_id[0..1]}/#{encrypted_id[2..-1]}")
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,165 @@
1
+ require 'csv'
2
+ require 'fileutils'
3
+ require 'set'
4
+ require 'stringio'
5
+
6
module NdrPseudonymise
  module NdrEncrypt
    # Defines a local ndr_encrypt working copy: an "ndr_encrypted" directory
    # inside repo_dir, holding encrypted objects and a CSV index that maps
    # git_blobid to path
    class Repository
      # rubocop:disable Style/SlicingWithRange
      CSV_COLUMNS = %w[git_blobid path].freeze
      ENCRYPTED_DIR = 'ndr_encrypted/'.freeze

      def initialize(repo_dir: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :repo_dir') unless repo_dir

        @repo_dir = repo_dir
      end

      # Create directory structure
      # Returns true if a new index was written, false if the repository
      # already existed
      def init
        FileUtils.mkdir_p(object_dir)
        return false if valid_repository?

        CSV.open(index_filename, 'wb') { |csv| csv << CSV_COLUMNS }
        true
      end

      # Add file contents to the encrypted store and index
      # Each path is encrypted with the public key file pub_key, and a
      # [git_blobid, path] row is appended to the index
      def add(paths, key_name: nil, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        paths.each do |path|
          git_blobid, _encrypted_id = hash_object(path,
                                                  key_name: key_name, pub_key: pub_key, write: true)
          File.open(index_filename, 'ab') { |f| f << [git_blobid, path].to_csv }
        end
      end

      # Cleanup unnecessary index entries and optimize the encrypted store
      # Validates every index row, then rewrites the index sorted and without
      # duplicate rows. Progress messages are printed to output_stream.
      # Raises ArgumentError if the repository or its index is invalid.
      def gc(output_stream: StringIO.new)
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        output_stream.print('Reading index: ')
        csv_data = CSV.read(index_filename)
        header = csv_data.shift
        raise(ArgumentError, 'Invalid header in index file') unless CSV_COLUMNS == header

        count0 = csv_data.size
        output_stream.print("#{count0} entries.\nRemoving duplicates: ")
        csv_data.each.with_index do |row, i|
          unless row.size == 2 && row[0] =~ /\A[0-9a-f]+\z/
            raise(ArgumentError, "Invalid index entry on data row #{i + 1}")
          end
        end
        csv_data = csv_data.sort.uniq
        count1 = csv_data.size
        output_stream.print("#{count1} entries remaining.\nWriting objects: ")
        rewrite_index(header, csv_data)
        output_stream.puts("100% (#{count1}/#{count1}), done.\n")
        output_stream.puts("Total #{count1} (delta #{count0 - count1})")
      end

      # Retrieve local file(s) based on CSV entry
      # Each requested path is looked up in the index, decrypted with the given
      # private key file, and written back to that path.
      # Raises ArgumentError if any requested path is not in the index.
      def get(paths, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        path_set = Set.new(paths)
        paths = path_set.to_a # Keep only unique entries
        found = Set.new # index may have duplicate objects if not garbage collected
        CSV.foreach(index_filename, headers: true) do |row|
          # Only keep first matching entry for each path
          if path_set.include?(row['path'])
            found << row
            path_set.delete(row['path'])
            break if path_set.empty?
          end
        end
        raise(ArgumentError, 'Cannot find some files') unless found.size == paths.size

        found.each do |row|
          data = cat_file(row['git_blobid'], key_name: key_name, private_key: private_key,
                                             passin: passin)
          File.open(row['path'], 'wb') { |f| f << data }
        end
      end

      # Compute object IDs and optionally creates an encrypted object from a file
      # Returns [git_blobid, encrypted_id]
      # When write is truthy, the compressed, encrypted blob is stored under
      # objects/<first 2 characters of encrypted_id>/<rest of encrypted_id>
      def hash_object(path, key_name: nil, pub_key: nil, write: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key

        data = File.binread(path)
        blob = NdrEncrypt::EncryptedObject.blob(data)
        git_blobid = NdrEncrypt::EncryptedObject.digest(blob)
        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        if write
          encrypted_dir = File.join(object_dir, encrypted_id[0..1])
          encrypted_filename = File.join(encrypted_dir, encrypted_id[2..-1])
          unless File.exist?(encrypted_filename) # Don't override existing file
            contents = NdrEncrypt::EncryptedObject.compress(blob)
            encrypted_contents = NdrEncrypt::EncryptedObject.encrypt(contents, pub_key: pub_key)
            FileUtils.mkdir_p(encrypted_dir)
            File.open(encrypted_filename, 'wb') { |f| f << encrypted_contents }
          end
        end
        [git_blobid, encrypted_id]
      end

      # Retrieve local file(s) based on git_blobid
      # Returns the decrypted, decompressed and unpacked file data.
      # Raises ArgumentError if the object is not in the encrypted store.
      def cat_file(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        encrypted_filename = File.join(object_dir, encrypted_id[0..1], encrypted_id[2..-1])
        unless File.exist?(encrypted_filename)
          raise(ArgumentError, 'File does not exist in encrypted storage')
        end

        rawdata = File.binread(encrypted_filename)
        contents = NdrEncrypt::EncryptedObject.decrypt(rawdata, private_key: private_key,
                                                                passin: passin)
        blob = NdrEncrypt::EncryptedObject.decompress(contents)
        NdrEncrypt::EncryptedObject.unpack_blob(blob)
      end

      private

      # Replace the index file with the given header and rows
      def rewrite_index(header, rows)
        # Move aside index file temporarily to reduce race conditions
        # Note: should use a proper lock file for all index interactions
        orig_filename = "#{index_filename}.orig"
        temp_filename = "#{index_filename}.new"
        FileUtils.mv(index_filename, orig_filename)
        begin
          CSV.open(temp_filename, 'wb') do |csv|
            csv << header
            rows.each { |row| csv << row }
          end
          FileUtils.mv(temp_filename, index_filename)
        rescue StandardError
          # Put the original index back, so that a failed rewrite does not
          # leave the repository without an index
          FileUtils.mv(orig_filename, index_filename) unless File.exist?(index_filename)
          raise
        end
        FileUtils.rm(orig_filename)
      end

      # Does the repository have a valid structure
      def valid_repository?
        Dir.exist?(object_dir) && File.exist?(index_filename)
      end

      def object_dir
        File.join(@repo_dir, ENCRYPTED_DIR, 'objects')
      end

      def index_filename
        File.join(@repo_dir, ENCRYPTED_DIR, 'index.csv')
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'ndr_pseudonymise/ndr_encrypt/command_line'
2
+ require 'ndr_pseudonymise/ndr_encrypt/encrypted_object'
3
+ require 'ndr_pseudonymise/ndr_encrypt/remote_repository'
4
+ require 'ndr_pseudonymise/ndr_encrypt/repository'
5
+
6
+ module NdrPseudonymise
7
+ # Namespace for the ndr_encrypt utilities and command line tool for an encrypted object store; this file only declares the module, the implementation is in the files required above
8
+ module NdrEncrypt
9
+ end
10
+ end
@@ -0,0 +1,71 @@
1
+ # Fast, simple pseudonymisation of prescription data with a very controlled
2
+ # format.
3
+ # Only the first 2 fields are potentially identifiable: nhs number and date of
4
+ # birth.
5
+
6
+ require 'ndr_pseudonymise/simple_pseudonymisation'
7
+ require 'ndr_pseudonymise/pseudonymisation_specification'
8
+
9
+ require 'json'
10
+
11
module NdrPseudonymise
  # Pseudonymise prescription data
  # Expects the nhs number in column 0 and the date of birth in column 1;
  # all remaining columns are passed through unchanged.
  class PrescriptionPseudonymiser < PseudonymisationSpecification
    PREAMBLE_V2_DEMOG_ONLY = 'Pseudonymised matching data v2.0-demog-only'.freeze

    # Raises RuntimeError unless the format specification lists columns 0 and 1
    # (and only those) as demographics
    def initialize(format_spec, key_bundle)
      super
      return if @format_spec[:demographics] == [0, 1]

      raise 'Invalid specification: expected nhsnumber and birthdate in first 2 columns'
    end

    # Validate a row of prescription data
    # Returns nil if this row is a valid data row, otherwise raises a
    # RuntimeError describing the first problem found:
    # - the NHS number must be a String of exactly 10 digits
    # - the birthdate must be a String that is empty or formatted as YYYY-MM-DD
    def row_errors2(row)
      # Not significantly faster than optimised general #row_errors method
      (nhsnumber, birthdate) = row[0..1]
      # Anchor with \z rather than \Z: \Z would also accept a trailing newline
      unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/
        raise 'Invalid NHS number'
      end
      raise 'Missing NHS number' if nhsnumber.size < 10
      unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\z/
        raise 'Invalid birthdate'
      end
    end

    # Pseudonymise a row of prescription data, returning an array of a single row:
    # [[packed_pseudoid_and_demographics, clinical_data1, ...]]
    # Where packed_pseudoid_and_demographics consists of
    # "pseudo_id1 (key_bundle) encrypted_demographics"
    def pseudonymise_row(row)
      @key_cache ||= {} # Cache pseudonymisation keys for more compact import
      all_demographics = { 'nhsnumber' => row[0], 'birthdate' => row[1] }
      # The JSON form is both the cache key and the plaintext that gets encrypted
      demographics_json = all_demographics.to_json
      if @key_cache.key?(demographics_json)
        pseudo_id1, key_bundle, demog_key = @key_cache[demographics_json]
      else
        pseudo_id1, key_bundle, demog_key =
          NdrPseudonymise::SimplePseudonymisation.
          generate_keys_nhsnumber_demog_only(@salt1, @salt2, row[0])
        # Only cache rows with both demographics present
        if !row[0].to_s.empty? && !row[1].to_s.empty? # && false to stop caching
          @key_cache = {} if @key_cache.size > 10000 # Limit cache size
          @key_cache[demographics_json] = [pseudo_id1, key_bundle, demog_key]
        end
      end
      encrypted_demographics =
        NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, demographics_json)
      packed_pseudoid_and_demographics = format('%s (%s) %s', pseudo_id1, key_bundle,
                                                encrypted_demographics)
      [[packed_pseudoid_and_demographics] + row[2..-1]]
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V2_DEMOG_ONLY]
    end

    # Append the output of pseudonymise_row to a CSV file
    def emit_csv_rows(out_csv, pseudonymised_row)
      out_csv << pseudonymised_row[0]
    end
  end
end
@@ -0,0 +1,53 @@
1
module NdrPseudonymise
  # Log percentage progress on pseudonymisation
  # Starts logging after 1 minute or 5%, then at 5% / 5 minute intervals
  class ProgressPrinter
    # Logs progress to the given stream (default $stdout)
    # If verbose = false, only log percentages on a single line
    # If verbose = true, log verbose output
    # If verbose = :dynamic, act like verbose = false, but if the total time is
    # more than 5 minutes, move into verbose = true mode
    def initialize(dest = $stdout, verbose = false)
      @dest = dest
      @verbose = verbose
      @last_percent = 0
      @last_log = current_time - (60 * 4) # First log entry after 1 minute
    end

    # Prints progress to the destination stream, if a log entry is due:
    # i.e. on crossing a 5% boundary, 5 minutes after the previous entry,
    # or on reaching 100%.
    # start_time and time_now are Time objects; progress and total are counts.
    # parameter _csv_row is not used.
    def log_progress(start_time, time_now, _csv_row, progress, total)
      current_percentage = total == 0 ? 0 : (progress * 100 / total).to_i
      now = current_time
      due = (current_percentage / 5 > @last_percent / 5) || # Log at 5% / 5 minute intervals
            (now - @last_log >= 60 * 5) || current_percentage == 100
      return unless due

      if @verbose == :dynamic && (time_now - start_time >= 60 * 5)
        # Long-running job: switch permanently to verbose output
        @verbose = true
        @dest << '...'
      end
      if @verbose == true
        # Estimate the completion time by extrapolating the rate so far
        tfin = if progress > 0
                 time_now + (time_now - start_time) * (total - progress) / progress
               end
        completion = tfin ? ", expected completion #{tfin}" : ''
        @dest << format("Completed %s%% in %.1f minutes%s\n",
                        current_percentage, (now - start_time) / 60.0, completion)
      else
        @dest << "#{'...' if @last_percent > 0}#{current_percentage}%"
        @dest << "\n" if current_percentage == 100
      end
      # if current_percentage == 100 # Uncomment for performance debugging
      #   @dest << "Finished %s rows in %.3f secs\n" % [csv_row, time_now - start_time]
      # end
      @dest.flush
      @last_percent = current_percentage
      @last_log = now
    end

    private

    # Current time: uses Time.current when it is defined (it is not part of
    # core Ruby), otherwise Time.now. Only time differences are used, so the
    # time zone of the result does not matter.
    def current_time
      Time.respond_to?(:current) ? Time.current : Time.now
    end
  end
end