ndr_pseudonymise 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +64 -0
- data/Rakefile +14 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/ndr_pseudonymise/client.rb +115 -0
- data/lib/ndr_pseudonymise/demographics_only_pseudonymiser.rb +88 -0
- data/lib/ndr_pseudonymise/engine.rb +6 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/command_line.rb +194 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/encrypted_object.rb +124 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/remote_repository.rb +44 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/repository.rb +165 -0
- data/lib/ndr_pseudonymise/ndr_encrypt.rb +10 -0
- data/lib/ndr_pseudonymise/prescription_pseudonymiser.rb +71 -0
- data/lib/ndr_pseudonymise/progress_printer.rb +53 -0
- data/lib/ndr_pseudonymise/pseudonymisation_specification.rb +379 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_converter.rb +92 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_wrapper.rb +96 -0
- data/lib/ndr_pseudonymise/simple_pseudonymisation.rb +125 -0
- data/lib/ndr_pseudonymise/version.rb +3 -0
- data/lib/ndr_pseudonymise.rb +16 -0
- data/lib/rsa_aes_cbc.rb +114 -0
- data/ndr_pseudonymise.gemspec +36 -0
- data/script/ndr_encrypt/README.md +154 -0
- data/script/ndr_encrypt/ndr_encrypt +4 -0
- metadata +197 -0
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'digest'
|
2
|
+
require 'io/console'
|
3
|
+
require 'openssl'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
module NdrPseudonymise
  module NdrEncrypt
    # Utility methods for packing, hashing, compressing, encrypting and
    # decrypting objects
    module EncryptedObject
      # rubocop:disable Style/SlicingWithRange

      # Wrap data in a blob envelope: "blob <size>\0<data>", where <size> is
      # data.size. unpack_blob is the inverse operation.
      def self.blob(data)
        "blob #{data.size}\0#{data}"
      end

      # Validate a blob envelope and return the payload it wraps.
      # Raises ArgumentError if the header is not "blob <digits>", if the NUL
      # separator is missing, or if the declared size does not match the payload.
      def self.unpack_blob(blob)
        prefix, data = blob.split("\x00", 2)
        raise(ArgumentError, 'Invalid blob format') unless /\Ablob [0-9]+\z/ =~ prefix
        # With no NUL separator, split yields a header but no payload element
        raise(ArgumentError, 'Invalid blob format') unless data

        size = prefix[5..-1].to_i # skip the leading "blob "
        raise(ArgumentError, 'Incorrect blob size') unless size == data.size

        data
      end

      # SHA-256 hex digest of the given blob
      def self.digest(blob)
        Digest::SHA256.hexdigest(blob)
      end

      # Create zlib-compressed version of the content
      def self.compress(blob)
        Zlib::Deflate.deflate(blob)
      end

      # Unpack zlib-compressed content
      def self.decompress(contents)
        Zlib::Inflate.inflate(contents)
      end

      # Derive the id under which an object is stored, from the object's
      # git_blobid and the name of the key it is encrypted with.
      # Raises ArgumentError if key_name is missing.
      def self.encrypted_id(git_blobid, key_name: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name

        digest(blob("ndr_encrypt #{git_blobid} #{key_name}"))
      end

      # Encrypt sensitive secret data, given the path of a public key file.
      # Returns the encrypted output data (or nil if secret_data is nil), laid out as:
      #   RSA-encrypted random AES key + 16 byte IV + AES-256-CBC ciphertext
      # The result can be decrypted using the decrypt method on this module.
      # TODO: write equivalent command-line method using only openssl and shell scripts
      def self.encrypt(secret_data, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        return nil unless secret_data

        public_key_data = File.read(pub_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.encrypt
        cipher.key = random_key = cipher.random_key
        cipher.iv = random_iv = cipher.random_iv
        rawdata = cipher.update(secret_data)
        rawdata << cipher.final
        public_key = OpenSSL::PKey::RSA.new(public_key_data)
        public_key.public_encrypt(random_key) + random_iv + rawdata
      end

      # Decrypt sensitive secret data, given the path of a private key file
      # and a passin option describing where to get its passphrase
      # (see get_passphrase).
      # Returns the decrypted output data, or nil if rawdata is nil.
      # TODO: write equivalent command-line method using only openssl and shell scripts
      # TODO: Refactor with code from era UnifiedSources::ApiRetrieval::Extractor
      def self.decrypt(rawdata, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        return nil unless rawdata

        password = get_passphrase(private_key: private_key, passin: passin)
        private_key_data = File.read(private_key)
        cipher = OpenSSL::Cipher.new('aes-256-cbc')
        cipher.decrypt
        rsa_key = OpenSSL::PKey::RSA.new(private_key_data, password)
        # The RSA-encrypted AES key occupies as many bytes as the RSA modulus
        key_size = rsa_key.n.num_bytes
        cipher.key = rsa_key.private_decrypt(rawdata[0..(key_size - 1)])
        cipher.iv = rawdata[key_size..(key_size + 15)]
        decrypted_data = cipher.update(rawdata[(key_size + 16)..-1])
        decrypted_data << cipher.final
      end

      # Obtain the passphrase for a private key file, caching it per key file
      # so that it is only requested once.
      # Supported passin values:
      #   nil or ''    prompt for the passphrase without echoing it
      #   'pass:xyz'   use the literal passphrase xyz
      #   'env:NAME'   read the passphrase from environment variable NAME
      #   'stdin'      read one line from standard input
      # Raises ArgumentError if private_key is missing, if the key file does
      # not exist, or if the passin option is not supported.
      def self.get_passphrase(private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        @passphrase_cache ||= {}
        return @passphrase_cache[private_key] if @passphrase_cache.key?(private_key)

        raise(ArgumentError, 'Missing private key file') unless File.exist?(private_key)

        # Implement a subset of the openssl -passin options in
        # https://www.openssl.org/docs/man3.0/man1/openssl-passphrase-options.html
        result = case passin
                 when nil, ''
                   msg = "Enter passphrase for #{private_key}: "
                   if IO.console.respond_to?(:getpass)
                     IO.console.getpass msg
                   else
                     $stdout.print msg
                     password = $stdin.noecho(&:gets).chomp
                     puts
                     password
                   end
                 when /\Apass:/
                   passin[5..-1]
                 when /\Aenv:/
                   ENV[passin[4..-1]]
                 when 'stdin'
                   $stdin.readline.chomp
                 else
                   raise(ArgumentError, 'Unsupported passin option')
                 end
        @passphrase_cache[private_key] = result
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module NdrPseudonymise
  module NdrEncrypt
    # Reads objects from a remote ndr_encrypt object store, fetched with
    # Net::HTTP relative to a base URL
    class RemoteRepository
      # rubocop:disable Style/SlicingWithRange

      # base_url is the URL that object locations are resolved against.
      # Raises ArgumentError if base_url is missing.
      def initialize(base_url: nil)
        # Ruby 2.0 must be supported, so required keyword syntax is unavailable
        raise(ArgumentError, 'missing keyword: :base_url') unless base_url

        @base_url = base_url
      end

      # Fetch the remote object for git_blobid, then decrypt, decompress and
      # unpack it, returning the original data.
      # Raises ArgumentError if key_name or private_key is missing.
      def cat_remote(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # Ruby 2.0 must be supported, so required keyword syntax is unavailable
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        codec = NdrEncrypt::EncryptedObject
        remote_id = codec.encrypted_id(git_blobid, key_name: key_name)
        encrypted = retrieve_remote_url(remote_id)
        compressed = codec.decrypt(encrypted, private_key: private_key, passin: passin)
        codec.unpack_blob(codec.decompress(compressed))
      end

      private

      # Download the raw encrypted object for encrypted_id. The first two
      # characters of the id form the directory, the remainder the file name.
      # Raises ArgumentError unless the response is a Net::HTTPSuccess.
      def retrieve_remote_url(encrypted_id)
        directory = encrypted_id[0..1]
        basename = encrypted_id[2..-1]
        response = Net::HTTP.get_response(URI.join(@base_url, "#{directory}/#{basename}"))
        # TODO: More finegrained error messages
        return response.body if response.is_a?(Net::HTTPSuccess)

        raise(ArgumentError, 'Could not retrieve URL')
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'set'
|
4
|
+
require 'stringio'
|
5
|
+
|
6
|
+
module NdrPseudonymise
  module NdrEncrypt
    # Defines a local ndr_encrypt working copy: a directory of encrypted
    # objects plus a CSV index mapping git_blobid to path
    class Repository
      # rubocop:disable Style/SlicingWithRange
      CSV_COLUMNS = %w[git_blobid path].freeze
      ENCRYPTED_DIR = 'ndr_encrypted/'.freeze

      # repo_dir is the directory containing (or that will contain) the
      # ndr_encrypted/ store. Raises ArgumentError if repo_dir is missing.
      def initialize(repo_dir: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :repo_dir') unless repo_dir

        @repo_dir = repo_dir
      end

      # Create directory structure
      # Returns false if the store already had an index, otherwise writes a
      # header-only index and returns true.
      def init
        FileUtils.mkdir_p(object_dir)
        return false if valid_repository?

        CSV.open(index_filename, 'wb') { |csv| csv << CSV_COLUMNS }
        true
      end

      # Add file contents to the encrypted store and index
      # Each path is encrypted into the object store (unless already present)
      # and one [git_blobid, path] row is appended to the index.
      # Raises ArgumentError if key_name or pub_key is missing, or if the
      # store has not been initialised.
      def add(paths, key_name: nil, pub_key: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        paths.each do |path|
          git_blobid, _encrypted_id = hash_object(path,
                                                  key_name: key_name, pub_key: pub_key, write: true)
          File.open(index_filename, 'ab') { |f| f << [git_blobid, path].to_csv }
        end
      end

      # Cleanup unnecessary index entries and optimize the encrypted store
      # Validates every index row, then rewrites the index sorted and without
      # duplicate rows, reporting progress on output_stream.
      # Raises ArgumentError if the store, the index header or any index row
      # is invalid.
      def gc(output_stream: StringIO.new)
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        output_stream.print('Reading index: ')
        csv_data = CSV.read(index_filename)
        header = csv_data.shift
        raise(ArgumentError, 'Invalid header in index file') unless CSV_COLUMNS == header

        count0 = csv_data.size
        output_stream.print("#{count0} entries.\nRemoving duplicates: ")
        csv_data.each.with_index do |row, i|
          unless row.size == 2 && row[0] =~ /\A[0-9a-f]+\z/
            raise(ArgumentError, "Invalid index entry on data row #{i + 1}")
          end
        end
        csv_data = csv_data.sort.uniq
        count1 = csv_data.size
        output_stream.print("#{count1} entries remaining.\nWriting objects: ")
        replace_index(header, csv_data)
        output_stream.puts("100% (#{count1}/#{count1}), done.\n")
        output_stream.puts("Total #{count1} (delta #{count0 - count1})")
      end

      # Retrieve local file(s) based on CSV entry
      # Each requested path is looked up in the index, decrypted from the
      # object store and written to that path.
      # Raises ArgumentError if key_name or private_key is missing, if the
      # store is invalid, or if any requested path is not in the index.
      def get(paths, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key
        raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

        path_set = Set.new(paths)
        paths = path_set.to_a # Keep only unique entries
        found = Set.new # index may have duplicate objects if not garbage collected
        CSV.foreach(index_filename, headers: true) do |row|
          # Only keep first matching entry for each path
          if path_set.include?(row['path'])
            found << row
            path_set.delete(row['path'])
            break if path_set.empty?
          end
        end
        raise(ArgumentError, 'Cannot find some files') unless found.size == paths.size

        found.each do |row|
          data = cat_file(row['git_blobid'], key_name: key_name, private_key: private_key,
                                             passin: passin)
          File.open(row['path'], 'wb') { |f| f << data }
        end
      end

      # Compute object IDs and optionally creates an encrypted object from a file
      # The encrypted object is only written when write is truthy and no
      # object with the same encrypted_id exists yet.
      # Returns [git_blobid, encrypted_id]
      def hash_object(path, key_name: nil, pub_key: nil, write: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key

        data = File.binread(path)
        blob = NdrEncrypt::EncryptedObject.blob(data)
        git_blobid = NdrEncrypt::EncryptedObject.digest(blob)
        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        if write
          # Objects are sharded by the first two characters of their id
          encrypted_dir = File.join(object_dir, encrypted_id[0..1])
          encrypted_filename = File.join(encrypted_dir, encrypted_id[2..-1])
          unless File.exist?(encrypted_filename) # Don't override existing file
            contents = NdrEncrypt::EncryptedObject.compress(blob)
            encrypted_contents = NdrEncrypt::EncryptedObject.encrypt(contents, pub_key: pub_key)
            FileUtils.mkdir_p(encrypted_dir)
            File.open(encrypted_filename, 'wb') { |f| f << encrypted_contents }
          end
        end
        [git_blobid, encrypted_id]
      end

      # Retrieve local file(s) based on git_blobid
      # Returns the decrypted, decompressed and unpacked object data.
      # Raises ArgumentError if key_name or private_key is missing, or if the
      # object is not in the encrypted store.
      def cat_file(git_blobid, key_name: nil, private_key: nil, passin: nil)
        # We need to support ruby 2.0 so cannot use required keyword arguments syntax
        raise(ArgumentError, 'missing keyword: :key_name') unless key_name
        raise(ArgumentError, 'missing keyword: :private_key') unless private_key

        encrypted_id = NdrEncrypt::EncryptedObject.encrypted_id(git_blobid, key_name: key_name)
        encrypted_filename = File.join(object_dir, encrypted_id[0..1], encrypted_id[2..-1])
        unless File.exist?(encrypted_filename)
          raise(ArgumentError, 'File does not exist in encrypted storage')
        end

        rawdata = File.binread(encrypted_filename)
        contents = NdrEncrypt::EncryptedObject.decrypt(rawdata, private_key: private_key,
                                                                passin: passin)
        blob = NdrEncrypt::EncryptedObject.decompress(contents)
        NdrEncrypt::EncryptedObject.unpack_blob(blob)
      end

      private

      # Replace the index file with the given header and rows.
      # If writing the replacement fails, the original index is put back so
      # that the store remains valid, and the error is re-raised.
      def replace_index(header, rows)
        # Move aside index file temporarily to reduce race conditions
        # Note: should use a proper lock file for all index interactions
        orig_filename = "#{index_filename}.orig"
        temp_filename = "#{index_filename}.new"
        FileUtils.mv(index_filename, orig_filename)
        begin
          CSV.open(temp_filename, 'wb') do |csv|
            csv << header
            rows.each { |row| csv << row }
          end
          FileUtils.mv(temp_filename, index_filename)
        rescue StandardError
          FileUtils.mv(orig_filename, index_filename) unless File.exist?(index_filename)
          raise
        end
        FileUtils.rm(orig_filename)
      end

      # Does the repository have a valid structure
      def valid_repository?
        Dir.exist?(object_dir) && File.exist?(index_filename)
      end

      # Directory holding the encrypted objects
      def object_dir
        File.join(@repo_dir, ENCRYPTED_DIR, 'objects')
      end

      # Path of the CSV index file
      def index_filename
        File.join(@repo_dir, ENCRYPTED_DIR, 'index.csv')
      end
      # rubocop:enable Style/SlicingWithRange
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'ndr_pseudonymise/ndr_encrypt/command_line'
|
2
|
+
require 'ndr_pseudonymise/ndr_encrypt/encrypted_object'
|
3
|
+
require 'ndr_pseudonymise/ndr_encrypt/remote_repository'
|
4
|
+
require 'ndr_pseudonymise/ndr_encrypt/repository'
|
5
|
+
|
6
|
+
module NdrPseudonymise
  # Namespace for the encrypted object store utilities and its command line
  # tool; the implementation lives in the files required above.
  module NdrEncrypt; end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Fast, simple pseudonymisation of prescription data with a very controlled
|
2
|
+
# format.
|
3
|
+
# Only the first 2 fields are potentially identifiable: nhs number and date of
|
4
|
+
# birth.
|
5
|
+
|
6
|
+
require 'ndr_pseudonymise/simple_pseudonymisation'
|
7
|
+
require 'ndr_pseudonymise/pseudonymisation_specification'
|
8
|
+
|
9
|
+
require 'json'
|
10
|
+
|
11
|
+
module NdrPseudonymise
  # Pseudonymise prescription data
  class PrescriptionPseudonymiser < PseudonymisationSpecification
    PREAMBLE_V2_DEMOG_ONLY = 'Pseudonymised matching data v2.0-demog-only'.freeze

    # Raises RuntimeError unless the format specification declares the
    # demographics to be exactly the first 2 columns.
    def initialize(format_spec, key_bundle)
      super
      return if @format_spec[:demographics] == [0, 1]

      raise 'Invalid specification: expected nhsnumber and birthdate in first 2 columns'
    end

    # Validate a row of prescription data
    # Returns nil if the first 2 fields are valid; raises RuntimeError
    # describing the first problem found otherwise.
    # Anchors use \z rather than \Z so that a value with a trailing newline
    # is rejected instead of being accepted.
    def row_errors2(row)
      # Not significantly faster than optimised general #row_errors method
      (nhsnumber, birthdate) = row[0..1]
      raise 'Invalid NHS number' unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/
      raise 'Missing NHS number' if nhsnumber.size < 10

      # Birthdate may be blank, otherwise it must look like YYYY-MM-DD
      unless birthdate.is_a?(String) &&
             birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\z/
        raise 'Invalid birthdate'
      end
    end

    # Pseudonymise a row of prescription data, returning an array of a single row:
    # [[packed_pseudoid_and_demographics, clinical_data1, ...]]
    # Where packed_pseudoid_and_demographics consists of
    # "pseudo_id1 (key_bundle) encrypted_demographics"
    def pseudonymise_row(row)
      @key_cache ||= {} # Cache pseudonymisation keys for more compact import
      all_demographics = { 'nhsnumber' => row[0], 'birthdate' => row[1] }
      # The JSON form is both the cache key and the plaintext to be encrypted
      demographics_json = all_demographics.to_json
      if @key_cache.key?(demographics_json)
        pseudo_id1, key_bundle, demog_key = @key_cache[demographics_json]
      else
        pseudo_id1, key_bundle, demog_key =
          NdrPseudonymise::SimplePseudonymisation.generate_keys_nhsnumber_demog_only(
            @salt1, @salt2, row[0]
          )
        # Only cache when both fields are present (add "&& false" to stop caching)
        if !row[0].to_s.empty? && !row[1].to_s.empty?
          @key_cache = {} if @key_cache.size > 10000 # Limit cache size
          @key_cache[demographics_json] = [pseudo_id1, key_bundle, demog_key]
        end
      end
      encrypted_demographics =
        NdrPseudonymise::SimplePseudonymisation.encrypt_data64(demog_key, demographics_json)
      packed_pseudoid_and_demographics = format('%s (%s) %s', pseudo_id1, key_bundle,
                                                encrypted_demographics)
      [[packed_pseudoid_and_demographics] + row[2..-1]]
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V2_DEMOG_ONLY]
    end

    # Append the output of pseudonymise_row to a CSV file
    def emit_csv_rows(out_csv, pseudonymised_row)
      out_csv << pseudonymised_row[0]
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module NdrPseudonymise
  # Log percentage progress on pseudonymisation
  # Starts logging after 1 minute or 5%, then at 5% / 5 minute intervals
  class ProgressPrinter
    # Logs progress to the given stream (default $stdout)
    # If verbose = false, only log percentages on a single line
    # If verbose = true, log verbose output
    # If verbose = :dynamic, act like verbose = false, but if the total time is
    # more than 5 minutes, move into verbose = true mode
    def initialize(dest = $stdout, verbose = false)
      @dest = dest
      @verbose = verbose
      @last_percent = 0
      @last_log = Time.current - (60 * 4) # First log entry after 1 minute
    end

    # Prints progress to the destination stream when a new 5% band is
    # reached, when 5 minutes have passed since the last entry, or at 100%.
    # start_time and time_now are the caller's start and current times;
    # progress and total are in the same unit. Parameter _csv_row is not used.
    # In verbose mode the entry includes an expected completion time,
    # extrapolated from the rate of progress so far.
    def log_progress(start_time, time_now, _csv_row, progress, total)
      current_percentage = total == 0 ? 0 : (progress * 100 / total).to_i
      now = Time.current
      due = (current_percentage / 5 > @last_percent / 5) || # Log at 5% / 5 minute intervals
            (now - @last_log >= 60 * 5) || current_percentage == 100
      return unless due

      if @verbose == :dynamic && (time_now - start_time >= 60 * 5)
        @verbose = true
        @dest << '...' # Separate the verbose entry from the percentages already printed
      end
      if @verbose == true
        # No estimate is possible until some progress has been made
        tfin = if progress > 0
                 time_now + (time_now - start_time) * (total - progress) / progress
               end
        completion = tfin ? ", expected completion #{tfin}" : ''
        @dest << format("Completed %s%% in %.1f minutes%s\n",
                        current_percentage, (now - start_time) / 60.0, completion)
      else
        @dest << "#{'...' if @last_percent > 0}#{current_percentage}%"
        @dest << "\n" if current_percentage == 100
      end
      # if current_percentage == 100 # Uncomment for performance debugging
      #   @dest << "Finished %s rows in %.3f secs\n" % [csv_row, time_now - start_time]
      # end
      @dest.flush
      @last_percent = current_percentage
      @last_log = now
    end
  end
end
|