ndr_pseudonymise 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +64 -0
- data/Rakefile +14 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/ndr_pseudonymise/client.rb +115 -0
- data/lib/ndr_pseudonymise/demographics_only_pseudonymiser.rb +88 -0
- data/lib/ndr_pseudonymise/engine.rb +6 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/command_line.rb +194 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/encrypted_object.rb +124 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/remote_repository.rb +44 -0
- data/lib/ndr_pseudonymise/ndr_encrypt/repository.rb +165 -0
- data/lib/ndr_pseudonymise/ndr_encrypt.rb +10 -0
- data/lib/ndr_pseudonymise/prescription_pseudonymiser.rb +71 -0
- data/lib/ndr_pseudonymise/progress_printer.rb +53 -0
- data/lib/ndr_pseudonymise/pseudonymisation_specification.rb +379 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_converter.rb +92 -0
- data/lib/ndr_pseudonymise/pseudonymised_file_wrapper.rb +96 -0
- data/lib/ndr_pseudonymise/simple_pseudonymisation.rb +125 -0
- data/lib/ndr_pseudonymise/version.rb +3 -0
- data/lib/ndr_pseudonymise.rb +16 -0
- data/lib/rsa_aes_cbc.rb +114 -0
- data/ndr_pseudonymise.gemspec +36 -0
- data/script/ndr_encrypt/README.md +154 -0
- data/script/ndr_encrypt/ndr_encrypt +4 -0
- metadata +197 -0
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'digest'
|
2
|
+
require 'io/console'
|
3
|
+
require 'openssl'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
module NdrPseudonymise
|
7
|
+
module NdrEncrypt
|
8
|
+
# Defines utility methods for encrypting / decrypting objects
|
9
|
+
module EncryptedObject
|
10
|
+
# rubocop:disable Style/SlicingWithRange
|
11
|
+
# Wraps data in a blob envelope of the form "blob <size>\0<data>".
# The size written is data.size.
# NOTE(review): String#size counts characters, so it only equals the byte length
# for binary / single-byte data - confirm callers never pass multibyte text.
def self.blob(data)
  header = "blob #{data.size}"
  "#{header}\0#{data}"
end
|
14
|
+
|
15
|
+
# Reverses .blob: checks the "blob <size>\0" envelope and returns the wrapped data.
# Raises ArgumentError ('Invalid blob format') if the prefix is not "blob <digits>"
# or the NUL separator is missing, and ArgumentError ('Incorrect blob size') if the
# declared size differs from data.size.
def self.unpack_blob(blob)
  prefix, data = blob.split("\x00", 2)
  raise(ArgumentError, 'Invalid blob format') unless /\Ablob [0-9]+\z/ =~ prefix
  # Without a NUL separator, split yields no second element (e.g. for "blob 0")
  raise(ArgumentError, 'Invalid blob format') if data.nil?

  size = prefix[5..-1].to_i
  raise(ArgumentError, 'Incorrect blob size') unless size == data.size

  data
end
|
24
|
+
|
25
|
+
# Returns the SHA-256 digest of blob as a hex string
def self.digest(blob)
  hasher = Digest::SHA256.new
  hasher << blob
  hasher.hexdigest
end
|
28
|
+
|
29
|
+
# Returns blob deflated with zlib (default compression level)
def self.compress(blob)
  deflated = Zlib.deflate(blob)
  deflated
end
|
33
|
+
|
34
|
+
# Returns the inflated form of zlib-compressed contents
def self.decompress(contents)
  inflated = Zlib.inflate(contents)
  inflated
end
|
38
|
+
|
39
|
+
# Derives the ID an object is stored under for a given key name: the digest of a
# blob wrapping the text "ndr_encrypt <git_blobid> <key_name>".
# Raises ArgumentError if key_name is not supplied.
def self.encrypted_id(git_blobid, key_name: nil)
  # key_name is mandatory, but ruby 2.0 has no required keyword arguments syntax
  key_name || raise(ArgumentError, 'missing keyword: :key_name')

  wrapped = blob("ndr_encrypt #{git_blobid} #{key_name}")
  digest(wrapped)
end
|
46
|
+
|
47
|
+
# Encrypt sensitive secret data, given the path of a public key file as a String.
# A random AES-256-CBC key and IV are generated on every call. The return value is
# the RSA-encrypted AES key, followed by the IV, followed by the AES ciphertext.
# Returns nil if secret_data is nil or false; raises ArgumentError without pub_key.
# The result can be decrypted using the decrypt method on this module.
# TODO: write equivalent command-line method using only openssl and shell scripts
def self.encrypt(secret_data, pub_key: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
  return nil unless secret_data

  rsa = OpenSSL::PKey::RSA.new(File.read(pub_key))
  aes = OpenSSL::Cipher.new('aes-256-cbc')
  aes.encrypt
  session_key = aes.random_key
  session_iv = aes.random_iv
  aes.key = session_key
  aes.iv = session_iv
  ciphertext = aes.update(secret_data) + aes.final
  rsa.public_encrypt(session_key) + session_iv + ciphertext
end
|
66
|
+
|
67
|
+
# Decrypt sensitive secret data, given the path of a private key file and a
# passin option from which the key's passphrase is obtained (see get_passphrase).
# rawdata is expected in the layout produced by encrypt: RSA-encrypted AES key
# (as long as the RSA modulus), then a 16 byte IV, then the AES-256-CBC ciphertext.
# Returns the decrypted data, or nil if rawdata is nil or false.
# TODO: write equivalent command-line method using only openssl and shell scripts
# TODO: Refactor with code from era UnifiedSources::ApiRetrieval::Extractor
def self.decrypt(rawdata, private_key: nil, passin: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :private_key') unless private_key
  return nil unless rawdata

  passphrase = get_passphrase(private_key: private_key, passin: passin)
  rsa = OpenSSL::PKey::RSA.new(File.read(private_key), passphrase)
  wrapped_len = rsa.n.num_bytes
  aes = OpenSSL::Cipher.new('aes-256-cbc')
  aes.decrypt
  aes.key = rsa.private_decrypt(rawdata[0, wrapped_len])
  aes.iv = rawdata[wrapped_len, 16]
  plaintext = aes.update(rawdata[(wrapped_len + 16)..-1])
  plaintext << aes.final
end
|
87
|
+
|
88
|
+
# Returns the passphrase for the private key file at path private_key.
# The first answer obtained for a given private_key is cached and returned on later
# calls, whatever passin is given then.
# passin follows a subset of the openssl -passin options in
# https://www.openssl.org/docs/man3.0/man1/openssl-passphrase-options.html
#   nil or ''  - prompt for the passphrase without echo
#   pass:xyz   - the passphrase is the text after "pass:"
#   env:NAME   - the passphrase is the value of environment variable NAME
#   stdin      - the passphrase is the next line read from $stdin
# Raises ArgumentError if private_key is not given, the key file does not exist, or
# passin is any other value.
def self.get_passphrase(private_key: nil, passin: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :private_key') unless private_key

  known = (@passphrase_cache ||= {})
  return known[private_key] if known.key?(private_key)
  raise(ArgumentError, 'Missing private key file') unless File.exist?(private_key)

  known[private_key] =
    case passin
    when 'stdin'
      $stdin.readline.chomp
    when /\Aenv:/
      ENV[passin[4..-1]]
    when /\Apass:/
      passin[5..-1]
    when nil, ''
      prompt = "Enter passphrase for #{private_key}: "
      if IO.console.respond_to?(:getpass)
        IO.console.getpass prompt
      else
        # Fallback without IO#getpass: print the prompt, read with echo off
        $stdout.print prompt
        typed = $stdin.noecho(&:gets).chomp
        puts
        typed
      end
    else
      raise(ArgumentError, 'Unsupported passin option')
    end
end
|
121
|
+
# rubocop:enable Style/SlicingWithRange
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module NdrPseudonymise
|
5
|
+
module NdrEncrypt
|
6
|
+
# Retrieves and decrypts objects from a remote ndr_encrypt encrypted store over HTTP
|
7
|
+
class RemoteRepository
|
8
|
+
# rubocop:disable Style/SlicingWithRange
|
9
|
+
# base_url: base URL that encrypted object paths are resolved against (mandatory).
# Raises ArgumentError if base_url is not supplied.
def initialize(base_url: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  @base_url = base_url || raise(ArgumentError, 'missing keyword: :base_url')
end
|
15
|
+
|
16
|
+
# Retrieve the contents of a remote file based on its git_blobid.
# The encrypted ID is derived from git_blobid and key_name, the object is fetched
# from the remote store, then decrypted, decompressed and unwrapped from its blob.
# Raises ArgumentError if key_name or private_key is not supplied.
def cat_remote(git_blobid, key_name: nil, private_key: nil, passin: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :key_name') unless key_name
  raise(ArgumentError, 'missing keyword: :private_key') unless private_key

  codec = NdrEncrypt::EncryptedObject
  remote_id = codec.encrypted_id(git_blobid, key_name: key_name)
  ciphertext = retrieve_remote_url(remote_id)
  compressed = codec.decrypt(ciphertext, private_key: private_key, passin: passin)
  codec.unpack_blob(codec.decompress(compressed))
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
# Retrieve the remote encrypted file stored under encrypted_id.
# The first 2 characters of the ID form the directory and the remainder the file
# name; that relative path is resolved against @base_url with URI.join.
# Returns the response body; raises ArgumentError unless the response is a
# Net::HTTPSuccess.
def retrieve_remote_url(encrypted_id)
  dir_part = encrypted_id[0..1]
  file_part = encrypted_id[2..-1]
  response = Net::HTTP.get_response(URI.join(@base_url, "#{dir_part}/#{file_part}"))
  # TODO: More finegrained error messages
  return response.body if response.is_a?(Net::HTTPSuccess)

  raise(ArgumentError, 'Could not retrieve URL')
end
|
41
|
+
# rubocop:enable Style/SlicingWithRange
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'set'
|
4
|
+
require 'stringio'
|
5
|
+
|
6
|
+
module NdrPseudonymise
|
7
|
+
module NdrEncrypt
|
8
|
+
# Defines a local ndr_encrypt working copy
|
9
|
+
class Repository
|
10
|
+
# rubocop:disable Style/SlicingWithRange
|
11
|
+
# Header row of the index CSV file
CSV_COLUMNS = %w(git_blobid path).freeze
# Directory, below the repository directory, holding the index file and the objects
ENCRYPTED_DIR = 'ndr_encrypted/'.freeze
|
13
|
+
|
14
|
+
# repo_dir: directory under which the encrypted store lives (mandatory).
# Raises ArgumentError if repo_dir is not supplied.
def initialize(repo_dir: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  @repo_dir = repo_dir || raise(ArgumentError, 'missing keyword: :repo_dir')
end
|
20
|
+
|
21
|
+
# Create the directory structure: the object directory and, unless the repository
# is already valid, an index file containing only the header row.
# Returns false if the repository was already valid, true if the index was created.
def init
  FileUtils.mkdir_p(object_dir)
  if valid_repository?
    false
  else
    CSV.open(index_filename, 'wb') { |index| index << CSV_COLUMNS }
    true
  end
end
|
29
|
+
|
30
|
+
# Add file contents to the encrypted store and index.
# Each path is written to the store via hash_object, then a "git_blobid,path" line
# is appended to the index file.
# Raises ArgumentError if key_name or pub_key is missing, or the store is invalid.
def add(paths, key_name: nil, pub_key: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :key_name') unless key_name
  raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key
  raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

  paths.each do |path|
    blob_id, = hash_object(path, key_name: key_name, pub_key: pub_key, write: true)
    File.open(index_filename, 'ab') do |index|
      index << [blob_id, path].to_csv
    end
  end
end
|
43
|
+
|
44
|
+
# Cleanup unnecessary index entries and optimize the encrypted store.
# Reads the whole index, validates every entry, then rewrites the index sorted and
# with duplicate rows removed. Progress messages are written to output_stream.
# Raises ArgumentError if the store, the index header or any index entry is invalid.
# If rewriting the index fails, the original index file is put back before the error
# propagates, so the store is not left without an index.
def gc(output_stream: StringIO.new)
  raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

  output_stream.print('Reading index: ')
  csv_data = CSV.read(index_filename)
  header = csv_data.shift
  raise(ArgumentError, 'Invalid header in index file') unless CSV_COLUMNS == header

  count0 = csv_data.size
  output_stream.print("#{count0} entries.\nRemoving duplicates: ")
  csv_data.each.with_index do |row, i|
    unless row.size == 2 && row[0] =~ /\A[0-9a-f]+\z/
      raise(ArgumentError, "Invalid index entry on data row #{i + 1}")
    end
  end
  csv_data = csv_data.sort.uniq
  count1 = csv_data.size
  output_stream.print("#{count1} entries remaining.\nWriting objects: ")
  # Move aside index file temporarily to reduce race conditions
  # Note: should use a proper lock file for all index interactions
  orig_filename = "#{index_filename}.orig"
  temp_filename = "#{index_filename}.new"
  FileUtils.mv(index_filename, orig_filename)
  rewritten = false
  begin
    CSV.open(temp_filename, 'wb') do |csv|
      csv << header
      csv_data.each { |row| csv << row }
    end
    FileUtils.mv(temp_filename, index_filename)
    rewritten = true
  ensure
    unless rewritten
      # Rewrite failed or was interrupted: discard any partial output and
      # reinstate the original index
      FileUtils.rm_f(temp_filename)
      FileUtils.mv(orig_filename, index_filename)
    end
  end
  FileUtils.rm(orig_filename)
  output_stream.puts("100% (#{count1}/#{count1}), done.\n")
  output_stream.puts("Total #{count1} (delta #{count0 - count1})")
end
|
77
|
+
|
78
|
+
# Retrieve local file(s) based on their index CSV entries.
# For each requested path the first matching index entry is used; its object is
# fetched with cat_file and written to that path.
# Raises ArgumentError if key_name or private_key is missing, the store is invalid,
# or any requested path has no index entry (nothing is written in that case).
def get(paths, key_name: nil, private_key: nil, passin: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :key_name') unless key_name
  raise(ArgumentError, 'missing keyword: :private_key') unless private_key
  raise(ArgumentError, 'Invalid ndr_encrypted encrypted store') unless valid_repository?

  wanted = Set.new(paths) # Keep only unique entries
  wanted_count = wanted.size
  matches = Set.new # index may have duplicate objects if not garbage collected
  CSV.foreach(index_filename, headers: true) do |entry|
    # Only keep first matching entry for each path
    next unless wanted.include?(entry['path'])

    matches << entry
    wanted.delete(entry['path'])
    break if wanted.empty?
  end
  raise(ArgumentError, 'Cannot find some files') unless matches.size == wanted_count

  matches.each do |entry|
    contents = cat_file(entry['git_blobid'], key_name: key_name, private_key: private_key,
                                             passin: passin)
    File.open(entry['path'], 'wb') { |f| f << contents }
  end
end
|
104
|
+
|
105
|
+
# Compute object IDs and optionally create an encrypted object from a file.
# The file at path is read in binary mode and wrapped as a blob; git_blobid is the
# digest of that blob and encrypted_id is derived from git_blobid and key_name.
# If write is truthy and no object exists yet under encrypted_id, the blob is
# compressed, encrypted with pub_key and written below the object directory.
# Returns [git_blobid, encrypted_id]
def hash_object(path, key_name: nil, pub_key: nil, write: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :key_name') unless key_name
  raise(ArgumentError, 'missing keyword: :pub_key') unless pub_key

  codec = NdrEncrypt::EncryptedObject
  blob = codec.blob(File.binread(path))
  git_blobid = codec.digest(blob)
  encrypted_id = codec.encrypted_id(git_blobid, key_name: key_name)
  ids = [git_blobid, encrypted_id]
  return ids unless write

  encrypted_dir = File.join(object_dir, encrypted_id[0..1])
  encrypted_filename = File.join(encrypted_dir, encrypted_id[2..-1])
  return ids if File.exist?(encrypted_filename) # Don't override existing file

  encrypted_contents = codec.encrypt(codec.compress(blob), pub_key: pub_key)
  FileUtils.mkdir_p(encrypted_dir)
  File.open(encrypted_filename, 'wb') { |f| f << encrypted_contents }
  ids
end
|
128
|
+
|
129
|
+
# Retrieve the contents of a local file based on its git_blobid.
# The encrypted object is located below the object directory from the encrypted ID
# (first 2 characters as directory, remainder as file name), then decrypted,
# decompressed and unwrapped from its blob.
# Raises ArgumentError if key_name or private_key is missing, or no such object is
# stored.
def cat_file(git_blobid, key_name: nil, private_key: nil, passin: nil)
  # We need to support ruby 2.0 so cannot use required keyword arguments syntax
  raise(ArgumentError, 'missing keyword: :key_name') unless key_name
  raise(ArgumentError, 'missing keyword: :private_key') unless private_key

  codec = NdrEncrypt::EncryptedObject
  encrypted_id = codec.encrypted_id(git_blobid, key_name: key_name)
  stored_at = File.join(object_dir, encrypted_id[0..1], encrypted_id[2..-1])
  raise(ArgumentError, 'File does not exist in encrypted storage') unless File.exist?(stored_at)

  compressed = codec.decrypt(File.binread(stored_at), private_key: private_key,
                                                      passin: passin)
  codec.unpack_blob(codec.decompress(compressed))
end
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
# Does the repository have a valid structure, i.e. do both the object directory
# and the index file exist?
def valid_repository?
  return false unless Dir.exist?(object_dir)

  File.exist?(index_filename)
end
|
154
|
+
|
155
|
+
# Path of the directory holding the encrypted objects
def object_dir
  store_root = File.join(@repo_dir, ENCRYPTED_DIR)
  File.join(store_root, 'objects')
end
|
158
|
+
|
159
|
+
# Path of the CSV index file
def index_filename
  store_root = File.join(@repo_dir, ENCRYPTED_DIR)
  File.join(store_root, 'index.csv')
end
|
162
|
+
# rubocop:enable Style/SlicingWithRange
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'ndr_pseudonymise/ndr_encrypt/command_line'
|
2
|
+
require 'ndr_pseudonymise/ndr_encrypt/encrypted_object'
|
3
|
+
require 'ndr_pseudonymise/ndr_encrypt/remote_repository'
|
4
|
+
require 'ndr_pseudonymise/ndr_encrypt/repository'
|
5
|
+
|
6
|
+
module NdrPseudonymise
  # Namespace for the encrypted object store utilities and command line tool
  module NdrEncrypt; end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Fast, simple pseudonymisation of prescription data with a very controlled
|
2
|
+
# format.
|
3
|
+
# Only the first 2 fields are potentially identifiable: nhs number and date of
|
4
|
+
# birth.
|
5
|
+
|
6
|
+
require 'ndr_pseudonymise/simple_pseudonymisation'
|
7
|
+
require 'ndr_pseudonymise/pseudonymisation_specification'
|
8
|
+
|
9
|
+
require 'json'
|
10
|
+
|
11
|
+
module NdrPseudonymise
|
12
|
+
# Pseudonymise prescription data
|
13
|
+
class PrescriptionPseudonymiser < PseudonymisationSpecification
|
14
|
+
# Sole cell of the header row returned by #csv_header_row
PREAMBLE_V2_DEMOG_ONLY = 'Pseudonymised matching data v2.0-demog-only'.freeze
|
15
|
+
|
16
|
+
# Passes both arguments on to the superclass constructor, then checks that the
# :demographics entry of @format_spec is [0, 1].
# Raises RuntimeError for any other specification.
def initialize(format_spec, key_bundle)
  super
  unless @format_spec[:demographics] == [0, 1]
    raise 'Invalid specification: expected nhsnumber and birthdate in first 2 columns'
  end
end
|
21
|
+
|
22
|
+
# Validate the first 2 fields (NHS number, birthdate) of a row of prescription data.
# Returns nil if they are valid; otherwise raises a RuntimeError naming the first
# problem found ('Invalid NHS number', 'Missing NHS number' or 'Invalid birthdate').
# The NHS number must be a String of exactly 10 digits. The birthdate must be a
# String that is either empty or of the digit form YYYY-MM-DD.
def row_errors2(row)
  # Not significantly faster than optimised general #row_errors method
  (nhsnumber, birthdate) = row[0..1]
  # Anchored with \z rather than \Z, so a value with a trailing newline is rejected
  unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/
    raise 'Invalid NHS number'
  end
  raise 'Missing NHS number' if nhsnumber.size < 10
  unless birthdate.is_a?(String) && birthdate =~ /\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\z/
    raise 'Invalid birthdate'
  end
end
|
35
|
+
|
36
|
+
# Pseudonymise a row of prescription data, returning an array of a single row:
# [[packed_pseudoid_and_demographics, clinical_data1, ...]]
# Where packed_pseudoid_and_demographics consists of
# "pseudo_id1 (key_bundle) encrypted_demographics"
# and encrypted_demographics is the JSON of the row's first 2 fields (nhsnumber,
# birthdate), encrypted with the demographics key.
# Generated keys are cached per nhsnumber / birthdate pair when both are non-empty.
def pseudonymise_row(row)
  @key_cache ||= {} # Cache pseudonymisation keys for more compact import
  demographics_json = { 'nhsnumber' => row[0], 'birthdate' => row[1] }.to_json
  pseudo_id1, key_bundle, demog_key = @key_cache.fetch(demographics_json) do
    fresh = NdrPseudonymise::SimplePseudonymisation.
            generate_keys_nhsnumber_demog_only(@salt1, @salt2, row[0])
    unless row[0].to_s.empty? || row[1].to_s.empty?
      @key_cache = {} if @key_cache.size > 10000 # Limit cache size
      id_part, bundle_part, key_part = fresh
      @key_cache[demographics_json] = [id_part, bundle_part, key_part]
    end
    fresh
  end
  encrypted_demographics = NdrPseudonymise::SimplePseudonymisation.
                           encrypt_data64(demog_key, demographics_json)
  packed = format('%s (%s) %s', pseudo_id1, key_bundle, encrypted_demographics)
  [[packed] + row[2..-1]]
end
|
60
|
+
|
61
|
+
# Header row for CSV data: a single cell holding PREAMBLE_V2_DEMOG_ONLY
def csv_header_row
  Array.new(1, PREAMBLE_V2_DEMOG_ONLY)
end
|
65
|
+
|
66
|
+
# Append the output of pseudonymise_row to a CSV file.
# Only the first element of pseudonymised_row (its single row) is appended.
def emit_csv_rows(out_csv, pseudonymised_row)
  single_row = pseudonymised_row[0]
  out_csv << single_row
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module NdrPseudonymise
|
2
|
+
# Log percentage progress on pseudonymisation
|
3
|
+
# Starts logging after 1 minute or 5%, then at 5% / 5 minute intervals
|
4
|
+
class ProgressPrinter
|
5
|
+
# Logs progress to the given stream (default $stdout)
# If verbose = false, only log percentages on a single line
# If verbose = true, log verbose output
# If verbose = :dynamic, act like verbose = false, but if the total time is
# more than 5 minutes, move into verbose = true mode
def initialize(dest = $stdout, verbose = false)
  @dest = dest
  @verbose = verbose
  @last_percent = 0
  # Time.current is an ActiveSupport extension; use Time.now when it is not loaded
  now = Time.respond_to?(:current) ? Time.current : Time.now
  @last_log = now - (60 * 4) # First log entry after 1 minute
end
|
16
|
+
|
17
|
+
# Prints progress to @dest when the percentage has crossed a 5% boundary since the
# last entry, 5 minutes have passed since the last entry, or progress reached 100%.
# start_time: when processing started; time_now: the caller's current time
# parameter _csv_row is not used.
# progress / total: amount of work done so far / overall
# In verbose mode each entry is a full line including the expected completion time
# (when progress > 0); otherwise percentages are appended to a single line.
# Returns nil when nothing was logged.
def log_progress(start_time, time_now, _csv_row, progress, total)
  current_percentage = total == 0 ? 0 : (progress * 100 / total).to_i
  # Time.current is an ActiveSupport extension; use Time.now when it is not loaded
  now = Time.respond_to?(:current) ? Time.current : Time.now
  if (current_percentage / 5 > @last_percent / 5) || # Log at 5% / 5 minute intervals
     (now - @last_log >= 60 * 5) || current_percentage == 100
    if @verbose == :dynamic && (time_now - start_time >= 60 * 5)
      # Long-running job: switch to one full line per entry from now on
      @verbose = true
      @dest << '...'
    end
    if @verbose == true
      # Extrapolate the finish time from the average rate so far
      tfin = if progress > 0
               time_now + (time_now - start_time) * (total - progress) / progress
             end
      completion = tfin ? ", expected completion #{tfin}" : ''
      @dest << format("Completed %s%% in %.1f minutes%s\n",
                      current_percentage, (now - start_time) / 60.0, completion)
    else
      @dest << "#{'...' if @last_percent > 0}#{current_percentage}%"
      @dest << "\n" if current_percentage == 100
    end
    # if current_percentage == 100 # Uncomment for performance debugging
    #   @dest << "Finished %s rows in %.3f secs\n" % [csv_row, time_now - start_time]
    # end
    @dest.flush
    @last_percent = current_percentage
    @last_log = now
  end
end
|
52
|
+
end
|
53
|
+
end
|