data-anonymization 0.5.0 → 0.5.1.rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +7 -2
- data/examples/whitelist_dsl.rb +14 -10
- data/lib/core/table_errors.rb +2 -1
- data/lib/strategy/base.rb +36 -9
- data/lib/utils/database.rb +4 -3
- data/lib/version.rb +1 -1
- data/spec/acceptance/rdbms_whitelist_spec.rb +2 -3
- data/spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb +55 -0
- data/spec/support/customer_sample.rb +1 -1
- metadata +7 -5
data/README.md
CHANGED
@@ -68,14 +68,14 @@ Postgresql database having **composite primary key**
|
|
68
68
|
|
69
69
|
## Changelog
|
70
70
|
|
71
|
-
#### 0.5.0
|
71
|
+
#### 0.5.0 (Sep 28, 2012)
|
72
72
|
|
73
73
|
Major changes:
|
74
74
|
|
75
75
|
1. MongoDB support
|
76
76
|
2. Command line utility to generate whitelist DSL for RDBMS & MongoDB (reduces pain for writing whitelist dsl)
|
77
77
|
3. Added support for reporting fields missing mapping in case of whitelist
|
78
|
-
4. Errors reported at the end of process. Job doesn't fail for a single error.
|
78
|
+
4. Errors reported at the end of process. Job doesn't fail for a single error; it fails if more than 100 records fail during anonymization.
|
79
79
|
|
80
80
|
|
81
81
|
Please see the [Github 0.5.0 milestone page](https://github.com/sunitparekh/data-anonymization/issues?milestone=2&state=open) for more details on changes/fixes in release 0.5.0
|
@@ -115,6 +115,8 @@ For almost all projects there is a need for production data dump in order to run
|
|
115
115
|
However, getting production data and using it is not feasible due to multiple reasons, primary being privacy concerns for user data. And thus the need for data anonymization.
|
116
116
|
This tool helps you to get anonymized production data dump using either Blacklist or Whitelist strategies.
|
117
117
|
|
118
|
+
Read more about [data anonymization here](http://sunitspace.blogspot.in/2012/09/data-anonymization.html)
|
119
|
+
|
118
120
|
## Anonymization Strategies
|
119
121
|
|
120
122
|
### Blacklist
|
@@ -147,6 +149,9 @@ database 'DatabaseName' do
|
|
147
149
|
end
|
148
150
|
```
|
149
151
|
|
152
|
+
Read more about [blacklist and whitelist here](http://sunitspace.blogspot.in/2012/09/data-anonymization-blacklist-whitelist.html)
|
153
|
+
|
154
|
+
|
150
155
|
## Tips
|
151
156
|
|
152
157
|
1. In Whitelist approach make source database connection READONLY.
|
data/examples/whitelist_dsl.rb
CHANGED
@@ -26,38 +26,42 @@ database 'Chinook' do
|
|
26
26
|
|
27
27
|
end
|
28
28
|
|
29
|
-
table '
|
30
|
-
primary_key '
|
31
|
-
|
32
|
-
|
29
|
+
table 'Customer' do
|
30
|
+
primary_key 'CustomerId'
|
31
|
+
batch_size 5 # batch_size works only if the primary_key is defined for the table
|
32
|
+
|
33
|
+
whitelist 'CustomerId', 'SupportRepId', 'Company'
|
34
|
+
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
33
35
|
anonymize('FirstName').using FieldStrategy::RandomFirstName.new
|
34
36
|
anonymize('LastName').using FieldStrategy::RandomLastName.new
|
35
|
-
anonymize('HireDate').using FieldStrategy::DateTimeDelta.new(2, 0)
|
36
37
|
anonymize('Address').using FieldStrategy::RandomAddress.region_US
|
37
38
|
anonymize('City').using FieldStrategy::RandomCity.region_US
|
38
39
|
anonymize('State').using FieldStrategy::RandomProvince.region_US
|
39
40
|
anonymize('PostalCode').using FieldStrategy::RandomZipcode.region_US
|
40
41
|
anonymize('Country') {|field| "USA" }
|
41
|
-
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
42
42
|
anonymize('Fax').using FieldStrategy::RandomPhoneNumber.new
|
43
43
|
anonymize('Email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
44
44
|
end
|
45
45
|
|
46
|
-
table '
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
table 'Employee' do
|
47
|
+
batch_size 5 # this won't work since there is no 'primary_key' defined
|
48
|
+
|
49
|
+
whitelist 'EmployeeId', 'ReportsTo', 'Title'
|
50
|
+
anonymize('BirthDate').using FieldStrategy::DateTimeDelta.new(1, 1)
|
50
51
|
anonymize('FirstName').using FieldStrategy::RandomFirstName.new
|
51
52
|
anonymize('LastName').using FieldStrategy::RandomLastName.new
|
53
|
+
anonymize('HireDate').using FieldStrategy::DateTimeDelta.new(2, 0)
|
52
54
|
anonymize('Address').using FieldStrategy::RandomAddress.region_US
|
53
55
|
anonymize('City').using FieldStrategy::RandomCity.region_US
|
54
56
|
anonymize('State').using FieldStrategy::RandomProvince.region_US
|
55
57
|
anonymize('PostalCode').using FieldStrategy::RandomZipcode.region_US
|
56
58
|
anonymize('Country') {|field| "USA" }
|
59
|
+
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
57
60
|
anonymize('Fax').using FieldStrategy::RandomPhoneNumber.new
|
58
61
|
anonymize('Email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
59
62
|
end
|
60
63
|
|
61
64
|
|
65
|
+
|
62
66
|
end
|
63
67
|
|
data/lib/core/table_errors.rb
CHANGED
@@ -20,9 +20,10 @@ module DataAnon
|
|
20
20
|
|
21
21
|
def print
|
22
22
|
return if @errors.length == 0
|
23
|
-
logger.error("Errors while processing table #{@table_name}:")
|
23
|
+
logger.error("Errors while processing table '#{@table_name}':")
|
24
24
|
@errors.each do |error|
|
25
25
|
logger.error(error[:exception])
|
26
|
+
logger.error(error[:exception].backtrace.join("\n\t"))
|
26
27
|
end
|
27
28
|
end
|
28
29
|
|
data/lib/strategy/base.rb
CHANGED
@@ -13,6 +13,7 @@ module DataAnon
|
|
13
13
|
@destination_database = destination_database
|
14
14
|
@fields_missing_strategy = DataAnon::Core::FieldsMissingStrategy.new name
|
15
15
|
@errors = DataAnon::Core::TableErrors.new(@name)
|
16
|
+
@primary_keys = []
|
16
17
|
end
|
17
18
|
|
18
19
|
def self.whitelist?
|
@@ -28,6 +29,10 @@ module DataAnon
|
|
28
29
|
@primary_keys = fields
|
29
30
|
end
|
30
31
|
|
32
|
+
def batch_size size
|
33
|
+
@batch_size = size
|
34
|
+
end
|
35
|
+
|
31
36
|
def whitelist *fields
|
32
37
|
fields.each { |f| @fields[f.downcase] = DataAnon::Strategy::Field::Whitelist.new }
|
33
38
|
end
|
@@ -73,21 +78,43 @@ module DataAnon
|
|
73
78
|
logger.debug "Processing table #{@name} with fields strategies #{@fields}"
|
74
79
|
total = source_table.count
|
75
80
|
if total > 0
|
76
|
-
index = 0
|
77
81
|
progress = progress_bar.new(@name, total)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
rescue => exception
|
83
|
-
@errors.log_error record, exception
|
84
|
-
end
|
85
|
-
progress.show index
|
82
|
+
if @primary_keys.empty? || !@batch_size.present?
|
83
|
+
process_table progress
|
84
|
+
else
|
85
|
+
process_table_in_batches progress
|
86
86
|
end
|
87
87
|
progress.close
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
91
|
+
def process_table progress
|
92
|
+
index = 0
|
93
|
+
source_table.all.each do |record|
|
94
|
+
index += 1
|
95
|
+
begin
|
96
|
+
process_record index, record
|
97
|
+
rescue => exception
|
98
|
+
@errors.log_error record, exception
|
99
|
+
end
|
100
|
+
progress.show index
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def process_table_in_batches progress
|
105
|
+
logger.info "Processing table #{@name} records in batch size of #{@batch_size}"
|
106
|
+
index = 0
|
107
|
+
source_table.find_each(:batch_size => @batch_size) do |record|
|
108
|
+
index += 1
|
109
|
+
begin
|
110
|
+
process_record index, record
|
111
|
+
rescue => exception
|
112
|
+
@errors.log_error record, exception
|
113
|
+
end
|
114
|
+
progress.show index
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
91
118
|
def progress_bar
|
92
119
|
@progress_bar || DataAnon::Utils::ProgressBar
|
93
120
|
end
|
data/lib/utils/database.rb
CHANGED
@@ -28,11 +28,12 @@ module DataAnon
|
|
28
28
|
|
29
29
|
class BaseTable
|
30
30
|
|
31
|
-
def self.create_table database, table_name, primary_keys
|
31
|
+
def self.create_table database, table_name, primary_keys = []
|
32
32
|
Class.new(database) do
|
33
33
|
self.table_name = table_name
|
34
34
|
self.primary_keys = primary_keys if primary_keys.length > 1
|
35
35
|
self.primary_key = primary_keys[0] if primary_keys.length == 1
|
36
|
+
self.primary_key = nil if primary_keys.length == 0
|
36
37
|
self.inheritance_column = :_type_disabled
|
37
38
|
self.mass_assignment_sanitizer = MassAssignmentIgnoreSanitizer.new(self)
|
38
39
|
end
|
@@ -42,7 +43,7 @@ module DataAnon
|
|
42
43
|
|
43
44
|
class SourceTable < BaseTable
|
44
45
|
|
45
|
-
def self.create table_name, primary_key
|
46
|
+
def self.create table_name, primary_key = []
|
46
47
|
create_table SourceDatabase, table_name, primary_key
|
47
48
|
end
|
48
49
|
|
@@ -50,7 +51,7 @@ module DataAnon
|
|
50
51
|
|
51
52
|
class DestinationTable < BaseTable
|
52
53
|
|
53
|
-
def self.create table_name, primary_key
|
54
|
+
def self.create table_name, primary_key = []
|
54
55
|
create_table DestinationDatabase, table_name, primary_key
|
55
56
|
end
|
56
57
|
|
data/lib/version.rb
CHANGED
@@ -21,7 +21,6 @@ describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
|
21
21
|
destination_db dest_connection_spec
|
22
22
|
|
23
23
|
table 'customers' do
|
24
|
-
primary_key 'cust_id'
|
25
24
|
whitelist 'cust_id', 'address', 'zipcode', 'blog_url'
|
26
25
|
anonymize('first_name').using FieldStrategy::RandomFirstName.new
|
27
26
|
anonymize('last_name').using FieldStrategy::RandomLastName.new
|
@@ -34,8 +33,8 @@ describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
|
34
33
|
end
|
35
34
|
|
36
35
|
DataAnon::Utils::DestinationDatabase.establish_connection dest_connection_spec
|
37
|
-
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
38
|
-
new_rec = dest_table.
|
36
|
+
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
37
|
+
new_rec = dest_table.where("cust_id" => CustomerSample::SAMPLE_DATA[:cust_id]).first
|
39
38
|
new_rec.first_name.should_not be("Sunit")
|
40
39
|
new_rec.last_name.should_not be("Parekh")
|
41
40
|
new_rec.birth_date.should_not be(Date.new(1977,7,8))
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
4
|
+
|
5
|
+
source_connection_spec = {:adapter => 'sqlite3', :database => 'tmp/customer.sqlite'}
|
6
|
+
dest_connection_spec = {:adapter => 'sqlite3', :database => 'tmp/customer-dest.sqlite'}
|
7
|
+
|
8
|
+
before(:each) do
|
9
|
+
CustomerSample.clean
|
10
|
+
CustomerSample.create_schema source_connection_spec
|
11
|
+
CustomerSample.insert_record source_connection_spec, CustomerSample::SAMPLE_DATA
|
12
|
+
|
13
|
+
CustomerSample.create_schema dest_connection_spec
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should anonymize customer table record " do
|
17
|
+
|
18
|
+
database "Customer" do
|
19
|
+
strategy DataAnon::Strategy::Whitelist
|
20
|
+
source_db source_connection_spec
|
21
|
+
destination_db dest_connection_spec
|
22
|
+
|
23
|
+
table 'customers' do
|
24
|
+
primary_key 'cust_id'
|
25
|
+
batch_size 1
|
26
|
+
|
27
|
+
whitelist 'cust_id', 'address', 'zipcode', 'blog_url'
|
28
|
+
anonymize('first_name').using FieldStrategy::RandomFirstName.new
|
29
|
+
anonymize('last_name').using FieldStrategy::RandomLastName.new
|
30
|
+
anonymize('state').using FieldStrategy::SelectFromList.new(['Gujrat','Karnataka'])
|
31
|
+
anonymize('phone').using FieldStrategy::RandomPhoneNumber.new
|
32
|
+
anonymize('email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
33
|
+
anonymize 'terms_n_condition', 'age', 'longitude'
|
34
|
+
anonymize('latitude').using FieldStrategy::RandomFloatDelta.new(2.0)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
DataAnon::Utils::DestinationDatabase.establish_connection dest_connection_spec
|
39
|
+
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
40
|
+
new_rec = dest_table.where("cust_id" => CustomerSample::SAMPLE_DATA[:cust_id]).first
|
41
|
+
new_rec.first_name.should_not be("Sunit")
|
42
|
+
new_rec.last_name.should_not be("Parekh")
|
43
|
+
new_rec.birth_date.should_not be(Date.new(1977,7,8))
|
44
|
+
new_rec.address.should == 'F 501 Shanti Nagar'
|
45
|
+
['Gujrat','Karnataka'].should include(new_rec.state)
|
46
|
+
new_rec.zipcode.should == '411048'
|
47
|
+
new_rec.phone.should_not be "9923700662"
|
48
|
+
new_rec.email.should == 'test+1@gmail.com'
|
49
|
+
[true,false].should include(new_rec.terms_n_condition)
|
50
|
+
new_rec.age.should be_between(0,100)
|
51
|
+
new_rec.latitude.should be_between( 38.689060, 42.689060)
|
52
|
+
new_rec.longitude.should be_between( -84.044636, -64.044636)
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -2,7 +2,7 @@ class CustomerSample
|
|
2
2
|
|
3
3
|
class CreateCustomer < ActiveRecord::Migration
|
4
4
|
def up
|
5
|
-
create_table :customers, { :id => false } do |t|
|
5
|
+
create_table :customers, { :id => false, :force => true } do |t|
|
6
6
|
t.integer :cust_id, :primary => true
|
7
7
|
t.string :first_name
|
8
8
|
t.string :last_name
|
metadata
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data-anonymization
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.5.1.rc1
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Sunit Parekh
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-10-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: activerecord
|
@@ -244,6 +244,7 @@ files:
|
|
244
244
|
- spec/acceptance/mongodb_whitelist_spec.rb
|
245
245
|
- spec/acceptance/rdbms_blacklist_spec.rb
|
246
246
|
- spec/acceptance/rdbms_whitelist_spec.rb
|
247
|
+
- spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb
|
247
248
|
- spec/core/fields_missing_strategy_spec.rb
|
248
249
|
- spec/resource/sample.geojson
|
249
250
|
- spec/spec_helper.rb
|
@@ -306,9 +307,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
306
307
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
307
308
|
none: false
|
308
309
|
requirements:
|
309
|
-
- - ! '
|
310
|
+
- - ! '>'
|
310
311
|
- !ruby/object:Gem::Version
|
311
|
-
version:
|
312
|
+
version: 1.3.1
|
312
313
|
requirements: []
|
313
314
|
rubyforge_project:
|
314
315
|
rubygems_version: 1.8.24
|
@@ -321,6 +322,7 @@ test_files:
|
|
321
322
|
- spec/acceptance/mongodb_whitelist_spec.rb
|
322
323
|
- spec/acceptance/rdbms_blacklist_spec.rb
|
323
324
|
- spec/acceptance/rdbms_whitelist_spec.rb
|
325
|
+
- spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb
|
324
326
|
- spec/core/fields_missing_strategy_spec.rb
|
325
327
|
- spec/resource/sample.geojson
|
326
328
|
- spec/spec_helper.rb
|