data-anonymization 0.5.0 → 0.5.1.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -2
- data/examples/whitelist_dsl.rb +14 -10
- data/lib/core/table_errors.rb +2 -1
- data/lib/strategy/base.rb +36 -9
- data/lib/utils/database.rb +4 -3
- data/lib/version.rb +1 -1
- data/spec/acceptance/rdbms_whitelist_spec.rb +2 -3
- data/spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb +55 -0
- data/spec/support/customer_sample.rb +1 -1
- metadata +7 -5
data/README.md
CHANGED
@@ -68,14 +68,14 @@ Postgresql database having **composite primary key**
|
|
68
68
|
|
69
69
|
## Changelog
|
70
70
|
|
71
|
-
#### 0.5.0
|
71
|
+
#### 0.5.0 (Sep 28, 2012)
|
72
72
|
|
73
73
|
Major changes:
|
74
74
|
|
75
75
|
1. MongoDB support
|
76
76
|
2. Command line utility to generate whitelist DSL for RDBMS & MongoDB (reduces pain for writing whitelist dsl)
|
77
77
|
3. Added support for reporting fields missing mapping in case of whitelist
|
78
|
-
4. Errors reported at the end of process. Job doesn't fail for a single error.
|
78
|
+
4. Errors reported at the end of process. Job doesn't fail for a single error; it fails if more than 100 records failed during anonymization.
|
79
79
|
|
80
80
|
|
81
81
|
Please see the [Github 0.5.0 milestone page](https://github.com/sunitparekh/data-anonymization/issues?milestone=2&state=open) for more details on changes/fixes in release 0.5.0
|
@@ -115,6 +115,8 @@ For almost all projects there is a need for production data dump in order to run
|
|
115
115
|
However, getting production data and using it is not feasible for multiple reasons, the primary one being privacy concerns for user data; hence the need for data anonymization.
|
116
116
|
This tool helps you to get anonymized production data dump using either Blacklist or Whitelist strategies.
|
117
117
|
|
118
|
+
Read more about [data anonymization here](http://sunitspace.blogspot.in/2012/09/data-anonymization.html)
|
119
|
+
|
118
120
|
## Anonymization Strategies
|
119
121
|
|
120
122
|
### Blacklist
|
@@ -147,6 +149,9 @@ database 'DatabaseName' do
|
|
147
149
|
end
|
148
150
|
```
|
149
151
|
|
152
|
+
Read more about [blacklist and whitelist here](http://sunitspace.blogspot.in/2012/09/data-anonymization-blacklist-whitelist.html)
|
153
|
+
|
154
|
+
|
150
155
|
## Tips
|
151
156
|
|
152
157
|
1. In Whitelist approach make source database connection READONLY.
|
data/examples/whitelist_dsl.rb
CHANGED
@@ -26,38 +26,42 @@ database 'Chinook' do
|
|
26
26
|
|
27
27
|
end
|
28
28
|
|
29
|
-
table '
|
30
|
-
primary_key '
|
31
|
-
|
32
|
-
|
29
|
+
table 'Customer' do
|
30
|
+
primary_key 'CustomerId'
|
31
|
+
batch_size 5 # batch_size works only if the primary_key is defined for the table
|
32
|
+
|
33
|
+
whitelist 'CustomerId', 'SupportRepId', 'Company'
|
34
|
+
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
33
35
|
anonymize('FirstName').using FieldStrategy::RandomFirstName.new
|
34
36
|
anonymize('LastName').using FieldStrategy::RandomLastName.new
|
35
|
-
anonymize('HireDate').using FieldStrategy::DateTimeDelta.new(2, 0)
|
36
37
|
anonymize('Address').using FieldStrategy::RandomAddress.region_US
|
37
38
|
anonymize('City').using FieldStrategy::RandomCity.region_US
|
38
39
|
anonymize('State').using FieldStrategy::RandomProvince.region_US
|
39
40
|
anonymize('PostalCode').using FieldStrategy::RandomZipcode.region_US
|
40
41
|
anonymize('Country') {|field| "USA" }
|
41
|
-
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
42
42
|
anonymize('Fax').using FieldStrategy::RandomPhoneNumber.new
|
43
43
|
anonymize('Email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
44
44
|
end
|
45
45
|
|
46
|
-
table '
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
table 'Employee' do
|
47
|
+
batch_size 5 # this won't work since there is no 'primary_key' defined
|
48
|
+
|
49
|
+
whitelist 'EmployeeId', 'ReportsTo', 'Title'
|
50
|
+
anonymize('BirthDate').using FieldStrategy::DateTimeDelta.new(1, 1)
|
50
51
|
anonymize('FirstName').using FieldStrategy::RandomFirstName.new
|
51
52
|
anonymize('LastName').using FieldStrategy::RandomLastName.new
|
53
|
+
anonymize('HireDate').using FieldStrategy::DateTimeDelta.new(2, 0)
|
52
54
|
anonymize('Address').using FieldStrategy::RandomAddress.region_US
|
53
55
|
anonymize('City').using FieldStrategy::RandomCity.region_US
|
54
56
|
anonymize('State').using FieldStrategy::RandomProvince.region_US
|
55
57
|
anonymize('PostalCode').using FieldStrategy::RandomZipcode.region_US
|
56
58
|
anonymize('Country') {|field| "USA" }
|
59
|
+
anonymize('Phone').using FieldStrategy::RandomPhoneNumber.new
|
57
60
|
anonymize('Fax').using FieldStrategy::RandomPhoneNumber.new
|
58
61
|
anonymize('Email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
59
62
|
end
|
60
63
|
|
61
64
|
|
65
|
+
|
62
66
|
end
|
63
67
|
|
data/lib/core/table_errors.rb
CHANGED
@@ -20,9 +20,10 @@ module DataAnon
|
|
20
20
|
|
21
21
|
def print
|
22
22
|
return if @errors.length == 0
|
23
|
-
logger.error("Errors while processing table #{@table_name}:")
|
23
|
+
logger.error("Errors while processing table '#{@table_name}':")
|
24
24
|
@errors.each do |error|
|
25
25
|
logger.error(error[:exception])
|
26
|
+
logger.error(error[:exception].backtrace.join("\n\t"))
|
26
27
|
end
|
27
28
|
end
|
28
29
|
|
data/lib/strategy/base.rb
CHANGED
@@ -13,6 +13,7 @@ module DataAnon
|
|
13
13
|
@destination_database = destination_database
|
14
14
|
@fields_missing_strategy = DataAnon::Core::FieldsMissingStrategy.new name
|
15
15
|
@errors = DataAnon::Core::TableErrors.new(@name)
|
16
|
+
@primary_keys = []
|
16
17
|
end
|
17
18
|
|
18
19
|
def self.whitelist?
|
@@ -28,6 +29,10 @@ module DataAnon
|
|
28
29
|
@primary_keys = fields
|
29
30
|
end
|
30
31
|
|
32
|
+
def batch_size size
|
33
|
+
@batch_size = size
|
34
|
+
end
|
35
|
+
|
31
36
|
def whitelist *fields
|
32
37
|
fields.each { |f| @fields[f.downcase] = DataAnon::Strategy::Field::Whitelist.new }
|
33
38
|
end
|
@@ -73,21 +78,43 @@ module DataAnon
|
|
73
78
|
logger.debug "Processing table #{@name} with fields strategies #{@fields}"
|
74
79
|
total = source_table.count
|
75
80
|
if total > 0
|
76
|
-
index = 0
|
77
81
|
progress = progress_bar.new(@name, total)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
rescue => exception
|
83
|
-
@errors.log_error record, exception
|
84
|
-
end
|
85
|
-
progress.show index
|
82
|
+
if @primary_keys.empty? || !@batch_size.present?
|
83
|
+
process_table progress
|
84
|
+
else
|
85
|
+
process_table_in_batches progress
|
86
86
|
end
|
87
87
|
progress.close
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
91
|
+
def process_table progress
|
92
|
+
index = 0
|
93
|
+
source_table.all.each do |record|
|
94
|
+
index += 1
|
95
|
+
begin
|
96
|
+
process_record index, record
|
97
|
+
rescue => exception
|
98
|
+
@errors.log_error record, exception
|
99
|
+
end
|
100
|
+
progress.show index
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def process_table_in_batches progress
|
105
|
+
logger.info "Processing table #{@name} records in batch size of #{@batch_size}"
|
106
|
+
index = 0
|
107
|
+
source_table.find_each(:batch_size => @batch_size) do |record|
|
108
|
+
index += 1
|
109
|
+
begin
|
110
|
+
process_record index, record
|
111
|
+
rescue => exception
|
112
|
+
@errors.log_error record, exception
|
113
|
+
end
|
114
|
+
progress.show index
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
91
118
|
def progress_bar
|
92
119
|
@progress_bar || DataAnon::Utils::ProgressBar
|
93
120
|
end
|
data/lib/utils/database.rb
CHANGED
@@ -28,11 +28,12 @@ module DataAnon
|
|
28
28
|
|
29
29
|
class BaseTable
|
30
30
|
|
31
|
-
def self.create_table database, table_name, primary_keys
|
31
|
+
def self.create_table database, table_name, primary_keys = []
|
32
32
|
Class.new(database) do
|
33
33
|
self.table_name = table_name
|
34
34
|
self.primary_keys = primary_keys if primary_keys.length > 1
|
35
35
|
self.primary_key = primary_keys[0] if primary_keys.length == 1
|
36
|
+
self.primary_key = nil if primary_keys.length == 0
|
36
37
|
self.inheritance_column = :_type_disabled
|
37
38
|
self.mass_assignment_sanitizer = MassAssignmentIgnoreSanitizer.new(self)
|
38
39
|
end
|
@@ -42,7 +43,7 @@ module DataAnon
|
|
42
43
|
|
43
44
|
class SourceTable < BaseTable
|
44
45
|
|
45
|
-
def self.create table_name, primary_key
|
46
|
+
def self.create table_name, primary_key = []
|
46
47
|
create_table SourceDatabase, table_name, primary_key
|
47
48
|
end
|
48
49
|
|
@@ -50,7 +51,7 @@ module DataAnon
|
|
50
51
|
|
51
52
|
class DestinationTable < BaseTable
|
52
53
|
|
53
|
-
def self.create table_name, primary_key
|
54
|
+
def self.create table_name, primary_key = []
|
54
55
|
create_table DestinationDatabase, table_name, primary_key
|
55
56
|
end
|
56
57
|
|
data/lib/version.rb
CHANGED
@@ -21,7 +21,6 @@ describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
|
21
21
|
destination_db dest_connection_spec
|
22
22
|
|
23
23
|
table 'customers' do
|
24
|
-
primary_key 'cust_id'
|
25
24
|
whitelist 'cust_id', 'address', 'zipcode', 'blog_url'
|
26
25
|
anonymize('first_name').using FieldStrategy::RandomFirstName.new
|
27
26
|
anonymize('last_name').using FieldStrategy::RandomLastName.new
|
@@ -34,8 +33,8 @@ describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
|
34
33
|
end
|
35
34
|
|
36
35
|
DataAnon::Utils::DestinationDatabase.establish_connection dest_connection_spec
|
37
|
-
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
38
|
-
new_rec = dest_table.
|
36
|
+
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
37
|
+
new_rec = dest_table.where("cust_id" => CustomerSample::SAMPLE_DATA[:cust_id]).first
|
39
38
|
new_rec.first_name.should_not be("Sunit")
|
40
39
|
new_rec.last_name.should_not be("Parekh")
|
41
40
|
new_rec.birth_date.should_not be(Date.new(1977,7,8))
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe "End 2 End RDBMS Whitelist Acceptance Test using SQLite database" do
|
4
|
+
|
5
|
+
source_connection_spec = {:adapter => 'sqlite3', :database => 'tmp/customer.sqlite'}
|
6
|
+
dest_connection_spec = {:adapter => 'sqlite3', :database => 'tmp/customer-dest.sqlite'}
|
7
|
+
|
8
|
+
before(:each) do
|
9
|
+
CustomerSample.clean
|
10
|
+
CustomerSample.create_schema source_connection_spec
|
11
|
+
CustomerSample.insert_record source_connection_spec, CustomerSample::SAMPLE_DATA
|
12
|
+
|
13
|
+
CustomerSample.create_schema dest_connection_spec
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should anonymize customer table record " do
|
17
|
+
|
18
|
+
database "Customer" do
|
19
|
+
strategy DataAnon::Strategy::Whitelist
|
20
|
+
source_db source_connection_spec
|
21
|
+
destination_db dest_connection_spec
|
22
|
+
|
23
|
+
table 'customers' do
|
24
|
+
primary_key 'cust_id'
|
25
|
+
batch_size 1
|
26
|
+
|
27
|
+
whitelist 'cust_id', 'address', 'zipcode', 'blog_url'
|
28
|
+
anonymize('first_name').using FieldStrategy::RandomFirstName.new
|
29
|
+
anonymize('last_name').using FieldStrategy::RandomLastName.new
|
30
|
+
anonymize('state').using FieldStrategy::SelectFromList.new(['Gujrat','Karnataka'])
|
31
|
+
anonymize('phone').using FieldStrategy::RandomPhoneNumber.new
|
32
|
+
anonymize('email').using FieldStrategy::StringTemplate.new('test+#{row_number}@gmail.com')
|
33
|
+
anonymize 'terms_n_condition', 'age', 'longitude'
|
34
|
+
anonymize('latitude').using FieldStrategy::RandomFloatDelta.new(2.0)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
DataAnon::Utils::DestinationDatabase.establish_connection dest_connection_spec
|
39
|
+
dest_table = DataAnon::Utils::DestinationTable.create 'customers'
|
40
|
+
new_rec = dest_table.where("cust_id" => CustomerSample::SAMPLE_DATA[:cust_id]).first
|
41
|
+
new_rec.first_name.should_not be("Sunit")
|
42
|
+
new_rec.last_name.should_not be("Parekh")
|
43
|
+
new_rec.birth_date.should_not be(Date.new(1977,7,8))
|
44
|
+
new_rec.address.should == 'F 501 Shanti Nagar'
|
45
|
+
['Gujrat','Karnataka'].should include(new_rec.state)
|
46
|
+
new_rec.zipcode.should == '411048'
|
47
|
+
new_rec.phone.should_not be "9923700662"
|
48
|
+
new_rec.email.should == 'test+1@gmail.com'
|
49
|
+
[true,false].should include(new_rec.terms_n_condition)
|
50
|
+
new_rec.age.should be_between(0,100)
|
51
|
+
new_rec.latitude.should be_between( 38.689060, 42.689060)
|
52
|
+
new_rec.longitude.should be_between( -84.044636, -64.044636)
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -2,7 +2,7 @@ class CustomerSample
|
|
2
2
|
|
3
3
|
class CreateCustomer < ActiveRecord::Migration
|
4
4
|
def up
|
5
|
-
create_table :customers, { :id => false } do |t|
|
5
|
+
create_table :customers, { :id => false, :force => true } do |t|
|
6
6
|
t.integer :cust_id, :primary => true
|
7
7
|
t.string :first_name
|
8
8
|
t.string :last_name
|
metadata
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data-anonymization
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
5
|
-
prerelease:
|
4
|
+
version: 0.5.1.rc1
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Sunit Parekh
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-10-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: activerecord
|
@@ -244,6 +244,7 @@ files:
|
|
244
244
|
- spec/acceptance/mongodb_whitelist_spec.rb
|
245
245
|
- spec/acceptance/rdbms_blacklist_spec.rb
|
246
246
|
- spec/acceptance/rdbms_whitelist_spec.rb
|
247
|
+
- spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb
|
247
248
|
- spec/core/fields_missing_strategy_spec.rb
|
248
249
|
- spec/resource/sample.geojson
|
249
250
|
- spec/spec_helper.rb
|
@@ -306,9 +307,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
306
307
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
307
308
|
none: false
|
308
309
|
requirements:
|
309
|
-
- - ! '
|
310
|
+
- - ! '>'
|
310
311
|
- !ruby/object:Gem::Version
|
311
|
-
version:
|
312
|
+
version: 1.3.1
|
312
313
|
requirements: []
|
313
314
|
rubyforge_project:
|
314
315
|
rubygems_version: 1.8.24
|
@@ -321,6 +322,7 @@ test_files:
|
|
321
322
|
- spec/acceptance/mongodb_whitelist_spec.rb
|
322
323
|
- spec/acceptance/rdbms_blacklist_spec.rb
|
323
324
|
- spec/acceptance/rdbms_whitelist_spec.rb
|
325
|
+
- spec/acceptance/rdbms_whitelist_with_primary_key_spec.rb
|
324
326
|
- spec/core/fields_missing_strategy_spec.rb
|
325
327
|
- spec/resource/sample.geojson
|
326
328
|
- spec/spec_helper.rb
|