data-anonymization 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. data/.documentup.json +8 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/.rvmrc +1 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +12 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +256 -0
  9. data/Rakefile +9 -0
  10. data/blacklist_dsl.rb +19 -0
  11. data/blacklist_nosql_dsl.rb +36 -0
  12. data/data-anonymization.gemspec +22 -0
  13. data/lib/core/database.rb +36 -0
  14. data/lib/core/dsl.rb +16 -0
  15. data/lib/core/field.rb +18 -0
  16. data/lib/data-anonymization.rb +12 -0
  17. data/lib/strategy/base.rb +67 -0
  18. data/lib/strategy/blacklist.rb +18 -0
  19. data/lib/strategy/field/anonymize_time.rb +57 -0
  20. data/lib/strategy/field/anonymous.rb +21 -0
  21. data/lib/strategy/field/date_time_delta.rb +24 -0
  22. data/lib/strategy/field/default_anon.rb +28 -0
  23. data/lib/strategy/field/distinct_column_values.rb +25 -0
  24. data/lib/strategy/field/fields.rb +23 -0
  25. data/lib/strategy/field/gmail_template.rb +17 -0
  26. data/lib/strategy/field/lorem_ipsum.rb +29 -0
  27. data/lib/strategy/field/random_boolean.rb +19 -0
  28. data/lib/strategy/field/random_email.rb +31 -0
  29. data/lib/strategy/field/random_first_name.rb +18 -0
  30. data/lib/strategy/field/random_float_delta.rb +24 -0
  31. data/lib/strategy/field/random_full_name.rb +28 -0
  32. data/lib/strategy/field/random_int.rb +23 -0
  33. data/lib/strategy/field/random_integer_delta.rb +21 -0
  34. data/lib/strategy/field/random_last_name.rb +19 -0
  35. data/lib/strategy/field/random_mailinator_email.rb +20 -0
  36. data/lib/strategy/field/random_phone_number.rb +24 -0
  37. data/lib/strategy/field/random_selection.rb +23 -0
  38. data/lib/strategy/field/random_string.rb +22 -0
  39. data/lib/strategy/field/random_user_name.rb +23 -0
  40. data/lib/strategy/field/string_template.rb +22 -0
  41. data/lib/strategy/field/user_name_template.rb +22 -0
  42. data/lib/strategy/field/whitelist.rb +17 -0
  43. data/lib/strategy/strategies.rb +4 -0
  44. data/lib/strategy/whitelist.rb +21 -0
  45. data/lib/tasks/rake_tasks.rb +19 -0
  46. data/lib/utils/database.rb +53 -0
  47. data/lib/utils/logging.rb +29 -0
  48. data/lib/utils/random_int.rb +15 -0
  49. data/lib/utils/random_string.rb +14 -0
  50. data/lib/utils/resource.rb +13 -0
  51. data/lib/version.rb +3 -0
  52. data/resources/first_names.txt +500 -0
  53. data/resources/last_names.txt +500 -0
  54. data/spec/acceptance/rdbms_blacklist_spec.rb +30 -0
  55. data/spec/acceptance/rdbms_whitelist_spec.rb +50 -0
  56. data/spec/spec_helper.rb +26 -0
  57. data/spec/strategy/field/anonymize_time_spec.rb +23 -0
  58. data/spec/strategy/field/date_time_delta_spec.rb +43 -0
  59. data/spec/strategy/field/distinct_column_values_spec.rb +22 -0
  60. data/spec/strategy/field/gmail_template_spec.rb +14 -0
  61. data/spec/strategy/field/lorem_ipsum_spec.rb +27 -0
  62. data/spec/strategy/field/random_boolean_spec.rb +16 -0
  63. data/spec/strategy/field/random_email_spec.rb +18 -0
  64. data/spec/strategy/field/random_first_name_spec.rb +14 -0
  65. data/spec/strategy/field/random_float_delta_spec.rb +21 -0
  66. data/spec/strategy/field/random_full_name_spec.rb +23 -0
  67. data/spec/strategy/field/random_int_spec.rb +28 -0
  68. data/spec/strategy/field/random_integer_delta_spec.rb +23 -0
  69. data/spec/strategy/field/random_last_name_spec.rb +14 -0
  70. data/spec/strategy/field/random_mailinator_email_spec.rb +21 -0
  71. data/spec/strategy/field/random_phone_number_spec.rb +35 -0
  72. data/spec/strategy/field/random_selection_spec.rb +36 -0
  73. data/spec/strategy/field/random_string_spec.rb +23 -0
  74. data/spec/strategy/field/random_user_name_spec.rb +23 -0
  75. data/spec/strategy/field/string_template_spec.rb +15 -0
  76. data/spec/strategy/field/user_name_template_spec.rb +13 -0
  77. data/spec/strategy/field/whitelist_spec.rb +21 -0
  78. data/spec/support/customer_sample.rb +43 -0
  79. data/spec/utils/database_spec.rb +26 -0
  80. data/spec/utils/random_int_spec.rb +9 -0
  81. data/spec/utils/random_string_spec.rb +8 -0
  82. data/whitelist_dsl.rb +44 -0
  83. metadata +192 -0
@@ -0,0 +1,8 @@
1
+ {
2
+ "repo": "sunitparekh/data-anonymization",
3
+ "name": "Data Anonymization",
4
+ "theme": "v1",
5
+ "travis": true,
6
+ "twitter": ["dataanon"],
7
+ "google_analytics":"UA-34000799-1"
8
+ }
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea
19
+ sample-data/chinook-empty.sqlite
20
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ #--profile
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.3-p125@data-anon --create
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ before_install: gem install bundler --pre
3
+ before_script: rake empty_dest
4
+ rvm:
5
+ - 1.9.2
6
+ - 1.9.3
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ group :development, :test do
6
+ gem 'foreman'
7
+ gem 'rake'
8
+ gem 'rspec'
9
+ gem 'pry'
10
+ gem 'sqlite3'
11
+ end
12
+
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Sunit Parekh
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,256 @@
1
+ # Data::Anonymization
2
+ Tool to create an anonymized production data dump for use in PERF and other TEST environments.
3
+
4
+ ## Getting started
5
+ Install gem using:
6
+
7
+ $ gem install data-anonymization
8
+
9
+ Create a Ruby program using the data-anonymization DSL, as in the following `my_dsl.rb`:
10
+
11
+ ```ruby
12
+ require 'data-anonymization'
13
+ DF = DataAnon::Strategy::Field
14
+
15
+ database 'DatabaseName' do
16
+ strategy DataAnon::Strategy::Blacklist # whitelist (default) or blacklist
17
+
18
+ # database config as active record connection hash
19
+ source_db :adapter => 'sqlite3', :database => 'sample-data/chinook-empty.sqlite'
20
+
21
+ table 'User' do
22
+ primary_key 'id'
23
+ anonymize 'DateOfBirth' # uses default anonymization based on data types
24
+ anonymize('UserName').using DF::StringTemplate.new('user#{row_number}')
25
+ anonymize('Password') { |field| "password" }
26
+ end
27
+
28
+ ...
29
+
30
+ end
31
+ ```
32
+
33
+ Run using:
34
+
35
+ $ ruby my_dsl.rb
36
+
37
+ ### Share feedback
38
+ Please use GitHub [issues](https://github.com/sunitparekh/data-anonymization/issues) to share feedback, suggest features, or report any issues you find.
39
+
40
+ Read more to learn all the features of the tool...
41
+
42
+ ## What is data anonymization?
43
+
44
+ Almost every project needs a production data dump to run performance tests, rehearse production releases and debug production issues.
45
+ However, getting production data and using it is often not feasible for multiple reasons, one of them being the users' personal data in the database. Hence the need for data anonymization.
46
+ This tool helps you to get an anonymized production data dump using either the Blacklist or the Whitelist strategy.
47
+
48
+ ## Anonymization Strategies
49
+
50
+ ### Blacklist
51
+ This approach is essentially to leave all fields unchanged with the exception of a few which are scrambled/anonymized (hence the name blacklist).
52
+ Blacklist creates a copy of the prod database and lets you choose the fields to be anonymized, e.g. username, password, email, name, geo location etc. Most of the fields have different rules, e.g. password is always set to the same value for all users, while email needs to be a valid email address (we used the Gmail trick of appending +N to the address).
53
+ The problem with this approach is that if new fields are added, they will not be anonymized by default, so there is a risk of users' personal data passing through in the future.
54
+
55
+ ```ruby
56
+ database 'DatabaseName' do
57
+ strategy DataAnon::Strategy::Blacklist
58
+ source_db :adapter => 'sqlite3', :database => 'sample-data/chinook-empty.sqlite'
59
+ ...
60
+ end
61
+ ```
62
+
63
+ ### Whitelist
64
+ This approach is essentially to scramble/anonymize all fields except a list of fields that are allowed to be copied as is, called the whitelist.
65
+ By default all data needs to be anonymized. So the data from the production database is sanitized record by record, and the anonymized data is inserted into the destination database. The source database is treated as read-only.
66
+ There are default anonymization rules based on data types, and special rules for fields like username, password, email, name, geo location etc. A field on the whitelist is one where it's okay to copy the data as is, because it doesn't need anonymization.
67
+ This way any new field will by default get anonymized, and if we need it as is, we add it to the whitelist explicitly.
68
+
69
+ ```ruby
70
+ database 'DatabaseName' do
71
+ strategy DataAnon::Strategy::Whitelist
72
+ source_db :adapter => 'sqlite3', :database => 'sample-data/chinook.sqlite'
73
+ destination_db :adapter => 'sqlite3', :database => 'sample-data/chinook-empty.sqlite'
74
+ ...
75
+ end
76
+ ```
77
+
78
+ ## DataAnon::Core::Field
79
+ The object that gets passed along with the field strategies.
80
+
81
+ It has the following attribute accessors:
82
+
83
+ - `name` current field/column name
84
+ - `value` current field/column value
85
+ - `row_number` current row number
86
+ - `ar_record` active record of the current row under processing
87
+
88
+ ## Field Strategies
89
+
90
+ ### LoremIpsum
91
+ Default anonymization strategy for `string` content. Uses default 'Lorem ipsum...' text or text supplied in strategy to generate same length string.
92
+
93
+ ```ruby
94
+ anonymize('UserName').using DataAnon::Strategy::Field::LoremIpsum.new
95
+ ```
96
+ ```ruby
97
+ anonymize('UserName').using DataAnon::Strategy::Field::LoremIpsum.new("very large string....")
98
+ ```
99
+ ```ruby
100
+ anonymize('UserName').using DataAnon::Strategy::Field::LoremIpsum.new(File.read('my_file.txt'))
101
+ ```
102
+
103
+ ### RandomString
104
+ Generates random string of same length.
105
+ ```ruby
106
+ anonymize('UserName').using DataAnon::Strategy::Field::RandomString.new
107
+ ```
108
+
109
+ ### StringTemplate
110
+ Simple string evaluation within [DataAnon::Core::Field](#dataanon-core-field) context. Can be used for email, username anonymization.
111
+ Make sure to put the string in 'single quotes', otherwise it will get interpolated inline.
112
+ ```ruby
113
+ anonymize('UserName').using DataAnon::Strategy::Field::StringTemplate.new('user#{row_number}')
114
+ ```
115
+ ```ruby
116
+ anonymize('Email').using DataAnon::Strategy::Field::StringTemplate.new('valid.address+#{row_number}@gmail.com')
117
+ ```
118
+ ```ruby
119
+ anonymize('Email').using DataAnon::Strategy::Field::StringTemplate.new('useremail#{row_number}@mailinator.com')
120
+ ```
121
+
122
+ ### DateTimeDelta
123
+ Shifts date and time randomly within the given range. By default it shifts the date within 10 days (+ or -) and the time within 30 minutes.
124
+ ```ruby
125
+ anonymize('DateOfBirth').using DataAnon::Strategy::Field::DateTimeDelta.new
126
+ ```
127
+ ```ruby
128
+ # shifts date within 20 days and time within 50 minutes
129
+ anonymize('DateOfBirth').using DataAnon::Strategy::Field::DateTimeDelta.new(20, 50)
130
+ ```
131
+
132
+ ### RandomEmail
133
+ Generates email randomly using the given HOSTNAME and TLD.
134
+ By default generates the hostname randomly along with the email id.
135
+ ```ruby
136
+ anonymize('DateOfBirth').using DataAnon::Strategy::Field::RandomEmail.new('thoughtworks','com')
137
+ ```
138
+
139
+ ### RandomMailinatorEmail
140
+ Generates a random email using the mailinator hostname, e.g. `<randomstring>@mailinator.com`
141
+ ```ruby
142
+ anonymize('DateOfBirth').using DataAnon::Strategy::Field::RandomMailinatorEmail.new
143
+ ```
144
+
145
+ ### RandomUserName
146
+ ### RandomFirstName
147
+ ### RandomLastName
148
+ ### RandomFullName
149
+ ### RandomInt
150
+ ### RandomIntegerDelta
151
+ ### RandomFloatDelta
152
+
153
+ - - -
154
+
155
+
156
+
157
+ - - -
158
+
159
+ ## Write your own field strategies
160
+ The `field` parameter in the following code is a [DataAnon::Core::Field](#dataanon-core-field).
161
+
162
+ ```ruby
163
+ class MyFieldStrategy
164
+
165
+ # method anonymize is what required
166
+ def anonymize field
167
+ # write your code here
168
+ end
169
+
170
+ end
171
+ ```
172
+
173
+ You can also write your own anonymous field strategies within the DSL:
174
+
175
+ ```ruby
176
+ table 'User' do
177
+ anonymize('Password') { |field| "password" }
178
+ anonymize('email') do |field|
179
+ "test+#{field.row_number}@gmail.com"
180
+ end
181
+ end
182
+ ```
183
+
184
+
185
+ ## Default field strategies
186
+
187
+ ```ruby
188
+ # Work in progress...
189
+ DEFAULT_STRATEGIES = {:string => FS::LoremIpsum.new,
190
+ :integer => FS::RandomInt.new(18,70),
191
+ :datetime => FS::DateTimeDelta.new,
192
+ :boolean => FS::RandomBoolean.new
193
+ }
194
+ ```
195
+
196
+ Overriding default field strategies,
197
+
198
+ ```ruby
199
+ database 'Chinook' do
200
+ ...
201
+ default_field_strategies :string => DataAnon::Strategy::Field::RandomString.new
202
+ ...
203
+ end
204
+ ```
205
+
206
+ ## Examples
207
+
208
+ 1. [Whitelist](https://github.com/sunitparekh/data-anonymization/blob/master/whitelist_dsl.rb)
209
+ 2. [Blacklist](https://github.com/sunitparekh/data-anonymization/blob/master/blacklist_dsl.rb)
210
+
211
+
212
+ ## Logging
213
+
214
+ `Progress Logger` provides progress of anonymization execution table by table.
215
+
216
+ ```ruby
217
+ DataAnon::Utils::Logging.progress_logger.level = Logger::WARN
218
+ ```
219
+
220
+ `Logger` provides debug level messages including database queries of active record.
221
+
222
+ ```ruby
223
+ DataAnon::Utils::Logging.logger.level = Logger::INFO
224
+ ```
225
+
226
+ ## Changelog
227
+
228
+
229
+ ### 0.1.1 (August 13, 2012)
230
+
231
+ 1. Initial release
232
+
233
+ ## What's planned ahead?
234
+
235
+ 1. Run anonymization in parallel threads (performance enhancements)
236
+ 2. MongoDB anonymization support (NoSQL document based database support)
237
+
238
+ ## Want to contribute?
239
+
240
+ 1. Fork it
241
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
242
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
243
+ 4. Push to the branch (`git push origin my-new-feature`)
244
+ 5. Create new Pull Request
245
+
246
+ ## License
247
+
248
+ [MIT License](https://github.com/sunitparekh/data-anonymization/blob/master/LICENSE.txt)
249
+
250
+ ## Credits
251
+
252
+ - [ThoughtWorks Inc](http://www.thoughtworks.com), for allowing us to build this tool and make it open source.
253
+ - [Birinder](https://twitter.com/birinder_) and [Panda](https://twitter.com/sarbashrestha) for reviewing the documentation.
254
+
255
+
256
+
@@ -0,0 +1,9 @@
1
+ require 'bundler'
2
+ require 'rspec/core/rake_task'
3
+ require "tasks/rake_tasks"
4
+
5
+ Bundler::GemHelper.install_tasks
6
+ RSpec::Core::RakeTask.new(:spec)
7
+ DataAnonymization::RakeTasks.new
8
+
9
+ task :default => :spec
@@ -0,0 +1,19 @@
1
+ system "bundle exec ruby whitelist_dsl.rb"
2
+
3
+ require 'data-anonymization'
4
+
5
+ FS = DataAnon::Strategy::Field
6
+
7
+ DataAnon::Utils::Logging.logger.level = Logger::INFO
8
+
9
+ database 'Chinook' do
10
+ strategy DataAnon::Strategy::Blacklist
11
+ source_db :adapter => 'sqlite3', :database => 'sample-data/chinook-empty.sqlite'
12
+
13
+ table 'MediaType' do
14
+ primary_key 'MediaTypeId'
15
+ anonymize('Name').using FS::StringTemplate.new('Media Type 100#{row_number}')
16
+ end
17
+
18
+ end
19
+
@@ -0,0 +1,36 @@
1
+ require 'data-anonymization'
2
+
3
+ FS = DataAnon::Strategy::Field
4
+
5
+ DataAnon::Utils::Logging.logger.level = Logger::INFO
6
+
7
+ # DSL for NOSQL database, NOT IMPLEMENTED YET
8
+
9
+ database 'Chinook' do
10
+ strategy DataAnon::Strategy::NoSQL::Blacklist
11
+ source_db :adapter => 'sqlite3', :database => 'sample-data/chinook-empty.sqlite'
12
+
13
+ document 'User' do
14
+ primary_key 'MediaTypeId'
15
+ whitelist
16
+
17
+ node 'address' do
18
+ whitelist 'addrssline1', 'addressline2'
19
+ anonymize
20
+ anonymize 'pincode'
21
+
22
+ node 'contacts' do
23
+ anonymize('phone-number').using FS::RandomPhoneNumber.new
24
+ end
25
+
26
+ anonymize 'contacts/phone-number'
27
+
28
+ end
29
+
30
+ anonymize 'address/contacts/phone-number'
31
+
32
+ anonymize('Name').using FS::StringTemplate.new('Media Type 100#{row_number}')
33
+ end
34
+
35
+ end
36
+
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "data-anonymization"
8
+ gem.version = DataAnonymization::VERSION
9
+ gem.authors = ["Sunit Parekh", "Anand Agrawal", "Satyam Agarwala"]
10
+ gem.email = ["parekh.sunit@gmail.com","anand.agrawal84@gmail.com", "satyamag@gmail.com"]
11
+ gem.description = %q{Data anonymization tool for RDBMS databases}
12
+ gem.summary = %q{Tool to create anonymized production data dump to use for PREF and other TEST environments.}
13
+ gem.homepage = "http://sunitparekh.github.com/data-anonymization"
14
+
15
+ gem.files = `git ls-files`.split($/).select { |f| !f.match(/^sample-data/) }
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+
20
+ gem.add_dependency('activerecord', '~> 3.2.8')
21
+ gem.add_dependency('activesupport', '~> 3.2.8')
22
+ end
@@ -0,0 +1,36 @@
1
+ module DataAnon
2
+ module Core
3
+
4
+ class Database
5
+
6
+ def initialize name
7
+ @name = name
8
+ @strategy = DataAnon::Strategy::Whitelist
9
+ @user_defaults = {}
10
+ end
11
+
12
+ def strategy strategy
13
+ @strategy = strategy
14
+ end
15
+
16
+ def source_db connection_spec
17
+ DataAnon::Utils::SourceDatabase.establish_connection connection_spec
18
+ end
19
+
20
+ def destination_db connection_spec
21
+ DataAnon::Utils::DestinationDatabase.establish_connection connection_spec
22
+ end
23
+
24
+ def default_field_strategies default_strategies
25
+ @user_defaults = default_strategies
26
+ end
27
+
28
+ def table (name, &block)
29
+ @strategy.new(name, @user_defaults).process_fields(&block).process
30
+ end
31
+
32
+
33
+ end
34
+
35
+ end
36
+ end