data-anonymization 0.3.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/.gitignore +2 -1
  2. data/.rvmrc +1 -1
  3. data/.travis.yml +2 -0
  4. data/Gemfile +2 -0
  5. data/README.md +295 -258
  6. data/bin/datanon +57 -0
  7. data/data-anonymization.gemspec +2 -1
  8. data/examples/blacklist_dsl.rb +42 -0
  9. data/examples/mongodb_blacklist_dsl.rb +38 -0
  10. data/examples/mongodb_whitelist_dsl.rb +44 -0
  11. data/examples/whitelist_dsl.rb +63 -0
  12. data/lib/core/database.rb +21 -3
  13. data/lib/core/field.rb +5 -2
  14. data/lib/core/fields_missing_strategy.rb +30 -0
  15. data/lib/core/table_errors.rb +32 -0
  16. data/lib/data-anonymization.rb +11 -0
  17. data/lib/parallel/table.rb +8 -1
  18. data/lib/strategy/base.rb +35 -14
  19. data/lib/strategy/blacklist.rb +1 -1
  20. data/lib/strategy/field/anonymize_array.rb +28 -0
  21. data/lib/strategy/field/contact/random_address.rb +12 -0
  22. data/lib/strategy/field/contact/random_city.rb +12 -0
  23. data/lib/strategy/field/contact/random_phone_number.rb +4 -0
  24. data/lib/strategy/field/contact/random_province.rb +12 -0
  25. data/lib/strategy/field/contact/random_zipcode.rb +12 -0
  26. data/lib/strategy/field/datetime/anonymize_date.rb +15 -0
  27. data/lib/strategy/field/datetime/anonymize_datetime.rb +19 -0
  28. data/lib/strategy/field/datetime/anonymize_time.rb +19 -0
  29. data/lib/strategy/field/datetime/date_delta.rb +10 -0
  30. data/lib/strategy/field/datetime/date_time_delta.rb +9 -0
  31. data/lib/strategy/field/datetime/time_delta.rb +8 -0
  32. data/lib/strategy/field/default_anon.rb +4 -1
  33. data/lib/strategy/field/email/gmail_template.rb +8 -0
  34. data/lib/strategy/field/email/random_email.rb +7 -0
  35. data/lib/strategy/field/email/random_mailinator_email.rb +5 -0
  36. data/lib/strategy/field/fields.rb +4 -0
  37. data/lib/strategy/field/name/random_first_name.rb +10 -0
  38. data/lib/strategy/field/name/random_full_name.rb +10 -2
  39. data/lib/strategy/field/name/random_last_name.rb +9 -0
  40. data/lib/strategy/field/name/random_user_name.rb +5 -0
  41. data/lib/strategy/field/number/random_big_decimal_delta.rb +6 -0
  42. data/lib/strategy/field/number/random_float.rb +4 -0
  43. data/lib/strategy/field/number/random_float_delta.rb +6 -0
  44. data/lib/strategy/field/number/random_integer.rb +4 -0
  45. data/lib/strategy/field/number/random_integer_delta.rb +6 -0
  46. data/lib/strategy/field/string/formatted_string_numbers.rb +10 -6
  47. data/lib/strategy/field/string/lorem_ipsum.rb +9 -0
  48. data/lib/strategy/field/string/random_formatted_string.rb +39 -0
  49. data/lib/strategy/field/string/random_string.rb +6 -0
  50. data/lib/strategy/field/string/random_url.rb +7 -1
  51. data/lib/strategy/field/string/select_from_database.rb +7 -5
  52. data/lib/strategy/field/string/select_from_file.rb +7 -0
  53. data/lib/strategy/field/string/select_from_list.rb +8 -0
  54. data/lib/strategy/field/string/string_template.rb +11 -0
  55. data/lib/strategy/mongodb/anonymize_field.rb +44 -0
  56. data/lib/strategy/mongodb/blacklist.rb +29 -0
  57. data/lib/strategy/mongodb/whitelist.rb +62 -0
  58. data/lib/strategy/strategies.rb +10 -1
  59. data/lib/strategy/whitelist.rb +7 -2
  60. data/lib/thor/helpers/mongodb_dsl_generator.rb +66 -0
  61. data/lib/thor/helpers/rdbms_dsl_generator.rb +36 -0
  62. data/lib/thor/templates/mongodb_whitelist_template.erb +15 -0
  63. data/lib/thor/templates/whitelist_template.erb +21 -0
  64. data/lib/utils/database.rb +4 -0
  65. data/lib/utils/parallel_progress_bar.rb +24 -0
  66. data/lib/utils/progress_bar.rb +34 -22
  67. data/lib/utils/random_string.rb +3 -2
  68. data/lib/utils/random_string_chars_only.rb +3 -5
  69. data/lib/utils/template_helper.rb +44 -0
  70. data/lib/version.rb +1 -1
  71. data/spec/acceptance/mongodb_blacklist_spec.rb +75 -0
  72. data/spec/acceptance/mongodb_whitelist_spec.rb +107 -0
  73. data/spec/core/fields_missing_strategy_spec.rb +26 -0
  74. data/spec/strategy/field/name/random_first_name_spec.rb +1 -1
  75. data/spec/strategy/field/name/random_full_name_spec.rb +12 -7
  76. data/spec/strategy/field/name/random_last_name_spec.rb +1 -1
  77. data/spec/strategy/field/string/random_formatted_string_spec.rb +39 -0
  78. data/spec/strategy/field/string/select_from_file_spec.rb +21 -0
  79. data/spec/strategy/mongodb/anonymize_field_spec.rb +52 -0
  80. data/spec/utils/random_float_spec.rb +12 -0
  81. data/spec/utils/random_string_char_only_spec.rb +12 -0
  82. data/spec/utils/template_helper_spec.rb +14 -0
  83. metadata +56 -6
  84. data/blacklist_dsl.rb +0 -17
  85. data/blacklist_nosql_dsl.rb +0 -36
  86. data/whitelist_dsl.rb +0 -42
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Shifts the current value randomly within given delta + and -. Default is 10.0
6
+ #
7
+ # !!!ruby
8
+ # anonymize('points').using FieldStrategy::RandomFloatDelta.new(2.5)
9
+
4
10
  class RandomFloatDelta
5
11
 
6
12
  def initialize delta = 10.0
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random integer number between given two numbers. Default range is 0 to 100.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Age').using FieldStrategy::RandomInteger.new(18,70)
5
9
 
6
10
  class RandomInteger
7
11
 
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Shifts the current value randomly within given delta + and -. Default is 10
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Age').using FieldStrategy::RandomIntegerDelta.new(2)
9
+
4
10
  class RandomIntegerDelta
5
11
 
6
12
  def initialize delta = 10
@@ -2,21 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Keeps the format of the string intact while replacing each digit with a random digit.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('CreditCardNumber').using FieldStrategy::FormattedStringNumber.new
5
9
 
6
10
  class FormattedStringNumber
7
11
 
8
12
  def anonymize field
9
- @original_phone_number = field.value
10
- @anonymized_phone_number = ""
11
- @original_phone_number.each_char do |char|
13
+ @original_string = field.value
14
+ @anonymized_string = ""
15
+ @original_string.each_char do |char|
12
16
  if /\d/.match(char).nil?
13
- @anonymized_phone_number += char
17
+ @anonymized_string += char
14
18
  else
15
- @anonymized_phone_number += DataAnon::Utils::RandomInt.generate(0,9).to_s
19
+ @anonymized_string += DataAnon::Utils::RandomInt.generate(0,9).to_s
16
20
  end
17
21
  end
18
22
 
19
- @anonymized_phone_number
23
+ @anonymized_string
20
24
  end
21
25
 
22
26
  end
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Default anonymization strategy for `string` content. Uses default 'Lorem ipsum...' text or text supplied in strategy to generate same length string.
6
+ # !!!ruby
7
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new
8
+ #
9
+ # !!!ruby
10
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new("very large string....")
11
+ #
12
+ # !!!ruby
13
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new(File.read('my_file.txt'))
5
14
 
6
15
  class LoremIpsum
7
16
 
@@ -0,0 +1,39 @@
module DataAnon
  module Strategy
    module Field

      # Keeps the format of the value intact: every digit is replaced with a random
      # digit and every ASCII letter with a random letter of the same case. All other
      # characters (spaces, punctuation, '@', '-', ...) are copied through unchanged.
      #
      #    !!!ruby
      #    anonymize('PhoneNumber').using FieldStrategy::RandomFormattedString.new
      #    anonymize('Email').using FieldStrategy::RandomFormattedString.new

      class RandomFormattedString

        # Replacement alphabets. The lower-case set has no 'i', 'l' or 'o' and the
        # upper-case set has no 'I' or 'O'.
        # NOTE(review): presumably omitted to avoid look-alike characters - confirm.
        SMALL_CHARS = "abcdefghjkmnpqrstuvwxyz".freeze
        CAPS_CHARS = "ABCDEFGHJKLMNPQRSTUVWXYZ".freeze

        # Returns a new string with the same length and layout as +field.value+.
        #
        # field - object whose +value+ responds to +each_char+.
        #
        # No per-call state is stored on the strategy, so one instance can be
        # reused for any number of fields.
        def anonymize field
          field.value.each_char.map { |char| random_replacement_for(char) }.join
        end

        private

        # Picks the replacement for a single character according to its class.
        def random_replacement_for char
          case char
          when /\d/ then DataAnon::Utils::RandomInt.generate(0, 9).to_s
          when /[a-z]/ then SMALL_CHARS[rand(SMALL_CHARS.length)]
          when /[A-Z]/ then CAPS_CHARS[rand(CAPS_CHARS.length)]
          else char
          end
        end

      end

    end
  end
end
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Generates random string of same length.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('UserName').using FieldStrategy::RandomString.new
9
+
4
10
  class RandomString
5
11
 
6
12
  def anonymize field
@@ -1,7 +1,13 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
- class RandomUrl
4
+
5
+ # Generates a randomized URL while maintaining the structure of the original url
6
+ #
7
+ # !!!ruby
8
+ # anonymize('fb_profile').using FieldStrategy::RandomUrl.new
9
+
10
+ class RandomUrl
5
11
 
6
12
  def anonymize field
7
13
 
@@ -2,7 +2,13 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
- class SelectFromDatabase
5
+ # Similar to SelectFromList; the difference is that the list of values is collected from a database table using a distinct column query.
6
+ #
7
+ # !!!ruby
8
+ # # values are collected using `select distinct state from customers` query
9
+ # anonymize('State').using FieldStrategy::SelectFromDatabase.new('customers','state', connection_spec)
10
+
11
+ class SelectFromDatabase < SelectFromFile
6
12
  include Utils::Logging
7
13
 
8
14
  def initialize table_name, field_name, connection_spec
@@ -13,10 +19,6 @@ module DataAnon
13
19
 
14
20
  end
15
21
 
16
- def anonymize field
17
- @values[DataAnon::Utils::RandomInt.generate(0,(@values.length - 1))]
18
- end
19
-
20
22
  end
21
23
 
22
24
 
@@ -2,6 +2,12 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to SelectFromList; the only difference is that the list of values is read from a file. A classic use is anonymizing a states field.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('State').using FieldStrategy::SelectFromFile.new('states.txt')
9
+ #
10
+
5
11
  class SelectFromFile
6
12
 
7
13
  def initialize file_path
@@ -9,6 +15,7 @@ module DataAnon
9
15
  end
10
16
 
11
17
  def anonymize field
18
+ return @values.sample(field.value.length) if field.value.kind_of? Array
12
19
  @values.sample
13
20
  end
14
21
 
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Select randomly one of the values specified.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('State').using FieldStrategy::SelectFromList.new(['New York','Georgia',...])
9
+ #
10
+ # !!!ruby
11
+ # anonymize('NameTitle').using FieldStrategy::SelectFromList.new(['Mr','Mrs','Dr',...])
12
+ #
5
13
 
6
14
  class SelectFromList < SelectFromFile
7
15
 
@@ -2,6 +2,17 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Simple string evaluation within [DataAnon::Core::Field](#dataanon-core-field) context. Can be used for email, username anonymization.
6
+ # Make sure to put the template in single quotes; in double quotes Ruby interpolates it immediately instead of leaving it for the strategy to evaluate.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('UserName').using FieldStrategy::StringTemplate.new('user#{row_number}')
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Email').using FieldStrategy::StringTemplate.new('valid.address+#{row_number}@gmail.com')
13
+ #
14
+ # !!!ruby
15
+ # anonymize('Email').using FieldStrategy::StringTemplate.new('useremail#{row_number}@mailinator.com')
5
16
 
6
17
  class StringTemplate
7
18
 
@@ -0,0 +1,44 @@
module DataAnon
  module Strategy
    module MongoDB

      # Anonymizes one field of a MongoDB document, dispatching on the shape of
      # the field's value:
      #
      # * Hash                               - treated as an embedded document
      # * Array whose first element is a Hash - treated as a list of embedded documents
      # * anything else                       - treated as a leaf value
      class AnonymizeField

        # field                  - field object; this class reads +value+, +row_number+,
        #                          +name+ and +ar_record+ from it.
        # field_strategy         - for a leaf value: an object responding to
        #                          +anonymize(field)+, or nil to use the default strategy.
        #                          For embedded documents it is handed on unchanged to
        #                          +anonymize_document+.
        # anonymization_strategy - object responding to
        #                          +anonymize_document(document, index, strategies)+ and
        #                          +default_strategy(field_name)+.
        def initialize field, field_strategy, anonymization_strategy
          @field = field
          @field_strategy = field_strategy
          @anonymization_strategy = anonymization_strategy
        end

        # Returns the anonymized replacement for the field's value.
        def anonymize
          if sub_document?
            @anonymization_strategy.anonymize_document(@field.value, @field.row_number, @field_strategy)
          elsif sub_documents?
            anonymize_sub_documents
          else
            anonymize_field
          end
        end

        # Anonymizes every element of an array of embedded documents and returns
        # the results as a new array.
        def anonymize_sub_documents
          @field.value.map do |sub_document|
            @anonymization_strategy.anonymize_document(sub_document, @field.row_number, @field_strategy)
          end
        end

        # Anonymizes a leaf value, falling back to the default strategy for the
        # field when none was supplied.
        #
        # Raises RuntimeError when the resolved strategy does not respond to +anonymize+.
        def anonymize_field
          @field_strategy ||= @anonymization_strategy.default_strategy(@field.name)
          raise "Improper fields strategy defined for '#{@field.name}' within document \n #{@field.ar_record}" unless @field_strategy.respond_to?(:anonymize)
          @field_strategy.anonymize(@field)
        end

        # True when the value is an Array whose first element is a Hash.
        # Only the first element is inspected.
        def sub_documents?
          @field.value.kind_of?(Array) && @field.value.first.kind_of?(Hash)
        end

        # True when the value is a Hash.
        def sub_document?
          @field.value.kind_of? Hash
        end

      end

    end
  end
end
@@ -0,0 +1,29 @@
module DataAnon
  module Strategy
    module MongoDB

      # Blacklist strategy for MongoDB. Only the fields that have a configured
      # strategy are anonymized; the document is changed in place and saved back
      # through +source_collection+.
      class Blacklist < DataAnon::Strategy::MongoDB::Whitelist

        def self.whitelist?
          false
        end

        # Anonymizes +document+ with the configured field strategies and saves
        # the result to the source collection.
        def process_record index, document
          anonymized = anonymize_document(document, index, @fields)
          source_collection.save anonymized
        end

        # Replaces, inside +document+ itself, the value of every field named in
        # +field_strategies+. Fields that are absent or nil are left untouched.
        # Returns the same (mutated) document.
        def anonymize_document document, index, field_strategies = {}
          field_strategies.each do |field_name, field_strategy|
            current_value = document[field_name]
            next if current_value.nil?

            field = DataAnon::Core::Field.new(field_name, current_value, index, document, @name)
            document[field.name] = AnonymizeField.new(field, field_strategy, self).anonymize
          end
          document
        end

      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'mongo'
2
+
# Reopens the driver's collection class so that +all+ is available as another
# name for +find+.
# NOTE(review): presumably lets shared strategy code call +all+ on a collection
# the same way it does on a relational table - confirm against Strategy::Base.
class Mongo::Collection
  alias_method :all, :find
end
6
+
module DataAnon
  module Strategy
    module MongoDB

      # Whitelist strategy for MongoDB. Every source document is rebuilt field by
      # field into a new hash, each value passing through AnonymizeField, and the
      # result is inserted into the destination collection.
      class Whitelist < DataAnon::Strategy::Base

        def self.whitelist?
          true
        end

        # Declares the strategies for an embedded document (or a list of embedded
        # documents) stored under +field+. The block is processed by a fresh
        # strategy of the same class and its resulting field map is stored in
        # +@fields[field]+.
        # NOTE(review): +process_fields+ and +fields+ are presumably provided by
        # Strategy::Base - confirm.
        def collection field, &block
          nested = self.class.new @source_database, @destination_database, @name, @user_strategies
          nested.process_fields(&block)
          @fields[field] = nested.fields
        end

        alias :document :collection

        # Opens a connection from +database[:mongodb_uri]+ (with the optional
        # +database[:options]+) and returns the collection named +@name+ inside
        # +database[:database]+.
        def mongo_collection(database)
          options = database[:options] || {}
          connection = Mongo::Connection.from_uri(database[:mongodb_uri], options)
          connection[database[:database]][@name]
        end

        # Destination collection, looked up once and then cached.
        def dest_collection
          @dest_collection ||= mongo_collection(@destination_database)
        end

        # Source collection, looked up once and then cached.
        def source_collection
          @source_collection ||= mongo_collection(@source_database)
        end

        alias :source_table :source_collection
        alias :dest_table :dest_collection

        # Anonymizes +document+ and inserts the result into the destination collection.
        def process_record index, document
          dest_collection.insert anonymize_document(document, index, @fields)
        end

        # Builds and returns a new hash holding the anonymized value of every
        # non-nil field of +document+; nil-valued fields are not copied.
        #
        # field_strategies - Hash of down-cased field name => strategy. When it is
        #                    not a Hash, every field is processed with a nil
        #                    strategy, which AnonymizeField resolves to the default.
        def anonymize_document document, index, field_strategies = {}
          anonymized_document = {}
          document.each do |field_name, field_value|
            next if field_value.nil?

            field_strategy = field_strategies[field_name.downcase] if field_strategies.kind_of?(Hash)
            field = DataAnon::Core::Field.new(field_name, field_value, index, document, @name)
            anonymized_document[field.name] = AnonymizeField.new(field, field_strategy, self).anonymize
          end
          anonymized_document
        end

      end

    end
  end
end
@@ -1,4 +1,13 @@
1
1
  require 'strategy/base'
2
2
  require 'strategy/whitelist'
3
3
  require 'strategy/blacklist'
4
- require 'strategy/field/fields'
4
+ require 'strategy/field/fields'
5
+
# MongoDB support is optional: the MongoDB strategies are loaded only when the
# 'mongo' driver gem can be required.
begin
  require 'mongo'
  require 'strategy/mongodb/anonymize_field'
  require 'strategy/mongodb/whitelist'
  require 'strategy/mongodb/blacklist'
rescue LoadError
  # Ignore the MongoDB-specific libraries when the mongo driver is not available.
  # Note that a LoadError raised by any of the strategy/mongodb files is
  # swallowed here as well.
end
@@ -2,12 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  class Whitelist < DataAnon::Strategy::Base
4
4
 
5
+ def self.whitelist?
6
+ true
7
+ end
8
+
5
9
  def process_record(index, record)
6
10
  dest_record_map = {}
7
11
  record.attributes.each do |field_name, field_value|
8
12
  unless field_value.nil? || is_primary_key?(field_name)
9
- field = DataAnon::Core::Field.new(field_name, field_value, index, record)
10
- field_strategy = @fields[field_name.downcase] || DataAnon::Strategy::Field::DefaultAnon.new(@user_strategies)
13
+ field = DataAnon::Core::Field.new(field_name, field_value, index, record, @name)
14
+ field_strategy = @fields[field_name.downcase] || default_strategy(field_name)
11
15
  dest_record_map[field_name] = field_strategy.anonymize(field)
12
16
  end
13
17
  end
@@ -18,6 +22,7 @@ module DataAnon
18
22
  dest_record.save!
19
23
  end
20
24
 
25
+
21
26
  end
22
27
  end
23
28
  end
@@ -0,0 +1,66 @@
1
+ require 'erb'
2
+ require 'thor'
3
+
4
+ module DataAnon
5
+ module ThorHelpers
6
+ class MongoDBDSLGenerator
7
+
8
+ def self.source_root
9
+ File.dirname(__FILE__)
10
+ end
11
+
12
+ def initialize(configuration_hash, whitelist_patterns)
13
+ @mongodb_uri = DataAnon::Utils::TemplateHelper.mongo_uri(configuration_hash)
14
+ @whitelist_patterns = whitelist_patterns || [/^_/,/_at$/,/_id$/,/_type$/]
15
+ @configuration_hash = configuration_hash
16
+ @output = []
17
+ end
18
+
19
+ def generate
20
+
21
+ db = Mongo::Connection.from_uri(@mongodb_uri)[@configuration_hash[:database]]
22
+ collections = db.collections
23
+ collections.each do |collection|
24
+ unless collection.name.start_with?('system.')
25
+ depth = 2
26
+ @output << "\tcollection '#{collection.name}' do"
27
+ document = collection.find_one
28
+ process_document(depth, document)
29
+ @output << "\tend\n"
30
+ end
31
+ end
32
+
33
+ erb = ERB.new( File.new(RDBMSDSLGenerator.source_root + "/../templates/mongodb_whitelist_template.erb").read, nil, '-')
34
+ File.open('mongodb_whitelist_generated.rb', 'w') do |f|
35
+ f.write erb.result(binding)
36
+ f.close
37
+ end
38
+
39
+ end
40
+
41
+ def process_document(depth, document)
42
+ return if document.nil?
43
+ document.each do |key, value|
44
+ @output << ("\t"*depth)
45
+ if value.kind_of?(Hash)
46
+ end_statement = @output[-1]+"end"
47
+ @output[-1] << "document '#{key}' do"
48
+ process_document depth+1, value
49
+ @output << end_statement
50
+ elsif value.kind_of?(Array) && value[0].kind_of?(Hash)
51
+ end_statement = @output[-1]+"end"
52
+ @output[-1] << "collection '#{key}' do"
53
+ process_document depth+1, value[0]
54
+ @output << end_statement
55
+ elsif @whitelist_patterns.collect { |pattern| key.match(pattern) }.compact.length > 0
56
+ @output[-1] << "whitelist '#{key}'"
57
+ elsif
58
+ @output[-1] << "anonymize '#{key}'"
59
+ end
60
+ end
61
+ end
62
+
63
+ end
64
+ end
65
+ end
66
+