data-anonymization 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. data/.gitignore +2 -1
  2. data/.rvmrc +1 -1
  3. data/.travis.yml +2 -0
  4. data/Gemfile +2 -0
  5. data/README.md +295 -258
  6. data/bin/datanon +57 -0
  7. data/data-anonymization.gemspec +2 -1
  8. data/examples/blacklist_dsl.rb +42 -0
  9. data/examples/mongodb_blacklist_dsl.rb +38 -0
  10. data/examples/mongodb_whitelist_dsl.rb +44 -0
  11. data/examples/whitelist_dsl.rb +63 -0
  12. data/lib/core/database.rb +21 -3
  13. data/lib/core/field.rb +5 -2
  14. data/lib/core/fields_missing_strategy.rb +30 -0
  15. data/lib/core/table_errors.rb +32 -0
  16. data/lib/data-anonymization.rb +11 -0
  17. data/lib/parallel/table.rb +8 -1
  18. data/lib/strategy/base.rb +35 -14
  19. data/lib/strategy/blacklist.rb +1 -1
  20. data/lib/strategy/field/anonymize_array.rb +28 -0
  21. data/lib/strategy/field/contact/random_address.rb +12 -0
  22. data/lib/strategy/field/contact/random_city.rb +12 -0
  23. data/lib/strategy/field/contact/random_phone_number.rb +4 -0
  24. data/lib/strategy/field/contact/random_province.rb +12 -0
  25. data/lib/strategy/field/contact/random_zipcode.rb +12 -0
  26. data/lib/strategy/field/datetime/anonymize_date.rb +15 -0
  27. data/lib/strategy/field/datetime/anonymize_datetime.rb +19 -0
  28. data/lib/strategy/field/datetime/anonymize_time.rb +19 -0
  29. data/lib/strategy/field/datetime/date_delta.rb +10 -0
  30. data/lib/strategy/field/datetime/date_time_delta.rb +9 -0
  31. data/lib/strategy/field/datetime/time_delta.rb +8 -0
  32. data/lib/strategy/field/default_anon.rb +4 -1
  33. data/lib/strategy/field/email/gmail_template.rb +8 -0
  34. data/lib/strategy/field/email/random_email.rb +7 -0
  35. data/lib/strategy/field/email/random_mailinator_email.rb +5 -0
  36. data/lib/strategy/field/fields.rb +4 -0
  37. data/lib/strategy/field/name/random_first_name.rb +10 -0
  38. data/lib/strategy/field/name/random_full_name.rb +10 -2
  39. data/lib/strategy/field/name/random_last_name.rb +9 -0
  40. data/lib/strategy/field/name/random_user_name.rb +5 -0
  41. data/lib/strategy/field/number/random_big_decimal_delta.rb +6 -0
  42. data/lib/strategy/field/number/random_float.rb +4 -0
  43. data/lib/strategy/field/number/random_float_delta.rb +6 -0
  44. data/lib/strategy/field/number/random_integer.rb +4 -0
  45. data/lib/strategy/field/number/random_integer_delta.rb +6 -0
  46. data/lib/strategy/field/string/formatted_string_numbers.rb +10 -6
  47. data/lib/strategy/field/string/lorem_ipsum.rb +9 -0
  48. data/lib/strategy/field/string/random_formatted_string.rb +39 -0
  49. data/lib/strategy/field/string/random_string.rb +6 -0
  50. data/lib/strategy/field/string/random_url.rb +7 -1
  51. data/lib/strategy/field/string/select_from_database.rb +7 -5
  52. data/lib/strategy/field/string/select_from_file.rb +7 -0
  53. data/lib/strategy/field/string/select_from_list.rb +8 -0
  54. data/lib/strategy/field/string/string_template.rb +11 -0
  55. data/lib/strategy/mongodb/anonymize_field.rb +44 -0
  56. data/lib/strategy/mongodb/blacklist.rb +29 -0
  57. data/lib/strategy/mongodb/whitelist.rb +62 -0
  58. data/lib/strategy/strategies.rb +10 -1
  59. data/lib/strategy/whitelist.rb +7 -2
  60. data/lib/thor/helpers/mongodb_dsl_generator.rb +66 -0
  61. data/lib/thor/helpers/rdbms_dsl_generator.rb +36 -0
  62. data/lib/thor/templates/mongodb_whitelist_template.erb +15 -0
  63. data/lib/thor/templates/whitelist_template.erb +21 -0
  64. data/lib/utils/database.rb +4 -0
  65. data/lib/utils/parallel_progress_bar.rb +24 -0
  66. data/lib/utils/progress_bar.rb +34 -22
  67. data/lib/utils/random_string.rb +3 -2
  68. data/lib/utils/random_string_chars_only.rb +3 -5
  69. data/lib/utils/template_helper.rb +44 -0
  70. data/lib/version.rb +1 -1
  71. data/spec/acceptance/mongodb_blacklist_spec.rb +75 -0
  72. data/spec/acceptance/mongodb_whitelist_spec.rb +107 -0
  73. data/spec/core/fields_missing_strategy_spec.rb +26 -0
  74. data/spec/strategy/field/name/random_first_name_spec.rb +1 -1
  75. data/spec/strategy/field/name/random_full_name_spec.rb +12 -7
  76. data/spec/strategy/field/name/random_last_name_spec.rb +1 -1
  77. data/spec/strategy/field/string/random_formatted_string_spec.rb +39 -0
  78. data/spec/strategy/field/string/select_from_file_spec.rb +21 -0
  79. data/spec/strategy/mongodb/anonymize_field_spec.rb +52 -0
  80. data/spec/utils/random_float_spec.rb +12 -0
  81. data/spec/utils/random_string_char_only_spec.rb +12 -0
  82. data/spec/utils/template_helper_spec.rb +14 -0
  83. metadata +56 -6
  84. data/blacklist_dsl.rb +0 -17
  85. data/blacklist_nosql_dsl.rb +0 -36
  86. data/whitelist_dsl.rb +0 -42
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Shifts the current value by a random amount within plus or minus the given delta. Default delta is 10.0
6
+ #
7
+ # !!!ruby
8
+ # anonymize('points').using FieldStrategy::RandomFloatDelta.new(2.5)
9
+
4
10
  class RandomFloatDelta
5
11
 
6
12
  def initialize delta = 10.0
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates a random integer between the two given numbers. Default range is 0 to 100.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Age').using FieldStrategy::RandomInteger.new(18,70)
5
9
 
6
10
  class RandomInteger
7
11
 
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Shifts the current value by a random amount within plus or minus the given delta. Default delta is 10
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Age').using FieldStrategy::RandomIntegerDelta.new(2)
9
+
4
10
  class RandomIntegerDelta
5
11
 
6
12
  def initialize delta = 10
@@ -2,21 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Keeps the format of the string unchanged while replacing each digit with a random digit.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('CreditCardNumber').using FieldStrategy::FormattedStringNumber.new
5
9
 
6
10
  class FormattedStringNumber
7
11
 
8
12
  def anonymize field
9
- @original_phone_number = field.value
10
- @anonymized_phone_number = ""
11
- @original_phone_number.each_char do |char|
13
+ @original_string = field.value
14
+ @anonymized_string = ""
15
+ @original_string.each_char do |char|
12
16
  if /\d/.match(char).nil?
13
- @anonymized_phone_number += char
17
+ @anonymized_string += char
14
18
  else
15
- @anonymized_phone_number += DataAnon::Utils::RandomInt.generate(0,9).to_s
19
+ @anonymized_string += DataAnon::Utils::RandomInt.generate(0,9).to_s
16
20
  end
17
21
  end
18
22
 
19
- @anonymized_phone_number
23
+ @anonymized_string
20
24
  end
21
25
 
22
26
  end
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Default anonymization strategy for `string` content. Uses the default 'Lorem ipsum...' text, or the text supplied to the strategy, to generate a string of the same length.
6
+ # !!!ruby
7
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new
8
+ #
9
+ # !!!ruby
10
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new("very large string....")
11
+ #
12
+ # !!!ruby
13
+ # anonymize('UserName').using FieldStrategy::LoremIpsum.new(File.read('my_file.txt'))
5
14
 
6
15
  class LoremIpsum
7
16
 
@@ -0,0 +1,39 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ # Keeps the format unchanged while replacing each digit with a random digit and each ASCII letter with a random letter of the same case.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('PhoneNumber').using FieldStrategy::RandomFormattedString.new
9
+ # anonymize('Email').using FieldStrategy::RandomFormattedString.new
10
+
11
+ class RandomFormattedString
12
+
13
+ SMALL_CHARS = "abcdefghjkmnpqrstuvwxyz"
14
+ CAPS_CHARS = "ABCDEFGHJKLMNPQRSTUVWXYZ"
15
+
16
+ def anonymize field
17
+ @original_string = field.value
18
+ @anonymized_string = ""
19
+ @original_string.each_char do |char|
20
+ if /\d/.match(char)
21
+ @anonymized_string += DataAnon::Utils::RandomInt.generate(0, 9).to_s
22
+ elsif /[a-z]/.match(char)
23
+ @anonymized_string += SMALL_CHARS[rand(SMALL_CHARS.length)]
24
+ elsif /[A-Z]/.match(char)
25
+ @anonymized_string += CAPS_CHARS[rand(CAPS_CHARS.length)]
26
+ else
27
+ @anonymized_string += char
28
+ end
29
+ end
30
+
31
+ @anonymized_string
32
+ end
33
+
34
+ end
35
+
36
+
37
+ end
38
+ end
39
+ end
@@ -1,6 +1,12 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Generates random string of same length.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('UserName').using FieldStrategy::RandomString.new
9
+
4
10
  class RandomString
5
11
 
6
12
  def anonymize field
@@ -1,7 +1,13 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
- class RandomUrl
4
+
5
+ # Generates a randomized URL while maintaining the structure of the original url
6
+ #
7
+ # !!!ruby
8
+ # anonymize('fb_profile').using FieldStrategy::RandomUrl.new
9
+
10
+ class RandomUrl
5
11
 
6
12
  def anonymize field
7
13
 
@@ -2,7 +2,13 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
- class SelectFromDatabase
5
+ # Similar to SelectFromList, with the difference that the list of values is collected from the database table using a distinct column query.
6
+ #
7
+ # !!!ruby
8
+ # # values are collected using `select distinct state from customers` query
9
+ # anonymize('State').using FieldStrategy::SelectFromDatabase.new('customers','state', connection_spec)
10
+
11
+ class SelectFromDatabase < SelectFromFile
6
12
  include Utils::Logging
7
13
 
8
14
  def initialize table_name, field_name, connection_spec
@@ -13,10 +19,6 @@ module DataAnon
13
19
 
14
20
  end
15
21
 
16
- def anonymize field
17
- @values[DataAnon::Utils::RandomInt.generate(0,(@values.length - 1))]
18
- end
19
-
20
22
  end
21
23
 
22
24
 
@@ -2,6 +2,12 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to SelectFromList; the only difference is that the list of values is read from a file. A typical use is anonymizing a states field.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('State').using FieldStrategy::SelectFromFile.new('states.txt')
9
+ #
10
+
5
11
  class SelectFromFile
6
12
 
7
13
  def initialize file_path
@@ -9,6 +15,7 @@ module DataAnon
9
15
  end
10
16
 
11
17
  def anonymize field
18
+ return @values.sample(field.value.length) if field.value.kind_of? Array
12
19
  @values.sample
13
20
  end
14
21
 
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Select randomly one of the values specified.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('State').using FieldStrategy::SelectFromList.new(['New York','Georgia',...])
9
+ #
10
+ # !!!ruby
11
+ # anonymize('NameTitle').using FieldStrategy::SelectFromList.new(['Mr','Mrs','Dr',...])
12
+ #
5
13
 
6
14
  class SelectFromList < SelectFromFile
7
15
 
@@ -2,6 +2,17 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Simple string evaluation within [DataAnon::Core::Field](#dataanon-core-field) context. Can be used for email, username anonymization.
6
+ # Make sure to put the string in 'single quotes'; in a double-quoted string Ruby interpolates #{...} immediately, before the strategy receives it.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('UserName').using FieldStrategy::StringTemplate.new('user#{row_number}')
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Email').using FieldStrategy::StringTemplate.new('valid.address+#{row_number}@gmail.com')
13
+ #
14
+ # !!!ruby
15
+ # anonymize('Email').using FieldStrategy::StringTemplate.new('useremail#{row_number}@mailinator.com')
5
16
 
6
17
  class StringTemplate
7
18
 
@@ -0,0 +1,44 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module MongoDB
4
+ class AnonymizeField
5
+
6
+ def initialize field, field_strategy, anonymization_strategy
7
+ @field = field
8
+ @field_strategy = field_strategy
9
+ @anonymization_strategy = anonymization_strategy
10
+ end
11
+
12
+ def anonymize
13
+ if sub_document?
14
+ @anonymization_strategy.anonymize_document(@field.value, @field.row_number, @field_strategy)
15
+ elsif sub_documents?
16
+ anonymize_sub_documents
17
+ else
18
+ anonymize_field
19
+ end
20
+ end
21
+
22
+ def anonymize_sub_documents
23
+ @field.value.collect { |value| @anonymization_strategy.anonymize_document(value, @field.row_number, @field_strategy) }
24
+ end
25
+
26
+ def anonymize_field
27
+ @field_strategy = @field_strategy || @anonymization_strategy.default_strategy(@field.name)
28
+ raise "Improper fields strategy defined for '#{@field.name}' within document \n #{@field.ar_record}" unless @field_strategy.respond_to?(:anonymize)
29
+ @field_strategy.anonymize(@field)
30
+ end
31
+
32
+ def sub_documents?
33
+ @field.value.kind_of?(Array) && @field.value[0].kind_of?(Hash)
34
+ end
35
+
36
+ def sub_document?
37
+ @field.value.kind_of? Hash
38
+ end
39
+
40
+
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,29 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module MongoDB
4
+ class Blacklist < DataAnon::Strategy::MongoDB::Whitelist
5
+
6
+ def self.whitelist?
7
+ false
8
+ end
9
+
10
+ def process_record index, document
11
+ source_collection.save anonymize_document(document, index, @fields)
12
+ end
13
+
14
+ def anonymize_document document, index, field_strategies = {}
15
+ field_strategies.each do |field_name, field_strategy|
16
+ field_value = document[field_name]
17
+ unless field_value.nil?
18
+ field = DataAnon::Core::Field.new(field_name, field_value, index, document, @name)
19
+ document[field.name] = AnonymizeField.new(field, field_strategy, self).anonymize
20
+ end
21
+ end
22
+ document
23
+ end
24
+
25
+ end
26
+
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,62 @@
1
+ require 'mongo'
2
+
3
+ class Mongo::Collection
4
+ alias :all :find
5
+ end
6
+
7
+ module DataAnon
8
+ module Strategy
9
+ module MongoDB
10
+ class Whitelist < DataAnon::Strategy::Base
11
+
12
+ def self.whitelist?
13
+ true
14
+ end
15
+
16
+ def collection field, &block
17
+ whitelist = self.class.new @source_database, @destination_database, @name, @user_strategies
18
+ whitelist.process_fields &block
19
+ @fields[field] = whitelist.fields
20
+ end
21
+
22
+ alias :document :collection
23
+
24
+ def mongo_collection(database)
25
+ options = database[:options] || {}
26
+ Mongo::Connection.from_uri(database[:mongodb_uri], options)[database[:database]][@name]
27
+ end
28
+
29
+ def dest_collection
30
+ database = @destination_database
31
+ @dest_collection ||= mongo_collection(database)
32
+ end
33
+
34
+ def source_collection
35
+ @source_collection ||= mongo_collection(@source_database)
36
+ end
37
+
38
+ alias :source_table :source_collection
39
+ alias :dest_table :dest_collection
40
+
41
+ def process_record index, document
42
+ dest_collection.insert anonymize_document(document, index, @fields)
43
+ end
44
+
45
+ def anonymize_document document, index, field_strategies = {}
46
+ anonymized_document = {}
47
+ document.each do |field_name, field_value|
48
+ field_strategy = field_strategies[field_name.downcase] if field_strategies.kind_of?(Hash)
49
+ unless field_value.nil?
50
+ field = DataAnon::Core::Field.new(field_name, field_value, index, document, @name)
51
+ anonymized_document[field.name] = AnonymizeField.new(field, field_strategy, self).anonymize
52
+ end
53
+ end
54
+ anonymized_document
55
+ end
56
+
57
+
58
+ end
59
+
60
+ end
61
+ end
62
+ end
@@ -1,4 +1,13 @@
1
1
  require 'strategy/base'
2
2
  require 'strategy/whitelist'
3
3
  require 'strategy/blacklist'
4
- require 'strategy/field/fields'
4
+ require 'strategy/field/fields'
5
+
6
+ begin
7
+ require 'mongo'
8
+ require 'strategy/mongodb/anonymize_field'
9
+ require 'strategy/mongodb/whitelist'
10
+ require 'strategy/mongodb/blacklist'
11
+ rescue LoadError
12
+ "Ignoring the mongodb specific libraries if monog driver is not specified in gem"
13
+ end
@@ -2,12 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  class Whitelist < DataAnon::Strategy::Base
4
4
 
5
+ def self.whitelist?
6
+ true
7
+ end
8
+
5
9
  def process_record(index, record)
6
10
  dest_record_map = {}
7
11
  record.attributes.each do |field_name, field_value|
8
12
  unless field_value.nil? || is_primary_key?(field_name)
9
- field = DataAnon::Core::Field.new(field_name, field_value, index, record)
10
- field_strategy = @fields[field_name.downcase] || DataAnon::Strategy::Field::DefaultAnon.new(@user_strategies)
13
+ field = DataAnon::Core::Field.new(field_name, field_value, index, record, @name)
14
+ field_strategy = @fields[field_name.downcase] || default_strategy(field_name)
11
15
  dest_record_map[field_name] = field_strategy.anonymize(field)
12
16
  end
13
17
  end
@@ -18,6 +22,7 @@ module DataAnon
18
22
  dest_record.save!
19
23
  end
20
24
 
25
+
21
26
  end
22
27
  end
23
28
  end
@@ -0,0 +1,66 @@
1
+ require 'erb'
2
+ require 'thor'
3
+
4
+ module DataAnon
5
+ module ThorHelpers
6
+ class MongoDBDSLGenerator
7
+
8
+ def self.source_root
9
+ File.dirname(__FILE__)
10
+ end
11
+
12
+ def initialize(configuration_hash, whitelist_patterns)
13
+ @mongodb_uri = DataAnon::Utils::TemplateHelper.mongo_uri(configuration_hash)
14
+ @whitelist_patterns = whitelist_patterns || [/^_/,/_at$/,/_id$/,/_type$/]
15
+ @configuration_hash = configuration_hash
16
+ @output = []
17
+ end
18
+
19
+ def generate
20
+
21
+ db = Mongo::Connection.from_uri(@mongodb_uri)[@configuration_hash[:database]]
22
+ collections = db.collections
23
+ collections.each do |collection|
24
+ unless collection.name.start_with?('system.')
25
+ depth = 2
26
+ @output << "\tcollection '#{collection.name}' do"
27
+ document = collection.find_one
28
+ process_document(depth, document)
29
+ @output << "\tend\n"
30
+ end
31
+ end
32
+
33
+ erb = ERB.new( File.new(RDBMSDSLGenerator.source_root + "/../templates/mongodb_whitelist_template.erb").read, nil, '-')
34
+ File.open('mongodb_whitelist_generated.rb', 'w') do |f|
35
+ f.write erb.result(binding)
36
+ f.close
37
+ end
38
+
39
+ end
40
+
41
+ def process_document(depth, document)
42
+ return if document.nil?
43
+ document.each do |key, value|
44
+ @output << ("\t"*depth)
45
+ if value.kind_of?(Hash)
46
+ end_statement = @output[-1]+"end"
47
+ @output[-1] << "document '#{key}' do"
48
+ process_document depth+1, value
49
+ @output << end_statement
50
+ elsif value.kind_of?(Array) && value[0].kind_of?(Hash)
51
+ end_statement = @output[-1]+"end"
52
+ @output[-1] << "collection '#{key}' do"
53
+ process_document depth+1, value[0]
54
+ @output << end_statement
55
+ elsif @whitelist_patterns.collect { |pattern| key.match(pattern) }.compact.length > 0
56
+ @output[-1] << "whitelist '#{key}'"
57
+ elsif
58
+ @output[-1] << "anonymize '#{key}'"
59
+ end
60
+ end
61
+ end
62
+
63
+ end
64
+ end
65
+ end
66
+