data-anonymization 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. data/.gitignore +2 -1
  2. data/.rvmrc +1 -1
  3. data/.travis.yml +2 -0
  4. data/Gemfile +2 -0
  5. data/README.md +295 -258
  6. data/bin/datanon +57 -0
  7. data/data-anonymization.gemspec +2 -1
  8. data/examples/blacklist_dsl.rb +42 -0
  9. data/examples/mongodb_blacklist_dsl.rb +38 -0
  10. data/examples/mongodb_whitelist_dsl.rb +44 -0
  11. data/examples/whitelist_dsl.rb +63 -0
  12. data/lib/core/database.rb +21 -3
  13. data/lib/core/field.rb +5 -2
  14. data/lib/core/fields_missing_strategy.rb +30 -0
  15. data/lib/core/table_errors.rb +32 -0
  16. data/lib/data-anonymization.rb +11 -0
  17. data/lib/parallel/table.rb +8 -1
  18. data/lib/strategy/base.rb +35 -14
  19. data/lib/strategy/blacklist.rb +1 -1
  20. data/lib/strategy/field/anonymize_array.rb +28 -0
  21. data/lib/strategy/field/contact/random_address.rb +12 -0
  22. data/lib/strategy/field/contact/random_city.rb +12 -0
  23. data/lib/strategy/field/contact/random_phone_number.rb +4 -0
  24. data/lib/strategy/field/contact/random_province.rb +12 -0
  25. data/lib/strategy/field/contact/random_zipcode.rb +12 -0
  26. data/lib/strategy/field/datetime/anonymize_date.rb +15 -0
  27. data/lib/strategy/field/datetime/anonymize_datetime.rb +19 -0
  28. data/lib/strategy/field/datetime/anonymize_time.rb +19 -0
  29. data/lib/strategy/field/datetime/date_delta.rb +10 -0
  30. data/lib/strategy/field/datetime/date_time_delta.rb +9 -0
  31. data/lib/strategy/field/datetime/time_delta.rb +8 -0
  32. data/lib/strategy/field/default_anon.rb +4 -1
  33. data/lib/strategy/field/email/gmail_template.rb +8 -0
  34. data/lib/strategy/field/email/random_email.rb +7 -0
  35. data/lib/strategy/field/email/random_mailinator_email.rb +5 -0
  36. data/lib/strategy/field/fields.rb +4 -0
  37. data/lib/strategy/field/name/random_first_name.rb +10 -0
  38. data/lib/strategy/field/name/random_full_name.rb +10 -2
  39. data/lib/strategy/field/name/random_last_name.rb +9 -0
  40. data/lib/strategy/field/name/random_user_name.rb +5 -0
  41. data/lib/strategy/field/number/random_big_decimal_delta.rb +6 -0
  42. data/lib/strategy/field/number/random_float.rb +4 -0
  43. data/lib/strategy/field/number/random_float_delta.rb +6 -0
  44. data/lib/strategy/field/number/random_integer.rb +4 -0
  45. data/lib/strategy/field/number/random_integer_delta.rb +6 -0
  46. data/lib/strategy/field/string/formatted_string_numbers.rb +10 -6
  47. data/lib/strategy/field/string/lorem_ipsum.rb +9 -0
  48. data/lib/strategy/field/string/random_formatted_string.rb +39 -0
  49. data/lib/strategy/field/string/random_string.rb +6 -0
  50. data/lib/strategy/field/string/random_url.rb +7 -1
  51. data/lib/strategy/field/string/select_from_database.rb +7 -5
  52. data/lib/strategy/field/string/select_from_file.rb +7 -0
  53. data/lib/strategy/field/string/select_from_list.rb +8 -0
  54. data/lib/strategy/field/string/string_template.rb +11 -0
  55. data/lib/strategy/mongodb/anonymize_field.rb +44 -0
  56. data/lib/strategy/mongodb/blacklist.rb +29 -0
  57. data/lib/strategy/mongodb/whitelist.rb +62 -0
  58. data/lib/strategy/strategies.rb +10 -1
  59. data/lib/strategy/whitelist.rb +7 -2
  60. data/lib/thor/helpers/mongodb_dsl_generator.rb +66 -0
  61. data/lib/thor/helpers/rdbms_dsl_generator.rb +36 -0
  62. data/lib/thor/templates/mongodb_whitelist_template.erb +15 -0
  63. data/lib/thor/templates/whitelist_template.erb +21 -0
  64. data/lib/utils/database.rb +4 -0
  65. data/lib/utils/parallel_progress_bar.rb +24 -0
  66. data/lib/utils/progress_bar.rb +34 -22
  67. data/lib/utils/random_string.rb +3 -2
  68. data/lib/utils/random_string_chars_only.rb +3 -5
  69. data/lib/utils/template_helper.rb +44 -0
  70. data/lib/version.rb +1 -1
  71. data/spec/acceptance/mongodb_blacklist_spec.rb +75 -0
  72. data/spec/acceptance/mongodb_whitelist_spec.rb +107 -0
  73. data/spec/core/fields_missing_strategy_spec.rb +26 -0
  74. data/spec/strategy/field/name/random_first_name_spec.rb +1 -1
  75. data/spec/strategy/field/name/random_full_name_spec.rb +12 -7
  76. data/spec/strategy/field/name/random_last_name_spec.rb +1 -1
  77. data/spec/strategy/field/string/random_formatted_string_spec.rb +39 -0
  78. data/spec/strategy/field/string/select_from_file_spec.rb +21 -0
  79. data/spec/strategy/mongodb/anonymize_field_spec.rb +52 -0
  80. data/spec/utils/random_float_spec.rb +12 -0
  81. data/spec/utils/random_string_char_only_spec.rb +12 -0
  82. data/spec/utils/template_helper_spec.rb +14 -0
  83. metadata +56 -6
  84. data/blacklist_dsl.rb +0 -17
  85. data/blacklist_nosql_dsl.rb +0 -36
  86. data/whitelist_dsl.rb +0 -42
@@ -7,7 +7,7 @@ module DataAnon
7
7
  database_field_name = record.attributes.select { |k,v| k.downcase == field }.keys[0]
8
8
  field_value = record.attributes[database_field_name]
9
9
  unless field_value.nil? || is_primary_key?(database_field_name)
10
- field = DataAnon::Core::Field.new(database_field_name, field_value, index, record)
10
+ field = DataAnon::Core::Field.new(database_field_name, field_value, index, record, @name)
11
11
  record[database_field_name] = strategy.anonymize(field)
12
12
  end
13
13
  end
@@ -0,0 +1,28 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class AnonymizeArray
7
+
8
+ def self.user_defaults user_defaults
9
+ @@user_defaults = user_defaults
10
+ end
11
+
12
+ def initialize strategy
13
+ @strategy = strategy
14
+ end
15
+
16
+ def anonymize field
17
+ field.value.collect do |v|
18
+ strategy = @strategy || @@user_defaults[v.class.to_s.downcase.to_sym]
19
+ strategy.anonymize DataAnon::Core::Field.new(field.name, v, field.row_number, field.ar_record, field.table_name)
20
+ end
21
+ end
22
+
23
+ end
24
+
25
+
26
+ end
27
+ end
28
+ end
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates address using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Address').using FieldStrategy::RandomAddress.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Address').using FieldStrategy::RandomAddress.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Address').using FieldStrategy::RandomAddress.new('my_geo_json.json')
5
17
 
6
18
  class RandomAddress < GeojsonBase
7
19
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates city using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('City').using FieldStrategy::RandomCity.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('City').using FieldStrategy::RandomCity.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('City').using FieldStrategy::RandomCity.new('my_geo_json.json')
5
17
 
6
18
  class RandomCity < GeojsonBase
7
19
 
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Keeping the format same it changes each digit in the string with random digit.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('PhoneNumber').using FieldStrategy::RandomPhoneNumber.new
5
9
 
6
10
  class RandomPhoneNumber < FormattedStringNumber
7
11
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates province using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Province').using FieldStrategy::RandomProvince.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Province').using FieldStrategy::RandomProvince.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Province').using FieldStrategy::RandomProvince.new('my_geo_json.json')
5
17
 
6
18
  class RandomProvince < GeojsonBase
7
19
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates zipcode using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Address').using FieldStrategy::RandomZipcode.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Address').using FieldStrategy::RandomZipcode.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Address').using FieldStrategy::RandomZipcode.new('my_geo_json.json')
5
17
 
6
18
  class RandomZipcode < GeojsonBase
7
19
 
@@ -2,6 +2,21 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes day and month fields within natural range based on true/false input for that field. By default both fields are
6
+ # anonymized
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and leaves day unchanged
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.new(true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.only_day
19
+
5
20
  class AnonymizeDate
6
21
 
7
22
 
@@ -2,6 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes each field(except year and seconds) within the natural range (e.g. hour between 1-24 and day within the month) based on true/false
6
+ # input for that field. By default, all fields are anonymized.
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and hour fields, leaving the day and minute fields untouched
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.new(true,false,true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_day
19
+ # # anonymizes only the hour field
20
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_hour
21
+ # # anonymizes only the minute field
22
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_minute
23
+
5
24
  class AnonymizeDateTime < AnonymizeTime
6
25
 
7
26
  private
@@ -2,6 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes each field(except year and seconds) within the natural range (e.g. hour between 1-24 and day within the month) based on true/false
6
+ # input for that field. By default, all fields are anonymized.
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and hour fields, leaving the day and minute fields untouched
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.new(true,false,true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_day
19
+ # # anonymizes only the hour field
20
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_hour
21
+ # # anonymizes only the minute field
22
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_minute
23
+
5
24
  class AnonymizeTime
6
25
 
7
26
  DEFAULT_ANONYMIZATION = true
@@ -2,6 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts date randomly within given delta range. Default shifts date within 10 days + or -
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::DateDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 25 days
12
+ # anonymize('DateOfBirth').using FieldStrategy::DateDelta.new(25)
13
+ #
14
+
5
15
  class DateDelta
6
16
 
7
17
  DEFAULT_DAY_DELTA = 10
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts data randomly within given range. Default shifts date within 10 days + or - and shifts time within 30 minutes.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::DateTimeDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 20 days and time within 50 minutes
12
+ # anonymize('DateOfBirth').using FieldStrategy::DateTimeDelta.new(20, 50)
13
+
5
14
  class DateTimeDelta
6
15
 
7
16
  DEFAULT_DAY_DELTA = 10
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts data randomly within given range. Default shifts date within 10 days + or - and shifts time within 30 minutes.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::TimeDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 20 days and time within 50 minutes
12
+ # anonymize('DateOfBirth').using FieldStrategy::TimeDelta.new(20, 50)
5
13
 
6
14
  class TimeDelta < DateTimeDelta
7
15
  end
@@ -12,17 +12,20 @@ module DataAnon
12
12
  :datetime => FieldStrategy::DateTimeDelta.new,
13
13
  :time => FieldStrategy::TimeDelta.new,
14
14
  :date => FieldStrategy::DateDelta.new,
15
+ :array => FieldStrategy::AnonymizeArray.new(nil),
15
16
  :trueclass => FieldStrategy::RandomBoolean.new,
17
+ :"bson::objectid" => FieldStrategy::Whitelist.new,
16
18
  :falseclass => FieldStrategy::RandomBoolean.new
17
19
  }
18
20
 
19
21
  def initialize user_defaults = {}
20
22
  @user_defaults = DEFAULT_STRATEGIES.merge user_defaults
23
+ FieldStrategy::AnonymizeArray.user_defaults @user_defaults
21
24
  end
22
25
 
23
26
  def anonymize field
24
27
  strategy = @user_defaults[field.value.class.to_s.downcase.to_sym]
25
- raise "No strategy defined for datatype #{field.value.class}. Use 'default_field_strategies' option in your script. Refer to http://sunitparekh.github.com/data-anonymization/#default-field-strategies for more details. " unless strategy
28
+ raise "No strategy defined for datatype #{field.value.class}. Use 'default_field_strategies' option in your script. Refer to http://sunitparekh.github.com/data-anonymization/#default-field-strategies for more details. #{field.inspect}" unless strategy
26
29
  strategy.anonymize field
27
30
  end
28
31
 
@@ -1,6 +1,14 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Generates a valid unique gmail address by taking advantage of the gmail + strategy. Takes in a valid gmail username and
6
+ # generates emails of the form username+<number>@gmail.com
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Email').using FieldStrategy::GmailTemplate.new('username')
10
+ #
11
+
4
12
  class GmailTemplate
5
13
 
6
14
  def initialize username = 'someusername'
@@ -2,6 +2,13 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates email randomly using the given HOSTNAME and TLD.
6
+ # By default generates hostname randomly along with email id.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Email').using FieldStrategy::RandomEmail.new('thoughtworks','com')
10
+ #
11
+
5
12
  class RandomEmail
6
13
 
7
14
  TLDS = ['com','org','net','edu','gov','mil','biz','info']
@@ -2,6 +2,11 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random email using mailinator hostname. e.g. <randomstring>@mailinator.com
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Email').using FieldStrategy::RandomMailinatorEmail.new
9
+
5
10
  class RandomMailinatorEmail
6
11
 
7
12
  def initialize
@@ -3,12 +3,16 @@ require 'strategy/field/random_boolean'
3
3
 
4
4
  require 'strategy/field/anonymous'
5
5
 
6
+ #array
7
+ require 'strategy/field/anonymize_array'
8
+
6
9
  # string
7
10
  require 'strategy/field/string/lorem_ipsum'
8
11
  require 'strategy/field/string/string_template'
9
12
  require 'strategy/field/string/random_string'
10
13
  require 'strategy/field/string/random_url'
11
14
  require 'strategy/field/string/formatted_string_numbers'
15
+ require 'strategy/field/string/random_formatted_string'
12
16
 
13
17
  require 'strategy/field/string/select_from_file'
14
18
  require 'strategy/field/string/select_from_list'
@@ -2,6 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Randomly picks up first name from the predefined list in the file. Default [file](https://raw.github.com/sunitparekh/data-anonymization/master/resources/first_names.txt) is part of the gem.
6
+ # File should contain first name on each line.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('FirstName').using FieldStrategy::RandomFirstName.new
10
+ #
11
+ # !!!ruby
12
+ # anonymize('FirstName').using FieldStrategy::RandomFirstName.new('my_first_names.txt')
13
+ #
14
+
5
15
  class RandomFirstName < SelectFromFile
6
16
 
7
17
  def initialize file_path = nil
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates full name using the RandomFirstName and RandomLastName strategies.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('FullName').using FieldStrategy::RandomFullName.new
9
+ #
10
+ # !!!ruby
11
+ # anonymize('FullName').using FieldStrategy::RandomFullName.new('my_first_names.txt', 'my_last_names.txt')
12
+
5
13
  class RandomFullName
6
14
 
7
15
  def initialize first_names = nil, last_names = nil
@@ -13,10 +21,10 @@ module DataAnon
13
21
 
14
22
  name_words = field.value.split(' ')
15
23
 
16
- anonymized_first_name = @first_name_anonymizer.anonymize(name_words[0])
24
+ anonymized_first_name = @first_name_anonymizer.anonymize(field)
17
25
  anonymized_last_name = ""
18
26
  for counter in (1..name_words.size-1)
19
- anonymized_last_name = anonymized_last_name + " " + @last_name_anonymizer.anonymize(name_words[counter])
27
+ anonymized_last_name = anonymized_last_name + " " + @last_name_anonymizer.anonymize(field)
20
28
  end
21
29
 
22
30
  return anonymized_first_name + anonymized_last_name
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Randomly picks up last name from the predefined list in the file. Default [file](https://raw.github.com/sunitparekh/data-anonymization/master/resources/last_names.txt) is part of the gem.
6
+ # File should contain last name on each line.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('LastName').using FieldStrategy::RandomLastName.new
10
+ #
11
+ # !!!ruby
12
+ # anonymize('LastName').using FieldStrategy::RandomLastName.new('my_last_names.txt')
13
+
5
14
  class RandomLastName < SelectFromFile
6
15
 
7
16
  def initialize file_path = nil
@@ -2,6 +2,11 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random user name of same length as original user name.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Username').using FieldStrategy::RandomUserName.new
9
+ #
5
10
  class RandomUserName
6
11
 
7
12
  DEFAULT_MIN_LENGTH = 5
@@ -3,6 +3,12 @@ require 'bigdecimal'
3
3
  module DataAnon
4
4
  module Strategy
5
5
  module Field
6
+
7
+ # Shifts the current value randomly within given delta + and -. Default is 100.0
8
+ #
9
+ # !!!ruby
10
+ # anonymize('points').using FieldStrategy::RandomBigDecimalDelta.new(2.5)
11
+
6
12
  class RandomBigDecimalDelta
7
13
 
8
14
  def initialize delta = 100.0
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random float number between given two numbers. Default range is 0.0 to 100.0
6
+ #
7
+ # !!!ruby
8
+ # anonymize('points').using FieldStrategy::RandomFloat.new(3.0,5.0)
5
9
 
6
10
  class RandomFloat
7
11