data-anonymization 0.3.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/.gitignore +2 -1
  2. data/.rvmrc +1 -1
  3. data/.travis.yml +2 -0
  4. data/Gemfile +2 -0
  5. data/README.md +295 -258
  6. data/bin/datanon +57 -0
  7. data/data-anonymization.gemspec +2 -1
  8. data/examples/blacklist_dsl.rb +42 -0
  9. data/examples/mongodb_blacklist_dsl.rb +38 -0
  10. data/examples/mongodb_whitelist_dsl.rb +44 -0
  11. data/examples/whitelist_dsl.rb +63 -0
  12. data/lib/core/database.rb +21 -3
  13. data/lib/core/field.rb +5 -2
  14. data/lib/core/fields_missing_strategy.rb +30 -0
  15. data/lib/core/table_errors.rb +32 -0
  16. data/lib/data-anonymization.rb +11 -0
  17. data/lib/parallel/table.rb +8 -1
  18. data/lib/strategy/base.rb +35 -14
  19. data/lib/strategy/blacklist.rb +1 -1
  20. data/lib/strategy/field/anonymize_array.rb +28 -0
  21. data/lib/strategy/field/contact/random_address.rb +12 -0
  22. data/lib/strategy/field/contact/random_city.rb +12 -0
  23. data/lib/strategy/field/contact/random_phone_number.rb +4 -0
  24. data/lib/strategy/field/contact/random_province.rb +12 -0
  25. data/lib/strategy/field/contact/random_zipcode.rb +12 -0
  26. data/lib/strategy/field/datetime/anonymize_date.rb +15 -0
  27. data/lib/strategy/field/datetime/anonymize_datetime.rb +19 -0
  28. data/lib/strategy/field/datetime/anonymize_time.rb +19 -0
  29. data/lib/strategy/field/datetime/date_delta.rb +10 -0
  30. data/lib/strategy/field/datetime/date_time_delta.rb +9 -0
  31. data/lib/strategy/field/datetime/time_delta.rb +8 -0
  32. data/lib/strategy/field/default_anon.rb +4 -1
  33. data/lib/strategy/field/email/gmail_template.rb +8 -0
  34. data/lib/strategy/field/email/random_email.rb +7 -0
  35. data/lib/strategy/field/email/random_mailinator_email.rb +5 -0
  36. data/lib/strategy/field/fields.rb +4 -0
  37. data/lib/strategy/field/name/random_first_name.rb +10 -0
  38. data/lib/strategy/field/name/random_full_name.rb +10 -2
  39. data/lib/strategy/field/name/random_last_name.rb +9 -0
  40. data/lib/strategy/field/name/random_user_name.rb +5 -0
  41. data/lib/strategy/field/number/random_big_decimal_delta.rb +6 -0
  42. data/lib/strategy/field/number/random_float.rb +4 -0
  43. data/lib/strategy/field/number/random_float_delta.rb +6 -0
  44. data/lib/strategy/field/number/random_integer.rb +4 -0
  45. data/lib/strategy/field/number/random_integer_delta.rb +6 -0
  46. data/lib/strategy/field/string/formatted_string_numbers.rb +10 -6
  47. data/lib/strategy/field/string/lorem_ipsum.rb +9 -0
  48. data/lib/strategy/field/string/random_formatted_string.rb +39 -0
  49. data/lib/strategy/field/string/random_string.rb +6 -0
  50. data/lib/strategy/field/string/random_url.rb +7 -1
  51. data/lib/strategy/field/string/select_from_database.rb +7 -5
  52. data/lib/strategy/field/string/select_from_file.rb +7 -0
  53. data/lib/strategy/field/string/select_from_list.rb +8 -0
  54. data/lib/strategy/field/string/string_template.rb +11 -0
  55. data/lib/strategy/mongodb/anonymize_field.rb +44 -0
  56. data/lib/strategy/mongodb/blacklist.rb +29 -0
  57. data/lib/strategy/mongodb/whitelist.rb +62 -0
  58. data/lib/strategy/strategies.rb +10 -1
  59. data/lib/strategy/whitelist.rb +7 -2
  60. data/lib/thor/helpers/mongodb_dsl_generator.rb +66 -0
  61. data/lib/thor/helpers/rdbms_dsl_generator.rb +36 -0
  62. data/lib/thor/templates/mongodb_whitelist_template.erb +15 -0
  63. data/lib/thor/templates/whitelist_template.erb +21 -0
  64. data/lib/utils/database.rb +4 -0
  65. data/lib/utils/parallel_progress_bar.rb +24 -0
  66. data/lib/utils/progress_bar.rb +34 -22
  67. data/lib/utils/random_string.rb +3 -2
  68. data/lib/utils/random_string_chars_only.rb +3 -5
  69. data/lib/utils/template_helper.rb +44 -0
  70. data/lib/version.rb +1 -1
  71. data/spec/acceptance/mongodb_blacklist_spec.rb +75 -0
  72. data/spec/acceptance/mongodb_whitelist_spec.rb +107 -0
  73. data/spec/core/fields_missing_strategy_spec.rb +26 -0
  74. data/spec/strategy/field/name/random_first_name_spec.rb +1 -1
  75. data/spec/strategy/field/name/random_full_name_spec.rb +12 -7
  76. data/spec/strategy/field/name/random_last_name_spec.rb +1 -1
  77. data/spec/strategy/field/string/random_formatted_string_spec.rb +39 -0
  78. data/spec/strategy/field/string/select_from_file_spec.rb +21 -0
  79. data/spec/strategy/mongodb/anonymize_field_spec.rb +52 -0
  80. data/spec/utils/random_float_spec.rb +12 -0
  81. data/spec/utils/random_string_char_only_spec.rb +12 -0
  82. data/spec/utils/template_helper_spec.rb +14 -0
  83. metadata +56 -6
  84. data/blacklist_dsl.rb +0 -17
  85. data/blacklist_nosql_dsl.rb +0 -36
  86. data/whitelist_dsl.rb +0 -42
@@ -7,7 +7,7 @@ module DataAnon
7
7
  database_field_name = record.attributes.select { |k,v| k.downcase == field }.keys[0]
8
8
  field_value = record.attributes[database_field_name]
9
9
  unless field_value.nil? || is_primary_key?(database_field_name)
10
- field = DataAnon::Core::Field.new(database_field_name, field_value, index, record)
10
+ field = DataAnon::Core::Field.new(database_field_name, field_value, index, record, @name)
11
11
  record[database_field_name] = strategy.anonymize(field)
12
12
  end
13
13
  end
@@ -0,0 +1,28 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class AnonymizeArray
7
+
8
+ def self.user_defaults user_defaults
9
+ @@user_defaults = user_defaults
10
+ end
11
+
12
+ def initialize strategy
13
+ @strategy = strategy
14
+ end
15
+
16
+ def anonymize field
17
+ field.value.collect do |v|
18
+ strategy = @strategy || @@user_defaults[v.class.to_s.downcase.to_sym]
19
+ strategy.anonymize DataAnon::Core::Field.new(field.name, v, field.row_number, field.ar_record, field.table_name)
20
+ end
21
+ end
22
+
23
+ end
24
+
25
+
26
+ end
27
+ end
28
+ end
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates address using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Address').using FieldStrategy::RandomAddress.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Address').using FieldStrategy::RandomAddress.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Address').using FieldStrategy::RandomAddress.new('my_geo_json.json')
5
17
 
6
18
  class RandomAddress < GeojsonBase
7
19
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates city using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('City').using FieldStrategy::RandomCity.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('City').using FieldStrategy::RandomCity.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('City').using FieldStrategy::RandomCity.new('my_geo_json.json')
5
17
 
6
18
  class RandomCity < GeojsonBase
7
19
 
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Keeps the format the same and replaces each digit in the string with a random digit.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('PhoneNumber').using FieldStrategy::RandomPhoneNumber.new
5
9
 
6
10
  class RandomPhoneNumber < FormattedStringNumber
7
11
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates province using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Province').using FieldStrategy::RandomProvince.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Province').using FieldStrategy::RandomProvince.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Province').using FieldStrategy::RandomProvince.new('my_geo_json.json')
5
17
 
6
18
  class RandomProvince < GeojsonBase
7
19
 
@@ -2,6 +2,18 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Similar to RandomAddress, generates zipcode using the [geojson](http://www.geojson.org/geojson-spec.html) format file. The default US/UK file chooses randomly from 300 addresses.
6
+ # The large data set can be downloaded from [here](http://www.infochimps.com/datasets/simplegeo-places-dump)
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Address').using FieldStrategy::RandomZipcode.region_US
10
+ #
11
+ # !!!ruby
12
+ # anonymize('Address').using FieldStrategy::RandomZipcode.region_UK
13
+ #
14
+ # !!!ruby
15
+ # # get your own geo_json file and use it
16
+ # anonymize('Address').using FieldStrategy::RandomZipcode.new('my_geo_json.json')
5
17
 
6
18
  class RandomZipcode < GeojsonBase
7
19
 
@@ -2,6 +2,21 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes day and month fields within natural range based on true/false input for that field. By default both fields are
6
+ # anonymized
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and leaves day unchanged
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.new(true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDate.only_day
19
+
5
20
  class AnonymizeDate
6
21
 
7
22
 
@@ -2,6 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes each field(except year and seconds) within the natural range (e.g. hour between 1-24 and day within the month) based on true/false
6
+ # input for that field. By default, all fields are anonymized.
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and hour fields, leaving the day and minute fields untouched
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.new(true,false,true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_day
19
+ # # anonymizes only the hour field
20
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_hour
21
+ # # anonymizes only the minute field
22
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeDateTime.only_minute
23
+
5
24
  class AnonymizeDateTime < AnonymizeTime
6
25
 
7
26
  private
@@ -2,6 +2,25 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Anonymizes each field(except year and seconds) within the natural range (e.g. hour between 1-24 and day within the month) based on true/false
6
+ # input for that field. By default, all fields are anonymized.
7
+ #
8
+ # !!!ruby
9
+ # # anonymizes month and hour fields, leaving the day and minute fields untouched
10
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.new(true,false,true,false)
11
+ #
12
+ # In addition to customizing which fields you want anonymized, there are some helper methods which allow for quick anonymization
13
+ #
14
+ # !!!ruby
15
+ # # anonymizes only the month field
16
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_month
17
+ # # anonymizes only the day field
18
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_day
19
+ # # anonymizes only the hour field
20
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_hour
21
+ # # anonymizes only the minute field
22
+ # anonymize('DateOfBirth').using FieldStrategy::AnonymizeTime.only_minute
23
+
5
24
  class AnonymizeTime
6
25
 
7
26
  DEFAULT_ANONYMIZATION = true
@@ -2,6 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts date randomly within given delta range. Default shifts date within 10 days + or -
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::DateDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 25 days
12
+ # anonymize('DateOfBirth').using FieldStrategy::DateDelta.new(25)
13
+ #
14
+
5
15
  class DateDelta
6
16
 
7
17
  DEFAULT_DAY_DELTA = 10
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts data randomly within given range. Default shifts date within 10 days + or - and shifts time within 30 minutes.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::DateTimeDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 20 days and time within 50 minutes
12
+ # anonymize('DateOfBirth').using FieldStrategy::DateTimeDelta.new(20, 50)
13
+
5
14
  class DateTimeDelta
6
15
 
7
16
  DEFAULT_DAY_DELTA = 10
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Shifts data randomly within given range. Default shifts date within 10 days + or - and shifts time within 30 minutes.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('DateOfBirth').using FieldStrategy::TimeDelta.new
9
+ #
10
+ # !!!ruby
11
+ # # shifts date within 20 days and time within 50 minutes
12
+ # anonymize('DateOfBirth').using FieldStrategy::TimeDelta.new(20, 50)
5
13
 
6
14
  class TimeDelta < DateTimeDelta
7
15
  end
@@ -12,17 +12,20 @@ module DataAnon
12
12
  :datetime => FieldStrategy::DateTimeDelta.new,
13
13
  :time => FieldStrategy::TimeDelta.new,
14
14
  :date => FieldStrategy::DateDelta.new,
15
+ :array => FieldStrategy::AnonymizeArray.new(nil),
15
16
  :trueclass => FieldStrategy::RandomBoolean.new,
17
+ :"bson::objectid" => FieldStrategy::Whitelist.new,
16
18
  :falseclass => FieldStrategy::RandomBoolean.new
17
19
  }
18
20
 
19
21
  def initialize user_defaults = {}
20
22
  @user_defaults = DEFAULT_STRATEGIES.merge user_defaults
23
+ FieldStrategy::AnonymizeArray.user_defaults @user_defaults
21
24
  end
22
25
 
23
26
  def anonymize field
24
27
  strategy = @user_defaults[field.value.class.to_s.downcase.to_sym]
25
- raise "No strategy defined for datatype #{field.value.class}. Use 'default_field_strategies' option in your script. Refer to http://sunitparekh.github.com/data-anonymization/#default-field-strategies for more details. " unless strategy
28
+ raise "No strategy defined for datatype #{field.value.class}. Use 'default_field_strategies' option in your script. Refer to http://sunitparekh.github.com/data-anonymization/#default-field-strategies for more details. #{field.inspect}" unless strategy
26
29
  strategy.anonymize field
27
30
  end
28
31
 
@@ -1,6 +1,14 @@
1
1
  module DataAnon
2
2
  module Strategy
3
3
  module Field
4
+
5
+ # Generates a valid unique gmail address by taking advantage of the gmail + strategy. Takes in a valid gmail username and
6
+ # generates emails of the form username+<number>@gmail.com
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Email').using FieldStrategy::GmailTemplate.new('username')
10
+ #
11
+
4
12
  class GmailTemplate
5
13
 
6
14
  def initialize username = 'someusername'
@@ -2,6 +2,13 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates email randomly using the given HOSTNAME and TLD.
6
+ # By default generates hostname randomly along with email id.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('Email').using FieldStrategy::RandomEmail.new('thoughtworks','com')
10
+ #
11
+
5
12
  class RandomEmail
6
13
 
7
14
  TLDS = ['com','org','net','edu','gov','mil','biz','info']
@@ -2,6 +2,11 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random email using mailinator hostname. e.g. <randomstring>@mailinator.com
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Email').using FieldStrategy::RandomMailinatorEmail.new
9
+
5
10
  class RandomMailinatorEmail
6
11
 
7
12
  def initialize
@@ -3,12 +3,16 @@ require 'strategy/field/random_boolean'
3
3
 
4
4
  require 'strategy/field/anonymous'
5
5
 
6
+ #array
7
+ require 'strategy/field/anonymize_array'
8
+
6
9
  # string
7
10
  require 'strategy/field/string/lorem_ipsum'
8
11
  require 'strategy/field/string/string_template'
9
12
  require 'strategy/field/string/random_string'
10
13
  require 'strategy/field/string/random_url'
11
14
  require 'strategy/field/string/formatted_string_numbers'
15
+ require 'strategy/field/string/random_formatted_string'
12
16
 
13
17
  require 'strategy/field/string/select_from_file'
14
18
  require 'strategy/field/string/select_from_list'
@@ -2,6 +2,16 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Randomly picks up first name from the predefined list in the file. Default [file](https://raw.github.com/sunitparekh/data-anonymization/master/resources/first_names.txt) is part of the gem.
6
+ # File should contain first name on each line.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('FirstName').using FieldStrategy::RandomFirstName.new
10
+ #
11
+ # !!!ruby
12
+ # anonymize('FirstName').using FieldStrategy::RandomFirstName.new('my_first_names.txt')
13
+ #
14
+
5
15
  class RandomFirstName < SelectFromFile
6
16
 
7
17
  def initialize file_path = nil
@@ -2,6 +2,14 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates full name using the RandomFirstName and RandomLastName strategies.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('FullName').using FieldStrategy::RandomFullName.new
9
+ #
10
+ # !!!ruby
11
+ # anonymize('FullName').using FieldStrategy::RandomFullName.new('my_first_names.txt', 'my_last_names.txt')
12
+
5
13
  class RandomFullName
6
14
 
7
15
  def initialize first_names = nil, last_names = nil
@@ -13,10 +21,10 @@ module DataAnon
13
21
 
14
22
  name_words = field.value.split(' ')
15
23
 
16
- anonymized_first_name = @first_name_anonymizer.anonymize(name_words[0])
24
+ anonymized_first_name = @first_name_anonymizer.anonymize(field)
17
25
  anonymized_last_name = ""
18
26
  for counter in (1..name_words.size-1)
19
- anonymized_last_name = anonymized_last_name + " " + @last_name_anonymizer.anonymize(name_words[counter])
27
+ anonymized_last_name = anonymized_last_name + " " + @last_name_anonymizer.anonymize(field)
20
28
  end
21
29
 
22
30
  return anonymized_first_name + anonymized_last_name
@@ -2,6 +2,15 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Randomly picks up last name from the predefined list in the file. Default [file](https://raw.github.com/sunitparekh/data-anonymization/master/resources/last_names.txt) is part of the gem.
6
+ # File should contain last name on each line.
7
+ #
8
+ # !!!ruby
9
+ # anonymize('LastName').using FieldStrategy::RandomLastName.new
10
+ #
11
+ # !!!ruby
12
+ # anonymize('LastName').using FieldStrategy::RandomLastName.new('my_last_names.txt')
13
+
5
14
  class RandomLastName < SelectFromFile
6
15
 
7
16
  def initialize file_path = nil
@@ -2,6 +2,11 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random user name of same length as original user name.
6
+ #
7
+ # !!!ruby
8
+ # anonymize('Username').using FieldStrategy::RandomUserName.new
9
+ #
5
10
  class RandomUserName
6
11
 
7
12
  DEFAULT_MIN_LENGTH = 5
@@ -3,6 +3,12 @@ require 'bigdecimal'
3
3
  module DataAnon
4
4
  module Strategy
5
5
  module Field
6
+
7
+ # Shifts the current value randomly within given delta + and -. Default is 100.0
8
+ #
9
+ # !!!ruby
10
+ # anonymize('points').using FieldStrategy::RandomBigDecimalDelta.new(2.5)
11
+
6
12
  class RandomBigDecimalDelta
7
13
 
8
14
  def initialize delta = 100.0
@@ -2,6 +2,10 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
+ # Generates random float number between given two numbers. Default range is 0.0 to 100.0
6
+ #
7
+ # !!!ruby
8
+ # anonymize('points').using FieldStrategy::RandomFloat.new(3.0,5.0)
5
9
 
6
10
  class RandomFloat
7
11