data-anonymization 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. data/.documentup.json +1 -0
  2. data/.travis.yml +0 -1
  3. data/README.md +277 -52
  4. data/blacklist_dsl.rb +1 -3
  5. data/data-anonymization.gemspec +4 -0
  6. data/lib/core/dsl.rb +1 -1
  7. data/lib/data-anonymization.rb +3 -0
  8. data/lib/strategy/base.rb +21 -11
  9. data/lib/strategy/blacklist.rb +2 -1
  10. data/lib/strategy/field/contact/geojson_base.rb +24 -0
  11. data/lib/strategy/field/contact/random_address.rb +17 -0
  12. data/lib/strategy/field/contact/random_city.rb +17 -0
  13. data/lib/strategy/field/contact/random_phone_number.rb +13 -0
  14. data/lib/strategy/field/contact/random_province.rb +17 -0
  15. data/lib/strategy/field/contact/random_zipcode.rb +17 -0
  16. data/lib/strategy/field/datetime/anonymize_date.rb +39 -0
  17. data/lib/strategy/field/datetime/anonymize_datetime.rb +15 -0
  18. data/lib/strategy/field/datetime/anonymize_time.rb +58 -0
  19. data/lib/strategy/field/datetime/date_delta.rb +21 -0
  20. data/lib/strategy/field/{date_time_delta.rb → datetime/date_time_delta.rb} +3 -3
  21. data/lib/strategy/field/datetime/time_delta.rb +12 -0
  22. data/lib/strategy/field/default_anon.rb +12 -7
  23. data/lib/strategy/field/email/gmail_template.rb +16 -0
  24. data/lib/strategy/field/{random_email.rb → email/random_email.rb} +0 -0
  25. data/lib/strategy/field/{random_mailinator_email.rb → email/random_mailinator_email.rb} +0 -2
  26. data/lib/strategy/field/fields.rb +51 -20
  27. data/lib/strategy/field/name/random_first_name.rb +14 -0
  28. data/lib/strategy/field/{random_full_name.rb → name/random_full_name.rb} +0 -0
  29. data/lib/strategy/field/name/random_last_name.rb +14 -0
  30. data/lib/strategy/field/{random_user_name.rb → name/random_user_name.rb} +0 -0
  31. data/lib/strategy/field/number/random_float.rb +23 -0
  32. data/lib/strategy/field/{random_float_delta.rb → number/random_float_delta.rb} +2 -4
  33. data/lib/strategy/field/{random_int.rb → number/random_integer.rb} +1 -1
  34. data/lib/strategy/field/{random_integer_delta.rb → number/random_integer_delta.rb} +2 -5
  35. data/lib/strategy/field/{random_phone_number.rb → string/formatted_string_numbers.rb} +4 -1
  36. data/lib/strategy/field/{lorem_ipsum.rb → string/lorem_ipsum.rb} +0 -0
  37. data/lib/strategy/field/{random_string.rb → string/random_string.rb} +0 -0
  38. data/lib/strategy/field/{distinct_column_values.rb → string/select_from_database.rb} +2 -3
  39. data/lib/strategy/field/string/select_from_file.rb +18 -0
  40. data/lib/strategy/field/string/select_from_list.rb +17 -0
  41. data/lib/strategy/field/{string_template.rb → string/string_template.rb} +0 -0
  42. data/lib/strategy/whitelist.rb +4 -2
  43. data/lib/utils/database.rb +8 -6
  44. data/lib/utils/geojson_parser.rb +42 -0
  45. data/lib/utils/logging.rb +0 -9
  46. data/lib/utils/progress_bar.rb +29 -0
  47. data/lib/utils/random_float.rb +12 -0
  48. data/lib/utils/random_int.rb +3 -7
  49. data/lib/utils/resource.rb +4 -0
  50. data/lib/version.rb +1 -1
  51. data/resources/UK_addresses.geojson +300 -0
  52. data/resources/US_addresses.geojson +300 -0
  53. data/spec/acceptance/rdbms_blacklist_spec.rb +2 -2
  54. data/spec/acceptance/rdbms_whitelist_spec.rb +6 -8
  55. data/spec/resource/sample.geojson +1 -0
  56. data/spec/spec_helper.rb +3 -2
  57. data/spec/strategy/field/contact/random_address_spec.rb +12 -0
  58. data/spec/strategy/field/contact/random_city_spec.rb +14 -0
  59. data/spec/strategy/field/contact/random_phone_number_spec.rb +16 -0
  60. data/spec/strategy/field/contact/random_province_spec.rb +14 -0
  61. data/spec/strategy/field/contact/random_zipcode_spec.rb +14 -0
  62. data/spec/strategy/field/datetime/anonymize_date_spec.rb +27 -0
  63. data/spec/strategy/field/datetime/anonymize_datetime_spec.rb +57 -0
  64. data/spec/strategy/field/datetime/anonymize_time_spec.rb +57 -0
  65. data/spec/strategy/field/datetime/date_delta_spec.rb +36 -0
  66. data/spec/strategy/field/{date_time_delta_spec.rb → datetime/date_time_delta_spec.rb} +3 -2
  67. data/spec/strategy/field/datetime/time_delta_spec.rb +44 -0
  68. data/spec/strategy/field/default_anon_spec.rb +42 -0
  69. data/spec/strategy/field/email/gmail_template_spec.rb +17 -0
  70. data/spec/strategy/field/{random_email_spec.rb → email/random_email_spec.rb} +2 -2
  71. data/spec/strategy/field/email/random_mailinator_email_spec.rb +14 -0
  72. data/spec/strategy/field/{random_first_name_spec.rb → name/random_first_name_spec.rb} +2 -2
  73. data/spec/strategy/field/{random_full_name_spec.rb → name/random_full_name_spec.rb} +2 -2
  74. data/spec/strategy/field/{random_last_name_spec.rb → name/random_last_name_spec.rb} +2 -2
  75. data/spec/strategy/field/{random_user_name_spec.rb → name/random_user_name_spec.rb} +2 -2
  76. data/spec/strategy/field/{random_float_delta_spec.rb → number/random_float_delta_spec.rb} +2 -2
  77. data/spec/strategy/field/number/random_float_spec.rb +28 -0
  78. data/spec/strategy/field/{random_integer_delta_spec.rb → number/random_integer_delta_spec.rb} +3 -5
  79. data/spec/strategy/field/{random_int_spec.rb → number/random_integer_spec.rb} +4 -4
  80. data/spec/strategy/field/random_boolean_spec.rb +2 -2
  81. data/spec/strategy/field/string/formatted_string_numbers_spec.rb +15 -0
  82. data/spec/strategy/field/{lorem_ipsum_spec.rb → string/lorem_ipsum_spec.rb} +2 -2
  83. data/spec/strategy/field/{random_string_spec.rb → string/random_string_spec.rb} +2 -2
  84. data/spec/strategy/field/{distinct_column_values_spec.rb → string/select_from_database_spec.rb} +3 -3
  85. data/spec/strategy/field/{random_selection_spec.rb → string/select_from_list_spec.rb} +5 -5
  86. data/spec/strategy/field/{string_template_spec.rb → string/string_template_spec.rb} +2 -2
  87. data/spec/strategy/field/whitelist_spec.rb +2 -2
  88. data/spec/support/customer_sample.rb +1 -1
  89. data/spec/utils/database_spec.rb +2 -2
  90. data/spec/utils/geojson_parser_spec.rb +38 -0
  91. data/whitelist_dsl.rb +4 -6
  92. metadata +163 -59
  93. data/lib/strategy/field/anonymize_time.rb +0 -57
  94. data/lib/strategy/field/gmail_template.rb +0 -17
  95. data/lib/strategy/field/random_first_name.rb +0 -18
  96. data/lib/strategy/field/random_last_name.rb +0 -19
  97. data/lib/strategy/field/random_selection.rb +0 -23
  98. data/lib/strategy/field/user_name_template.rb +0 -22
  99. data/spec/strategy/field/anonymize_time_spec.rb +0 -23
  100. data/spec/strategy/field/gmail_template_spec.rb +0 -14
  101. data/spec/strategy/field/random_mailinator_email_spec.rb +0 -21
  102. data/spec/strategy/field/random_phone_number_spec.rb +0 -35
  103. data/spec/strategy/field/user_name_template_spec.rb +0 -13
data/blacklist_dsl.rb CHANGED
@@ -2,8 +2,6 @@ system "bundle exec ruby whitelist_dsl.rb"
2
2
 
3
3
  require 'data-anonymization'
4
4
 
5
- FS = DataAnon::Strategy::Field
6
-
7
5
  DataAnon::Utils::Logging.logger.level = Logger::INFO
8
6
 
9
7
  database 'Chinook' do
@@ -12,7 +10,7 @@ database 'Chinook' do
12
10
 
13
11
  table 'MediaType' do
14
12
  primary_key 'MediaTypeId'
15
- anonymize('Name').using FS::StringTemplate.new('Media Type 100#{row_number}')
13
+ anonymize('Name').using FieldStrategy::StringTemplate.new('Media Type 100#{row_number}')
16
14
  end
17
15
 
18
16
  end
@@ -18,5 +18,9 @@ Gem::Specification.new do |gem|
18
18
  gem.require_paths = ["lib"]
19
19
 
20
20
  gem.add_dependency('activerecord', '~> 3.2.8')
21
+ gem.add_dependency('composite_primary_keys', '~> 5.0.8')
21
22
  gem.add_dependency('activesupport', '~> 3.2.8')
23
+ gem.add_dependency('rgeo', '~> 0.3.15')
24
+ gem.add_dependency('rgeo-geojson', '~> 0.2.3')
25
+ gem.add_dependency('powerbar', '~> 1.0.8')
22
26
  end
data/lib/core/dsl.rb CHANGED
@@ -4,7 +4,7 @@ module DataAnon
4
4
  include Utils::Logging
5
5
 
6
6
  def database(name, &block)
7
- logger.debug "#{name} : Database"
7
+ logger.debug "Processing Database: #{name}"
8
8
  DataAnon::Core::Database.new(name).instance_eval &block
9
9
  end
10
10
 
@@ -2,7 +2,10 @@ require "version"
2
2
 
3
3
  require "utils/logging"
4
4
  require "utils/random_int"
5
+ require "utils/random_float"
5
6
  require "utils/random_string"
7
+ require "utils/geojson_parser"
8
+ require "utils/progress_bar"
6
9
  require "utils/resource"
7
10
  require "core/database"
8
11
  require "core/field"
data/lib/strategy/base.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'powerbar'
2
+
1
3
  module DataAnon
2
4
  module Strategy
3
5
  class Base
@@ -14,10 +16,15 @@ module DataAnon
14
16
  self
15
17
  end
16
18
 
17
- def primary_key field
18
- @primary_key = field
19
+ def primary_key *fields
20
+ @primary_keys = fields
21
+ end
22
+
23
+ def is_primary_key? field
24
+ @primary_keys.select { |key| field.downcase == key.downcase }.length > 0
19
25
  end
20
26
 
27
+
21
28
  def whitelist *fields
22
29
  fields.each { |f| @fields[f.downcase] = DataAnon::Strategy::Field::Whitelist.new }
23
30
  end
@@ -43,23 +50,26 @@ module DataAnon
43
50
  end
44
51
 
45
52
  def dest_table
46
- @dest_table ||= Utils::DestinationTable.create @name, @primary_key
53
+ @dest_table ||= Utils::DestinationTable.create @name, @primary_keys
47
54
  end
48
55
 
49
56
  def source_table
50
- @source_table ||= Utils::SourceTable.create @name, @primary_key
57
+ @source_table ||= Utils::SourceTable.create @name, @primary_keys
51
58
  end
52
59
 
53
60
  def process
54
61
  logger.debug "Processing table #{@name} with fields strategies #{@fields}"
55
- progress_logger.info "Table: #{@name} (#{source_table.count} records) "
56
- index = 1
57
- source_table.find_each(:batch_size => 100) do |record|
58
- progress_logger.info "."
59
- process_record index, record
60
- index += 1
62
+ total = source_table.count
63
+ if total > 0
64
+ index = 1
65
+ progress_bar = DataAnon::Utils::ProgressBar.new @name, total
66
+ source_table.all.each do |record|
67
+ process_record index, record
68
+ index += 1
69
+ progress_bar.show(index)
70
+ end
71
+ progress_bar.close
61
72
  end
62
- progress_logger.info " DONE\n"
63
73
  end
64
74
 
65
75
  end
@@ -6,13 +6,14 @@ module DataAnon
6
6
  @fields.each do |field, strategy|
7
7
  database_field_name = record.attributes.select { |k,v| k.downcase == field }.keys[0]
8
8
  field_value = record.attributes[database_field_name]
9
- unless field_value.nil? || database_field_name.downcase == @primary_key.downcase
9
+ unless field_value.nil? || is_primary_key?(database_field_name)
10
10
  field = DataAnon::Core::Field.new(database_field_name, field_value, index, record)
11
11
  record[database_field_name] = strategy.anonymize(field)
12
12
  end
13
13
  end
14
14
  record.save!
15
15
  end
16
+
16
17
  end
17
18
  end
18
19
  end
@@ -0,0 +1,24 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+ class GeojsonBase
5
+
6
+ def self.region_US
7
+ self.new DataAnon::Utils::Resource.file('US_addresses.geojson')
8
+ end
9
+
10
+ def self.region_UK
11
+ self.new DataAnon::Utils::Resource.file('UK_addresses.geojson')
12
+ end
13
+
14
+ def initialize file_path
15
+ raise "Load and set the @values member variable in constructor"
16
+ end
17
+
18
+ def anonymize field
19
+ @values.sample
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomAddress < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.address(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomCity < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.city(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,13 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomPhoneNumber < FormattedStringNumber
7
+
8
+ end
9
+
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomProvince < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.province(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomZipcode < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.zipcode(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,39 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeDate
6
+
7
+
8
+ def self.only_month
9
+ self.new true, false
10
+ end
11
+
12
+ def self.only_day
13
+ self.new false, true
14
+ end
15
+
16
+ def initialize anonymize_month, anonymize_day
17
+
18
+ @anonymize_month = anonymize_month
19
+ @anonymize_day = anonymize_day
20
+
21
+ end
22
+
23
+ def anonymize field
24
+
25
+ original_time = field.value
26
+
27
+ year = original_time.year
28
+ month = @anonymize_month? DataAnon::Utils::RandomInt.generate(1,12) : original_time.month
29
+ days_in_month = Time.new(year,month,1,1,1,1).end_of_month.day
30
+ day = @anonymize_day? DataAnon::Utils::RandomInt.generate(1,days_in_month) : original_time.day
31
+
32
+ Date.new(year, month, day)
33
+ end
34
+
35
+
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,15 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeDateTime < AnonymizeTime
6
+
7
+ private
8
+ def create_object(year, month, day, hour, min, sec)
9
+ DateTime.new(year, month, day, hour, min, sec)
10
+ end
11
+
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,58 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeTime
6
+
7
+ DEFAULT_ANONYMIZATION = true
8
+
9
+ def self.only_month
10
+ self.new true, false, false, false
11
+ end
12
+
13
+ def self.only_day
14
+ self.new false, true, false, false
15
+ end
16
+
17
+ def self.only_hour
18
+ self.new false, false, true, false
19
+ end
20
+
21
+ def self.only_minute
22
+ self.new false, false, false, true
23
+ end
24
+
25
+ def initialize anonymize_month, anonymize_day, anonymize_hour, anonymize_min
26
+
27
+ @anonymize_month = anonymize_month
28
+ @anonymize_day = anonymize_day
29
+ @anonymize_hour = anonymize_hour
30
+ @anonymize_min = anonymize_min
31
+
32
+ end
33
+
34
+ def anonymize field
35
+
36
+ original_time = field.value
37
+
38
+ year = original_time.year
39
+ month = @anonymize_month? DataAnon::Utils::RandomInt.generate(1,12) : original_time.month
40
+ days_in_month = Time.new(year,month,1,1,1,1).end_of_month.day
41
+ day = @anonymize_day? DataAnon::Utils::RandomInt.generate(1,days_in_month) : original_time.day
42
+ hour = @anonymize_hour? DataAnon::Utils::RandomInt.generate(1,24) : original_time.hour
43
+ min = @anonymize_min? DataAnon::Utils::RandomInt.generate(1,60) : original_time.min
44
+ sec = original_time.sec
45
+
46
+ create_object(year, month, day, hour, min, sec)
47
+ end
48
+
49
+ private
50
+
51
+ def create_object(year, month, day, hour, min, sec)
52
+ Time.new(year, month, day, hour, min, sec)
53
+ end
54
+
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,21 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class DateDelta
6
+
7
+ DEFAULT_DAY_DELTA = 10
8
+
9
+ def initialize day_delta = DEFAULT_DAY_DELTA
10
+ @day_delta = day_delta
11
+ end
12
+
13
+ def anonymize field
14
+ day_adjustment = DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta)
15
+ return field.value + day_adjustment.days
16
+ end
17
+
18
+ end
19
+ end
20
+ end
21
+ end
@@ -13,9 +13,9 @@ module DataAnon
13
13
  end
14
14
 
15
15
  def anonymize field
16
- day_adjustment = @day_delta==0? 0 : (DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta))
17
- minute_adjustment = @minute_delta==0? 0 : (DataAnon::Utils::RandomInt.generate(-@minute_delta,@minute_delta))
18
- return field.value + day_adjustment.days + minute_adjustment.minutes
16
+ day_adjustment = DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta)
17
+ minute_adjustment = DataAnon::Utils::RandomInt.generate(-@minute_delta,@minute_delta)
18
+ return field.value + (day_adjustment.days + minute_adjustment.minutes)
19
19
  end
20
20
 
21
21
  end
@@ -0,0 +1,12 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class TimeDelta < DateTimeDelta
7
+ end
8
+
9
+
10
+ end
11
+ end
12
+ end
@@ -4,19 +4,24 @@ module DataAnon
4
4
 
5
5
  class DefaultAnon
6
6
 
7
- FS = DataAnon::Strategy::Field
8
- DEFAULT_STRATEGIES = {:string => FS::LoremIpsum.new,
9
- :integer => FS::RandomInt.new(18,70),
10
- :datetime => FS::DateTimeDelta.new,
11
- :boolean => FS::RandomBoolean.new
7
+ DEFAULT_STRATEGIES = {:string => FieldStrategy::LoremIpsum.new,
8
+ :fixnum => FieldStrategy::RandomIntegerDelta.new(5),
9
+ :bignum => FieldStrategy::RandomIntegerDelta.new(5000),
10
+ :float => FieldStrategy::RandomFloatDelta.new(5.0),
11
+ :datetime => FieldStrategy::DateTimeDelta.new,
12
+ :time => FieldStrategy::TimeDelta.new,
13
+ :date => FieldStrategy::DateDelta.new,
14
+ :trueclass => FieldStrategy::RandomBoolean.new,
15
+ :falseclass => FieldStrategy::RandomBoolean.new
12
16
  }
13
17
 
14
- def initialize user_defaults
18
+ def initialize user_defaults = {}
15
19
  @user_defaults = DEFAULT_STRATEGIES.merge user_defaults
16
20
  end
17
21
 
18
22
  def anonymize field
19
- strategy = @user_defaults[field.value.class.to_s.downcase.to_sym] || FS::Whitelist.new
23
+ strategy = @user_defaults[field.value.class.to_s.downcase.to_sym]
24
+ raise "No strategy defined for datatype #{field.value.class}" unless strategy
20
25
  strategy.anonymize field
21
26
  end
22
27