data-anonymization 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. data/.documentup.json +1 -0
  2. data/.travis.yml +0 -1
  3. data/README.md +277 -52
  4. data/blacklist_dsl.rb +1 -3
  5. data/data-anonymization.gemspec +4 -0
  6. data/lib/core/dsl.rb +1 -1
  7. data/lib/data-anonymization.rb +3 -0
  8. data/lib/strategy/base.rb +21 -11
  9. data/lib/strategy/blacklist.rb +2 -1
  10. data/lib/strategy/field/contact/geojson_base.rb +24 -0
  11. data/lib/strategy/field/contact/random_address.rb +17 -0
  12. data/lib/strategy/field/contact/random_city.rb +17 -0
  13. data/lib/strategy/field/contact/random_phone_number.rb +13 -0
  14. data/lib/strategy/field/contact/random_province.rb +17 -0
  15. data/lib/strategy/field/contact/random_zipcode.rb +17 -0
  16. data/lib/strategy/field/datetime/anonymize_date.rb +39 -0
  17. data/lib/strategy/field/datetime/anonymize_datetime.rb +15 -0
  18. data/lib/strategy/field/datetime/anonymize_time.rb +58 -0
  19. data/lib/strategy/field/datetime/date_delta.rb +21 -0
  20. data/lib/strategy/field/{date_time_delta.rb → datetime/date_time_delta.rb} +3 -3
  21. data/lib/strategy/field/datetime/time_delta.rb +12 -0
  22. data/lib/strategy/field/default_anon.rb +12 -7
  23. data/lib/strategy/field/email/gmail_template.rb +16 -0
  24. data/lib/strategy/field/{random_email.rb → email/random_email.rb} +0 -0
  25. data/lib/strategy/field/{random_mailinator_email.rb → email/random_mailinator_email.rb} +0 -2
  26. data/lib/strategy/field/fields.rb +51 -20
  27. data/lib/strategy/field/name/random_first_name.rb +14 -0
  28. data/lib/strategy/field/{random_full_name.rb → name/random_full_name.rb} +0 -0
  29. data/lib/strategy/field/name/random_last_name.rb +14 -0
  30. data/lib/strategy/field/{random_user_name.rb → name/random_user_name.rb} +0 -0
  31. data/lib/strategy/field/number/random_float.rb +23 -0
  32. data/lib/strategy/field/{random_float_delta.rb → number/random_float_delta.rb} +2 -4
  33. data/lib/strategy/field/{random_int.rb → number/random_integer.rb} +1 -1
  34. data/lib/strategy/field/{random_integer_delta.rb → number/random_integer_delta.rb} +2 -5
  35. data/lib/strategy/field/{random_phone_number.rb → string/formatted_string_numbers.rb} +4 -1
  36. data/lib/strategy/field/{lorem_ipsum.rb → string/lorem_ipsum.rb} +0 -0
  37. data/lib/strategy/field/{random_string.rb → string/random_string.rb} +0 -0
  38. data/lib/strategy/field/{distinct_column_values.rb → string/select_from_database.rb} +2 -3
  39. data/lib/strategy/field/string/select_from_file.rb +18 -0
  40. data/lib/strategy/field/string/select_from_list.rb +17 -0
  41. data/lib/strategy/field/{string_template.rb → string/string_template.rb} +0 -0
  42. data/lib/strategy/whitelist.rb +4 -2
  43. data/lib/utils/database.rb +8 -6
  44. data/lib/utils/geojson_parser.rb +42 -0
  45. data/lib/utils/logging.rb +0 -9
  46. data/lib/utils/progress_bar.rb +29 -0
  47. data/lib/utils/random_float.rb +12 -0
  48. data/lib/utils/random_int.rb +3 -7
  49. data/lib/utils/resource.rb +4 -0
  50. data/lib/version.rb +1 -1
  51. data/resources/UK_addresses.geojson +300 -0
  52. data/resources/US_addresses.geojson +300 -0
  53. data/spec/acceptance/rdbms_blacklist_spec.rb +2 -2
  54. data/spec/acceptance/rdbms_whitelist_spec.rb +6 -8
  55. data/spec/resource/sample.geojson +1 -0
  56. data/spec/spec_helper.rb +3 -2
  57. data/spec/strategy/field/contact/random_address_spec.rb +12 -0
  58. data/spec/strategy/field/contact/random_city_spec.rb +14 -0
  59. data/spec/strategy/field/contact/random_phone_number_spec.rb +16 -0
  60. data/spec/strategy/field/contact/random_province_spec.rb +14 -0
  61. data/spec/strategy/field/contact/random_zipcode_spec.rb +14 -0
  62. data/spec/strategy/field/datetime/anonymize_date_spec.rb +27 -0
  63. data/spec/strategy/field/datetime/anonymize_datetime_spec.rb +57 -0
  64. data/spec/strategy/field/datetime/anonymize_time_spec.rb +57 -0
  65. data/spec/strategy/field/datetime/date_delta_spec.rb +36 -0
  66. data/spec/strategy/field/{date_time_delta_spec.rb → datetime/date_time_delta_spec.rb} +3 -2
  67. data/spec/strategy/field/datetime/time_delta_spec.rb +44 -0
  68. data/spec/strategy/field/default_anon_spec.rb +42 -0
  69. data/spec/strategy/field/email/gmail_template_spec.rb +17 -0
  70. data/spec/strategy/field/{random_email_spec.rb → email/random_email_spec.rb} +2 -2
  71. data/spec/strategy/field/email/random_mailinator_email_spec.rb +14 -0
  72. data/spec/strategy/field/{random_first_name_spec.rb → name/random_first_name_spec.rb} +2 -2
  73. data/spec/strategy/field/{random_full_name_spec.rb → name/random_full_name_spec.rb} +2 -2
  74. data/spec/strategy/field/{random_last_name_spec.rb → name/random_last_name_spec.rb} +2 -2
  75. data/spec/strategy/field/{random_user_name_spec.rb → name/random_user_name_spec.rb} +2 -2
  76. data/spec/strategy/field/{random_float_delta_spec.rb → number/random_float_delta_spec.rb} +2 -2
  77. data/spec/strategy/field/number/random_float_spec.rb +28 -0
  78. data/spec/strategy/field/{random_integer_delta_spec.rb → number/random_integer_delta_spec.rb} +3 -5
  79. data/spec/strategy/field/{random_int_spec.rb → number/random_integer_spec.rb} +4 -4
  80. data/spec/strategy/field/random_boolean_spec.rb +2 -2
  81. data/spec/strategy/field/string/formatted_string_numbers_spec.rb +15 -0
  82. data/spec/strategy/field/{lorem_ipsum_spec.rb → string/lorem_ipsum_spec.rb} +2 -2
  83. data/spec/strategy/field/{random_string_spec.rb → string/random_string_spec.rb} +2 -2
  84. data/spec/strategy/field/{distinct_column_values_spec.rb → string/select_from_database_spec.rb} +3 -3
  85. data/spec/strategy/field/{random_selection_spec.rb → string/select_from_list_spec.rb} +5 -5
  86. data/spec/strategy/field/{string_template_spec.rb → string/string_template_spec.rb} +2 -2
  87. data/spec/strategy/field/whitelist_spec.rb +2 -2
  88. data/spec/support/customer_sample.rb +1 -1
  89. data/spec/utils/database_spec.rb +2 -2
  90. data/spec/utils/geojson_parser_spec.rb +38 -0
  91. data/whitelist_dsl.rb +4 -6
  92. metadata +163 -59
  93. data/lib/strategy/field/anonymize_time.rb +0 -57
  94. data/lib/strategy/field/gmail_template.rb +0 -17
  95. data/lib/strategy/field/random_first_name.rb +0 -18
  96. data/lib/strategy/field/random_last_name.rb +0 -19
  97. data/lib/strategy/field/random_selection.rb +0 -23
  98. data/lib/strategy/field/user_name_template.rb +0 -22
  99. data/spec/strategy/field/anonymize_time_spec.rb +0 -23
  100. data/spec/strategy/field/gmail_template_spec.rb +0 -14
  101. data/spec/strategy/field/random_mailinator_email_spec.rb +0 -21
  102. data/spec/strategy/field/random_phone_number_spec.rb +0 -35
  103. data/spec/strategy/field/user_name_template_spec.rb +0 -13
data/blacklist_dsl.rb CHANGED
@@ -2,8 +2,6 @@ system "bundle exec ruby whitelist_dsl.rb"
2
2
 
3
3
  require 'data-anonymization'
4
4
 
5
- FS = DataAnon::Strategy::Field
6
-
7
5
  DataAnon::Utils::Logging.logger.level = Logger::INFO
8
6
 
9
7
  database 'Chinook' do
@@ -12,7 +10,7 @@ database 'Chinook' do
12
10
 
13
11
  table 'MediaType' do
14
12
  primary_key 'MediaTypeId'
15
- anonymize('Name').using FS::StringTemplate.new('Media Type 100#{row_number}')
13
+ anonymize('Name').using FieldStrategy::StringTemplate.new('Media Type 100#{row_number}')
16
14
  end
17
15
 
18
16
  end
@@ -18,5 +18,9 @@ Gem::Specification.new do |gem|
18
18
  gem.require_paths = ["lib"]
19
19
 
20
20
  gem.add_dependency('activerecord', '~> 3.2.8')
21
+ gem.add_dependency('composite_primary_keys', '~> 5.0.8')
21
22
  gem.add_dependency('activesupport', '~> 3.2.8')
23
+ gem.add_dependency('rgeo', '~> 0.3.15')
24
+ gem.add_dependency('rgeo-geojson', '~> 0.2.3')
25
+ gem.add_dependency('powerbar', '~> 1.0.8')
22
26
  end
data/lib/core/dsl.rb CHANGED
@@ -4,7 +4,7 @@ module DataAnon
4
4
  include Utils::Logging
5
5
 
6
6
  def database(name, &block)
7
- logger.debug "#{name} : Database"
7
+ logger.debug "Processing Database: #{name}"
8
8
  DataAnon::Core::Database.new(name).instance_eval &block
9
9
  end
10
10
 
@@ -2,7 +2,10 @@ require "version"
2
2
 
3
3
  require "utils/logging"
4
4
  require "utils/random_int"
5
+ require "utils/random_float"
5
6
  require "utils/random_string"
7
+ require "utils/geojson_parser"
8
+ require "utils/progress_bar"
6
9
  require "utils/resource"
7
10
  require "core/database"
8
11
  require "core/field"
data/lib/strategy/base.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'powerbar'
2
+
1
3
  module DataAnon
2
4
  module Strategy
3
5
  class Base
@@ -14,10 +16,15 @@ module DataAnon
14
16
  self
15
17
  end
16
18
 
17
- def primary_key field
18
- @primary_key = field
19
+ def primary_key *fields
20
+ @primary_keys = fields
21
+ end
22
+
23
+ def is_primary_key? field
24
+ @primary_keys.select { |key| field.downcase == key.downcase }.length > 0
19
25
  end
20
26
 
27
+
21
28
  def whitelist *fields
22
29
  fields.each { |f| @fields[f.downcase] = DataAnon::Strategy::Field::Whitelist.new }
23
30
  end
@@ -43,23 +50,26 @@ module DataAnon
43
50
  end
44
51
 
45
52
  def dest_table
46
- @dest_table ||= Utils::DestinationTable.create @name, @primary_key
53
+ @dest_table ||= Utils::DestinationTable.create @name, @primary_keys
47
54
  end
48
55
 
49
56
  def source_table
50
- @source_table ||= Utils::SourceTable.create @name, @primary_key
57
+ @source_table ||= Utils::SourceTable.create @name, @primary_keys
51
58
  end
52
59
 
53
60
  def process
54
61
  logger.debug "Processing table #{@name} with fields strategies #{@fields}"
55
- progress_logger.info "Table: #{@name} (#{source_table.count} records) "
56
- index = 1
57
- source_table.find_each(:batch_size => 100) do |record|
58
- progress_logger.info "."
59
- process_record index, record
60
- index += 1
62
+ total = source_table.count
63
+ if total > 0
64
+ index = 1
65
+ progress_bar = DataAnon::Utils::ProgressBar.new @name, total
66
+ source_table.all.each do |record|
67
+ process_record index, record
68
+ index += 1
69
+ progress_bar.show(index)
70
+ end
71
+ progress_bar.close
61
72
  end
62
- progress_logger.info " DONE\n"
63
73
  end
64
74
 
65
75
  end
@@ -6,13 +6,14 @@ module DataAnon
6
6
  @fields.each do |field, strategy|
7
7
  database_field_name = record.attributes.select { |k,v| k.downcase == field }.keys[0]
8
8
  field_value = record.attributes[database_field_name]
9
- unless field_value.nil? || database_field_name.downcase == @primary_key.downcase
9
+ unless field_value.nil? || is_primary_key?(database_field_name)
10
10
  field = DataAnon::Core::Field.new(database_field_name, field_value, index, record)
11
11
  record[database_field_name] = strategy.anonymize(field)
12
12
  end
13
13
  end
14
14
  record.save!
15
15
  end
16
+
16
17
  end
17
18
  end
18
19
  end
@@ -0,0 +1,24 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+ class GeojsonBase
5
+
6
+ def self.region_US
7
+ self.new DataAnon::Utils::Resource.file('US_addresses.geojson')
8
+ end
9
+
10
+ def self.region_UK
11
+ self.new DataAnon::Utils::Resource.file('UK_addresses.geojson')
12
+ end
13
+
14
+ def initialize file_path
15
+ raise "Load and set the @values member variable in constructor"
16
+ end
17
+
18
+ def anonymize field
19
+ @values.sample
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomAddress < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.address(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomCity < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.city(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,13 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomPhoneNumber < FormattedStringNumber
7
+
8
+ end
9
+
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomProvince < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.province(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomZipcode < GeojsonBase
7
+
8
+ def initialize file_path
9
+ @values = DataAnon::Utils::GeojsonParser.zipcode(file_path)
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,39 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeDate
6
+
7
+
8
+ def self.only_month
9
+ self.new true, false
10
+ end
11
+
12
+ def self.only_day
13
+ self.new false, true
14
+ end
15
+
16
+ def initialize anonymize_month, anonymize_day
17
+
18
+ @anonymize_month = anonymize_month
19
+ @anonymize_day = anonymize_day
20
+
21
+ end
22
+
23
+ def anonymize field
24
+
25
+ original_time = field.value
26
+
27
+ year = original_time.year
28
+ month = @anonymize_month? DataAnon::Utils::RandomInt.generate(1,12) : original_time.month
29
+ days_in_month = Time.new(year,month,1,1,1,1).end_of_month.day
30
+ day = @anonymize_day? DataAnon::Utils::RandomInt.generate(1,days_in_month) : original_time.day
31
+
32
+ Date.new(year, month, day)
33
+ end
34
+
35
+
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,15 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeDateTime < AnonymizeTime
6
+
7
+ private
8
+ def create_object(year, month, day, hour, min, sec)
9
+ DateTime.new(year, month, day, hour, min, sec)
10
+ end
11
+
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,58 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class AnonymizeTime
6
+
7
+ DEFAULT_ANONYMIZATION = true
8
+
9
+ def self.only_month
10
+ self.new true, false, false, false
11
+ end
12
+
13
+ def self.only_day
14
+ self.new false, true, false, false
15
+ end
16
+
17
+ def self.only_hour
18
+ self.new false, false, true, false
19
+ end
20
+
21
+ def self.only_minute
22
+ self.new false, false, false, true
23
+ end
24
+
25
+ def initialize anonymize_month, anonymize_day, anonymize_hour, anonymize_min
26
+
27
+ @anonymize_month = anonymize_month
28
+ @anonymize_day = anonymize_day
29
+ @anonymize_hour = anonymize_hour
30
+ @anonymize_min = anonymize_min
31
+
32
+ end
33
+
34
+ def anonymize field
35
+
36
+ original_time = field.value
37
+
38
+ year = original_time.year
39
+ month = @anonymize_month? DataAnon::Utils::RandomInt.generate(1,12) : original_time.month
40
+ days_in_month = Time.new(year,month,1,1,1,1).end_of_month.day
41
+ day = @anonymize_day? DataAnon::Utils::RandomInt.generate(1,days_in_month) : original_time.day
42
+ hour = @anonymize_hour? DataAnon::Utils::RandomInt.generate(1,24) : original_time.hour
43
+ min = @anonymize_min? DataAnon::Utils::RandomInt.generate(1,60) : original_time.min
44
+ sec = original_time.sec
45
+
46
+ create_object(year, month, day, hour, min, sec)
47
+ end
48
+
49
+ private
50
+
51
+ def create_object(year, month, day, hour, min, sec)
52
+ Time.new(year, month, day, hour, min, sec)
53
+ end
54
+
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,21 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class DateDelta
6
+
7
+ DEFAULT_DAY_DELTA = 10
8
+
9
+ def initialize day_delta = DEFAULT_DAY_DELTA
10
+ @day_delta = day_delta
11
+ end
12
+
13
+ def anonymize field
14
+ day_adjustment = DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta)
15
+ return field.value + day_adjustment.days
16
+ end
17
+
18
+ end
19
+ end
20
+ end
21
+ end
@@ -13,9 +13,9 @@ module DataAnon
13
13
  end
14
14
 
15
15
  def anonymize field
16
- day_adjustment = @day_delta==0? 0 : (DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta))
17
- minute_adjustment = @minute_delta==0? 0 : (DataAnon::Utils::RandomInt.generate(-@minute_delta,@minute_delta))
18
- return field.value + day_adjustment.days + minute_adjustment.minutes
16
+ day_adjustment = DataAnon::Utils::RandomInt.generate(-@day_delta,@day_delta)
17
+ minute_adjustment = DataAnon::Utils::RandomInt.generate(-@minute_delta,@minute_delta)
18
+ return field.value + (day_adjustment.days + minute_adjustment.minutes)
19
19
  end
20
20
 
21
21
  end
@@ -0,0 +1,12 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class TimeDelta < DateTimeDelta
7
+ end
8
+
9
+
10
+ end
11
+ end
12
+ end
@@ -4,19 +4,24 @@ module DataAnon
4
4
 
5
5
  class DefaultAnon
6
6
 
7
- FS = DataAnon::Strategy::Field
8
- DEFAULT_STRATEGIES = {:string => FS::LoremIpsum.new,
9
- :integer => FS::RandomInt.new(18,70),
10
- :datetime => FS::DateTimeDelta.new,
11
- :boolean => FS::RandomBoolean.new
7
+ DEFAULT_STRATEGIES = {:string => FieldStrategy::LoremIpsum.new,
8
+ :fixnum => FieldStrategy::RandomIntegerDelta.new(5),
9
+ :bignum => FieldStrategy::RandomIntegerDelta.new(5000),
10
+ :float => FieldStrategy::RandomFloatDelta.new(5.0),
11
+ :datetime => FieldStrategy::DateTimeDelta.new,
12
+ :time => FieldStrategy::TimeDelta.new,
13
+ :date => FieldStrategy::DateDelta.new,
14
+ :trueclass => FieldStrategy::RandomBoolean.new,
15
+ :falseclass => FieldStrategy::RandomBoolean.new
12
16
  }
13
17
 
14
- def initialize user_defaults
18
+ def initialize user_defaults = {}
15
19
  @user_defaults = DEFAULT_STRATEGIES.merge user_defaults
16
20
  end
17
21
 
18
22
  def anonymize field
19
- strategy = @user_defaults[field.value.class.to_s.downcase.to_sym] || FS::Whitelist.new
23
+ strategy = @user_defaults[field.value.class.to_s.downcase.to_sym]
24
+ raise "No strategy defined for datatype #{field.value.class}" unless strategy
20
25
  strategy.anonymize field
21
26
  end
22
27