data-anonymization 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. data/.documentup.json +1 -0
  2. data/.travis.yml +0 -1
  3. data/README.md +277 -52
  4. data/blacklist_dsl.rb +1 -3
  5. data/data-anonymization.gemspec +4 -0
  6. data/lib/core/dsl.rb +1 -1
  7. data/lib/data-anonymization.rb +3 -0
  8. data/lib/strategy/base.rb +21 -11
  9. data/lib/strategy/blacklist.rb +2 -1
  10. data/lib/strategy/field/contact/geojson_base.rb +24 -0
  11. data/lib/strategy/field/contact/random_address.rb +17 -0
  12. data/lib/strategy/field/contact/random_city.rb +17 -0
  13. data/lib/strategy/field/contact/random_phone_number.rb +13 -0
  14. data/lib/strategy/field/contact/random_province.rb +17 -0
  15. data/lib/strategy/field/contact/random_zipcode.rb +17 -0
  16. data/lib/strategy/field/datetime/anonymize_date.rb +39 -0
  17. data/lib/strategy/field/datetime/anonymize_datetime.rb +15 -0
  18. data/lib/strategy/field/datetime/anonymize_time.rb +58 -0
  19. data/lib/strategy/field/datetime/date_delta.rb +21 -0
  20. data/lib/strategy/field/{date_time_delta.rb → datetime/date_time_delta.rb} +3 -3
  21. data/lib/strategy/field/datetime/time_delta.rb +12 -0
  22. data/lib/strategy/field/default_anon.rb +12 -7
  23. data/lib/strategy/field/email/gmail_template.rb +16 -0
  24. data/lib/strategy/field/{random_email.rb → email/random_email.rb} +0 -0
  25. data/lib/strategy/field/{random_mailinator_email.rb → email/random_mailinator_email.rb} +0 -2
  26. data/lib/strategy/field/fields.rb +51 -20
  27. data/lib/strategy/field/name/random_first_name.rb +14 -0
  28. data/lib/strategy/field/{random_full_name.rb → name/random_full_name.rb} +0 -0
  29. data/lib/strategy/field/name/random_last_name.rb +14 -0
  30. data/lib/strategy/field/{random_user_name.rb → name/random_user_name.rb} +0 -0
  31. data/lib/strategy/field/number/random_float.rb +23 -0
  32. data/lib/strategy/field/{random_float_delta.rb → number/random_float_delta.rb} +2 -4
  33. data/lib/strategy/field/{random_int.rb → number/random_integer.rb} +1 -1
  34. data/lib/strategy/field/{random_integer_delta.rb → number/random_integer_delta.rb} +2 -5
  35. data/lib/strategy/field/{random_phone_number.rb → string/formatted_string_numbers.rb} +4 -1
  36. data/lib/strategy/field/{lorem_ipsum.rb → string/lorem_ipsum.rb} +0 -0
  37. data/lib/strategy/field/{random_string.rb → string/random_string.rb} +0 -0
  38. data/lib/strategy/field/{distinct_column_values.rb → string/select_from_database.rb} +2 -3
  39. data/lib/strategy/field/string/select_from_file.rb +18 -0
  40. data/lib/strategy/field/string/select_from_list.rb +17 -0
  41. data/lib/strategy/field/{string_template.rb → string/string_template.rb} +0 -0
  42. data/lib/strategy/whitelist.rb +4 -2
  43. data/lib/utils/database.rb +8 -6
  44. data/lib/utils/geojson_parser.rb +42 -0
  45. data/lib/utils/logging.rb +0 -9
  46. data/lib/utils/progress_bar.rb +29 -0
  47. data/lib/utils/random_float.rb +12 -0
  48. data/lib/utils/random_int.rb +3 -7
  49. data/lib/utils/resource.rb +4 -0
  50. data/lib/version.rb +1 -1
  51. data/resources/UK_addresses.geojson +300 -0
  52. data/resources/US_addresses.geojson +300 -0
  53. data/spec/acceptance/rdbms_blacklist_spec.rb +2 -2
  54. data/spec/acceptance/rdbms_whitelist_spec.rb +6 -8
  55. data/spec/resource/sample.geojson +1 -0
  56. data/spec/spec_helper.rb +3 -2
  57. data/spec/strategy/field/contact/random_address_spec.rb +12 -0
  58. data/spec/strategy/field/contact/random_city_spec.rb +14 -0
  59. data/spec/strategy/field/contact/random_phone_number_spec.rb +16 -0
  60. data/spec/strategy/field/contact/random_province_spec.rb +14 -0
  61. data/spec/strategy/field/contact/random_zipcode_spec.rb +14 -0
  62. data/spec/strategy/field/datetime/anonymize_date_spec.rb +27 -0
  63. data/spec/strategy/field/datetime/anonymize_datetime_spec.rb +57 -0
  64. data/spec/strategy/field/datetime/anonymize_time_spec.rb +57 -0
  65. data/spec/strategy/field/datetime/date_delta_spec.rb +36 -0
  66. data/spec/strategy/field/{date_time_delta_spec.rb → datetime/date_time_delta_spec.rb} +3 -2
  67. data/spec/strategy/field/datetime/time_delta_spec.rb +44 -0
  68. data/spec/strategy/field/default_anon_spec.rb +42 -0
  69. data/spec/strategy/field/email/gmail_template_spec.rb +17 -0
  70. data/spec/strategy/field/{random_email_spec.rb → email/random_email_spec.rb} +2 -2
  71. data/spec/strategy/field/email/random_mailinator_email_spec.rb +14 -0
  72. data/spec/strategy/field/{random_first_name_spec.rb → name/random_first_name_spec.rb} +2 -2
  73. data/spec/strategy/field/{random_full_name_spec.rb → name/random_full_name_spec.rb} +2 -2
  74. data/spec/strategy/field/{random_last_name_spec.rb → name/random_last_name_spec.rb} +2 -2
  75. data/spec/strategy/field/{random_user_name_spec.rb → name/random_user_name_spec.rb} +2 -2
  76. data/spec/strategy/field/{random_float_delta_spec.rb → number/random_float_delta_spec.rb} +2 -2
  77. data/spec/strategy/field/number/random_float_spec.rb +28 -0
  78. data/spec/strategy/field/{random_integer_delta_spec.rb → number/random_integer_delta_spec.rb} +3 -5
  79. data/spec/strategy/field/{random_int_spec.rb → number/random_integer_spec.rb} +4 -4
  80. data/spec/strategy/field/random_boolean_spec.rb +2 -2
  81. data/spec/strategy/field/string/formatted_string_numbers_spec.rb +15 -0
  82. data/spec/strategy/field/{lorem_ipsum_spec.rb → string/lorem_ipsum_spec.rb} +2 -2
  83. data/spec/strategy/field/{random_string_spec.rb → string/random_string_spec.rb} +2 -2
  84. data/spec/strategy/field/{distinct_column_values_spec.rb → string/select_from_database_spec.rb} +3 -3
  85. data/spec/strategy/field/{random_selection_spec.rb → string/select_from_list_spec.rb} +5 -5
  86. data/spec/strategy/field/{string_template_spec.rb → string/string_template_spec.rb} +2 -2
  87. data/spec/strategy/field/whitelist_spec.rb +2 -2
  88. data/spec/support/customer_sample.rb +1 -1
  89. data/spec/utils/database_spec.rb +2 -2
  90. data/spec/utils/geojson_parser_spec.rb +38 -0
  91. data/whitelist_dsl.rb +4 -6
  92. metadata +163 -59
  93. data/lib/strategy/field/anonymize_time.rb +0 -57
  94. data/lib/strategy/field/gmail_template.rb +0 -17
  95. data/lib/strategy/field/random_first_name.rb +0 -18
  96. data/lib/strategy/field/random_last_name.rb +0 -19
  97. data/lib/strategy/field/random_selection.rb +0 -23
  98. data/lib/strategy/field/user_name_template.rb +0 -22
  99. data/spec/strategy/field/anonymize_time_spec.rb +0 -23
  100. data/spec/strategy/field/gmail_template_spec.rb +0 -14
  101. data/spec/strategy/field/random_mailinator_email_spec.rb +0 -21
  102. data/spec/strategy/field/random_phone_number_spec.rb +0 -35
  103. data/spec/strategy/field/user_name_template_spec.rb +0 -13
@@ -0,0 +1,16 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+ class GmailTemplate
5
+
6
+ def initialize username = 'someusername'
7
+ @username = username
8
+ end
9
+
10
+ def anonymize field
11
+ "#{@username}+#{field.row_number}@gmail.com"
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
@@ -9,9 +9,7 @@ module DataAnon
9
9
  end
10
10
 
11
11
  def anonymize field
12
-
13
12
  return @email_anonymizer.anonymize(field)
14
-
15
13
  end
16
14
  end
17
15
  end
@@ -1,23 +1,54 @@
1
1
  require 'strategy/field/whitelist'
2
- require 'strategy/field/string_template'
3
- require 'strategy/field/user_name_template'
4
- require 'strategy/field/random_string'
5
- require 'strategy/field/random_int'
6
2
  require 'strategy/field/random_boolean'
7
- require 'strategy/field/anonymize_time'
8
- require 'strategy/field/random_integer_delta'
9
- require 'strategy/field/random_float_delta'
10
- require 'strategy/field/random_selection'
11
- require 'strategy/field/distinct_column_values'
12
- require 'strategy/field/lorem_ipsum'
13
- require 'strategy/field/gmail_template'
14
- require 'strategy/field/date_time_delta'
15
- require 'strategy/field/default_anon'
16
- require 'strategy/field/random_email'
17
- require 'strategy/field/random_mailinator_email'
18
- require 'strategy/field/random_phone_number'
19
- require 'strategy/field/random_first_name'
20
- require 'strategy/field/random_last_name'
21
- require 'strategy/field/random_full_name'
22
- require 'strategy/field/random_user_name'
3
+
23
4
  require 'strategy/field/anonymous'
5
+
6
+ # string
7
+ require 'strategy/field/string/lorem_ipsum'
8
+ require 'strategy/field/string/string_template'
9
+ require 'strategy/field/string/random_string'
10
+ require 'strategy/field/string/formatted_string_numbers'
11
+
12
+ require 'strategy/field/string/select_from_file'
13
+ require 'strategy/field/string/select_from_list'
14
+ require 'strategy/field/string/select_from_database'
15
+
16
+ # number
17
+ require 'strategy/field/number/random_integer'
18
+ require 'strategy/field/number/random_float'
19
+ require 'strategy/field/number/random_integer_delta'
20
+ require 'strategy/field/number/random_float_delta'
21
+
22
+ # contact
23
+ require 'strategy/field/contact/geojson_base'
24
+ require 'strategy/field/contact/random_phone_number'
25
+ require 'strategy/field/contact/random_address'
26
+ require 'strategy/field/contact/random_zipcode'
27
+ require 'strategy/field/contact/random_city'
28
+ require 'strategy/field/contact/random_province'
29
+
30
+ # datetime
31
+ require 'strategy/field/datetime/anonymize_time'
32
+ require 'strategy/field/datetime/anonymize_datetime'
33
+ require 'strategy/field/datetime/anonymize_date'
34
+ require 'strategy/field/datetime/date_time_delta'
35
+ require 'strategy/field/datetime/time_delta'
36
+ require 'strategy/field/datetime/date_delta'
37
+
38
+ # email
39
+ require 'strategy/field/email/random_email'
40
+ require 'strategy/field/email/gmail_template'
41
+ require 'strategy/field/email/random_mailinator_email'
42
+
43
+ # name
44
+ require 'strategy/field/name/random_first_name'
45
+ require 'strategy/field/name/random_last_name'
46
+ require 'strategy/field/name/random_full_name'
47
+ require 'strategy/field/name/random_user_name'
48
+
49
+
50
+
51
+ FieldStrategy = DataAnon::Strategy::Field
52
+
53
+ require 'strategy/field/default_anon'
54
+
@@ -0,0 +1,14 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class RandomFirstName < SelectFromFile
6
+
7
+ def initialize file_path = nil
8
+ super(file_path || DataAnon::Utils::Resource.file('first_names.txt'))
9
+ end
10
+
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class RandomLastName < SelectFromFile
6
+
7
+ def initialize file_path = nil
8
+ super(file_path || DataAnon::Utils::Resource.file('last_names.txt'))
9
+ end
10
+
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,23 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class RandomFloat
7
+
8
+ def initialize from = 0.0, to = 100.0
9
+ @from = from
10
+ @to = to
11
+
12
+ end
13
+
14
+ def anonymize field
15
+ DataAnon::Utils::RandomFloat.generate(@from,@to)
16
+ end
17
+
18
+ end
19
+
20
+
21
+ end
22
+ end
23
+ end
@@ -3,10 +3,8 @@ module DataAnon
3
3
  module Field
4
4
  class RandomFloatDelta
5
5
 
6
- DEFAULT_DELTA = 10.0
7
-
8
- def initialize delta = nil
9
- @delta = delta || DEFAULT_DELTA
6
+ def initialize delta = 10.0
7
+ @delta = delta
10
8
  end
11
9
 
12
10
  def anonymize field
@@ -3,7 +3,7 @@ module DataAnon
3
3
  module Field
4
4
 
5
5
 
6
- class RandomInt
6
+ class RandomInteger
7
7
 
8
8
  def initialize from = 0, to = 100
9
9
  @from = from
@@ -3,11 +3,8 @@ module DataAnon
3
3
  module Field
4
4
  class RandomIntegerDelta
5
5
 
6
- DEFAULT_DELTA = 10
7
-
8
- def initialize delta = nil
9
- @delta = delta || DEFAULT_DELTA
10
-
6
+ def initialize delta = 10
7
+ @delta = delta
11
8
  end
12
9
 
13
10
  def anonymize field
@@ -3,7 +3,7 @@ module DataAnon
3
3
  module Field
4
4
 
5
5
 
6
- class RandomPhoneNumber
6
+ class FormattedStringNumber
7
7
 
8
8
  def anonymize field
9
9
  @original_phone_number = field.value
@@ -18,7 +18,10 @@ module DataAnon
18
18
 
19
19
  @anonymized_phone_number
20
20
  end
21
+
21
22
  end
23
+
24
+
22
25
  end
23
26
  end
24
27
  end
@@ -2,18 +2,17 @@ module DataAnon
2
2
  module Strategy
3
3
  module Field
4
4
 
5
- class DistinctColumnValues
5
+ class SelectFromDatabase
6
6
  include Utils::Logging
7
7
 
8
8
  def initialize table_name, field_name
9
- source = Utils::SourceTable.create table_name
9
+ source = Utils::SourceTable.create table_name, []
10
10
  @values = source.select(field_name).uniq.collect { |record| record[field_name]}
11
11
  logger.debug "For field strategy #{table_name}:#{field_name} using values #{@values} "
12
12
 
13
13
  end
14
14
 
15
15
  def anonymize field
16
- return @values[0] if @values.length == 1
17
16
  @values[DataAnon::Utils::RandomInt.generate(0,(@values.length - 1))]
18
17
  end
19
18
 
@@ -0,0 +1,18 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+ class SelectFromFile
6
+
7
+ def initialize file_path
8
+ @values = File.read(file_path).split
9
+ end
10
+
11
+ def anonymize field
12
+ @values.sample
13
+ end
14
+
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,17 @@
1
+ module DataAnon
2
+ module Strategy
3
+ module Field
4
+
5
+
6
+ class SelectFromList < SelectFromFile
7
+
8
+ def initialize values
9
+ @values = values.class == Array ? values : [values]
10
+ end
11
+
12
+ end
13
+
14
+
15
+ end
16
+ end
17
+ end
@@ -5,14 +5,16 @@ module DataAnon
5
5
  def process_record(index, record)
6
6
  dest_record_map = {}
7
7
  record.attributes.each do |field_name, field_value|
8
- unless field_value.nil? || field_name.downcase == @primary_key.downcase
8
+ unless field_value.nil? || is_primary_key?(field_name)
9
9
  field = DataAnon::Core::Field.new(field_name, field_value, index, record)
10
10
  field_strategy = @fields[field_name.downcase] || DataAnon::Strategy::Field::DefaultAnon.new(@user_strategies)
11
11
  dest_record_map[field_name] = field_strategy.anonymize(field)
12
12
  end
13
13
  end
14
14
  dest_record = dest_table.new dest_record_map
15
- dest_record[@primary_key] = record[@primary_key]
15
+ @primary_keys.each do |key|
16
+ dest_record[key] = record[key]
17
+ end
16
18
  dest_record.save!
17
19
  end
18
20
 
@@ -1,4 +1,5 @@
1
1
  require 'active_record'
2
+ require 'composite_primary_keys'
2
3
  require 'logger'
3
4
 
4
5
  module DataAnon
@@ -23,10 +24,11 @@ module DataAnon
23
24
 
24
25
  class BaseTable
25
26
 
26
- def self.create_table table_name, primary_key, database
27
+ def self.create_table database, table_name, primary_keys
27
28
  Class.new(database) do
28
29
  self.table_name = table_name
29
- self.primary_key = primary_key
30
+ self.primary_keys = primary_keys if primary_keys.length > 1
31
+ self.primary_key = primary_keys[0] if primary_keys.length == 1
30
32
  self.mass_assignment_sanitizer = MassAssignmentIgnoreSanitizer.new(self)
31
33
  end
32
34
  end
@@ -35,16 +37,16 @@ module DataAnon
35
37
 
36
38
  class SourceTable < BaseTable
37
39
 
38
- def self.create table_name, primary_key = nil
39
- create_table table_name, primary_key, SourceDatabase
40
+ def self.create table_name, primary_key
41
+ create_table SourceDatabase, table_name, primary_key
40
42
  end
41
43
 
42
44
  end
43
45
 
44
46
  class DestinationTable < BaseTable
45
47
 
46
- def self.create table_name, primary_key = nil
47
- create_table table_name, primary_key, DestinationDatabase
48
+ def self.create table_name, primary_key
49
+ create_table DestinationDatabase, table_name, primary_key
48
50
  end
49
51
 
50
52
  end
@@ -0,0 +1,42 @@
1
+ require 'rgeo/geo_json'
2
+
3
+ module DataAnon
4
+ module Utils
5
+ class GeojsonParser
6
+
7
+
8
+ def self.address file_path
9
+ self.new(file_path).parse 'address'
10
+ end
11
+
12
+ def self.zipcode file_path
13
+ self.new(file_path).parse 'postcode'
14
+ end
15
+
16
+ def self.province file_path
17
+ self.new(file_path).parse 'province'
18
+ end
19
+
20
+ def self.city file_path
21
+ self.new(file_path).parse 'city'
22
+ end
23
+
24
+ def self.country file_path
25
+ self.new(file_path).parse 'country'
26
+ end
27
+
28
+ def initialize file_path
29
+ @places = File.read(file_path).split(/\n/)
30
+ end
31
+
32
+ def parse property
33
+ result_list = []
34
+ @places.each do |loc|
35
+ geom = RGeo::GeoJSON.decode(loc, :json_parser => :json)
36
+ result_list.push(geom[property])
37
+ end
38
+ result_list
39
+ end
40
+ end
41
+ end
42
+ end
data/lib/utils/logging.rb CHANGED
@@ -15,15 +15,6 @@ module DataAnon
15
15
  @@logger
16
16
  end
17
17
 
18
- def progress_logger
19
- @@progress_logger ||= (self.progress_logger = Logger.new(STDOUT) )
20
- end
21
-
22
- def progress_logger= logger
23
- logger.formatter = proc { |severity, datetime, progname, msg| msg }
24
- @@progress_logger = logger
25
- end
26
-
27
18
  end
28
19
  end
29
20
  end
@@ -0,0 +1,29 @@
1
+ module DataAnon
2
+ module Utils
3
+
4
+ class ProgressBar
5
+
6
+ def initialize table_name, total
7
+ @total = total
8
+ @table_name = table_name
9
+ @progress_bar = PowerBar.new if total > 0 && show_progress
10
+ end
11
+
12
+ def show_progress
13
+ ENV['show_progress'] != 'false'
14
+ end
15
+
16
+ def show index
17
+ if @progress_bar && ((index % 1000 == 0) || (index == @total) || (index == 1))
18
+ @progress_bar.show(:msg => "Table: #{@table_name} (#{index}/#{@total})", :done => index, :total => @total)
19
+ end
20
+ end
21
+
22
+ def close
23
+ @progress_bar.close if @progress_bar
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+ end