my_obfuscate 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1 +1,2 @@
1
+ 7/17/2013 - Switch Postgres to use COPY statements and refactor internals. Thanks @samuelreh!
1
2
  3/4/2013 - Switch to the ffaker gem for speed. Add WalkerMethod and an English language frequency dictionary for generating random texts.
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in my_obfuscate.gemspec
4
4
  gemspec
5
+
6
+ gem 'rake'
data/README.rdoc CHANGED
@@ -1,5 +1,7 @@
1
1
  = MyObfuscate
2
2
 
3
+ {<img src="https://travis-ci.org/mavenlink/my_obfuscate.png">}[https://travis-ci.org/mavenlink/my_obfuscate]
4
+
3
5
  You want to develop against real production data, but you don't want to violate your users' privacy. Enter MyObfuscate: standalone Ruby code for the selective rewriting of SQL dumps in order to protect user privacy. It supports MySQL, Postgres, and SQL Server.
4
6
 
5
7
  = Install
@@ -64,6 +66,10 @@ builtin SQL Server support by specifying:
64
66
  obfuscator.database_type = :sql_server
65
67
  obfuscator.database_type = :postgres
66
68
 
69
+ If using Postgres, use pg_dump to get a dump:
70
+
71
+ pg_dump database | ruby obfuscator.rb > obfuscated_dump.sql
72
+
67
73
  == Types
68
74
 
69
75
  Available types include: email, string, lorem, name, first_name, last_name, address, street_address, city, state,
@@ -0,0 +1,146 @@
class MyObfuscate
  # Rewrites the values of a single table row according to that table's
  # obfuscation config. Every method is a class method.
  class ConfigApplicator

    # Obfuscates +row+ in place and returns it.
    #
    # row          - Array of column values, positionally aligned with +columns+.
    # table_config - Hash of column name => definition. A definition is either
    #                a Symbol (shorthand for { :type => symbol }) or a Hash
    #                holding :type plus the optional keys read below (:if,
    #                :unless, :skip_regexes, :length, :chars, :number,
    #                :between, :one_of, :string). When table_config is not a
    #                Hash the row is returned untouched.
    # columns      - Array of column names for the table.
    def self.apply_table_config(row, table_config, columns)
      return row unless table_config.is_a?(Hash)
      row_hash = row_as_hash(row, columns)

      table_config.each do |column, definition|
        index = columns.index(column)
        # A configured column that is absent from this table has nothing to
        # rewrite. Without this guard `index.to_i` turned nil into 0 and the
        # generated value silently overwrote the first column.
        next if index.nil?

        definition = { :type => definition } if definition.is_a?(Symbol)

        if definition.has_key?(:unless)
          unless_check = make_conditional_method(definition[:unless], index, row)
          next if unless_check.call(row_hash)
        end

        if definition.has_key?(:if)
          if_check = make_conditional_method(definition[:if], index, row)
          next unless if_check.call(row_hash)
        end

        if definition[:skip_regexes]
          next if definition[:skip_regexes].any? { |regex| row[index] =~ regex }
        end

        row[index] = case definition[:type]
          when :email
            md5 = Digest::MD5.hexdigest(rand.to_s)[0...5]
            clean_quotes("#{Faker::Internet.email}.#{md5}.example.com")
          when :string
            random_string(definition[:length] || 30, definition[:chars] || SENSIBLE_CHARS)
          when :lorem
            clean_bad_whitespace(clean_quotes(Faker::Lorem.sentences(definition[:number] || 1).join(". ")))
          when :like_english
            clean_quotes random_english_sentences(definition[:number] || 1)
          when :name
            clean_quotes(Faker::Name.name)
          when :first_name
            clean_quotes(Faker::Name.first_name)
          when :last_name
            clean_quotes(Faker::Name.last_name)
          when :address
            clean_quotes("#{Faker::AddressUS.street_address}\\n#{Faker::AddressUS.city}, #{Faker::AddressUS.state_abbr} #{Faker::AddressUS.zip_code}")
          when :street_address
            clean_bad_whitespace(clean_quotes(Faker::AddressUS.street_address))
          when :city
            clean_quotes(Faker::AddressUS.city)
          when :state
            clean_quotes Faker::AddressUS.state_abbr
          when :zip_code
            Faker::AddressUS.zip_code
          when :phone
            clean_quotes Faker::PhoneNumber.phone_number
          when :company
            clean_bad_whitespace(clean_quotes(Faker::Company.name))
          when :ipv4
            Faker::Internet.ip_v4_address
          when :ipv6
            # Inlined from Faker because ffaker doesn't have ipv6.
            @@ip_v6_space ||= (0..65535).to_a
            container = (1..8).map { |_| @@ip_v6_space.sample }
            container.map { |n| n.to_s(16) }.join(':')
          when :url
            clean_bad_whitespace(Faker::Internet.http_url)
          when :integer
            random_integer(definition[:between] || (0..1000)).to_s
          when :fixed
            if definition[:one_of]
              definition[:one_of][(rand * definition[:one_of].length).to_i]
            else
              definition[:string].is_a?(Proc) ? definition[:string].call(row_hash) : definition[:string]
            end
          when :null
            nil
          when :keep
            row[index]
          else
            $stderr.puts "Keeping a column value by providing an unknown type (#{definition[:type]}) is deprecated. Use :keep instead."
            row[index]
        end
      end
      row
    end

    # Returns a Hash mapping each column name to the value at the same
    # position in +row+.
    def self.row_as_hash(row, columns)
      columns.zip(row).inject({}) { |m, (name, value)| m[name] = value; m }
    end

    # Turns an :if / :unless setting into something that answers #call.
    #
    # :blank becomes a lambda that is true when row[index] is nil or '';
    # :nil becomes a lambda that is true when row[index] is nil. The lambdas
    # ignore their row_hash argument and read +row+ directly. Any other value
    # (typically a Proc) is returned unchanged.
    def self.make_conditional_method(conditional_method, index, row)
      if conditional_method.is_a?(Symbol)
        if conditional_method == :blank
          conditional_method = lambda { |row_hash| row[index].nil? || row[index] == '' }
        elsif conditional_method == :nil
          conditional_method = lambda { |row_hash| row[index].nil? }
        end
      end
      conditional_method
    end

    # Returns a random Integer between between.min and between.max
    # (both inclusive), obtained by rounding a scaled rand.
    def self.random_integer(between)
      (between.min + (between.max - between.min) * rand).round
    end

    # Builds a random string from the characters in +chars+.
    #
    # length_or_range - an Integer for an exact length, or a range of lengths.
    # chars           - indexable collection of characters to draw from.
    def self.random_string(length_or_range, chars)
      # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
      # removed in 3.2, where referencing it raises NameError.
      length_or_range = (length_or_range..length_or_range) if length_or_range.is_a?(Integer)
      times = random_integer(length_or_range)
      out = ""
      times.times { out << chars[(rand * chars.length).to_i] }
      out
    end

    # Returns +num+ sentences of 3 to 7 words each, drawn from the word
    # frequency list via WalkerMethod. Each sentence has its first character
    # upcased and ends with a period; sentences are joined by single spaces.
    # The sampler is built once and cached in a class variable.
    def self.random_english_sentences(num)
      @@walker_method ||= begin
        words, counts = [], []
        File.read(english_frequency_file).each_line do |line|
          word, count = line.split(/\s+/)
          words << word
          counts << count.to_i
        end
        WalkerMethod.new(words, counts)
      end

      sentences = []
      num.times do
        words = []
        (3 + rand * 5).to_i.times { words << @@walker_method.random }
        sentences << words.join(" ") + "."
        sentences.last[0] = sentences.last[0].upcase
      end
      sentences.join(" ")
    end

    # Locates en_50K.txt relative to this file.
    #
    # NOTE(review): this file is loaded as 'my_obfuscate/config_applicator',
    # so the data directory is presumably a sibling ('data/'); the original
    # 'my_obfuscate/data' sub-path is kept as the fallback so behavior is
    # unchanged if the layout is otherwise - confirm against the gem layout.
    def self.english_frequency_file
      here = File.dirname(__FILE__)
      candidates = [
        File.expand_path(File.join(here, 'data', 'en_50K.txt')),
        File.expand_path(File.join(here, 'my_obfuscate', 'data', 'en_50K.txt'))
      ]
      candidates.find { |path| File.exist?(path) } || candidates.last
    end

    # Removes every single and double quote character from +value+.
    def self.clean_quotes(value)
      value.gsub(/['"]/, '')
    end

    # Removes every newline, tab and carriage return from +value+.
    def self.clean_bad_whitespace(value)
      value.gsub(/[\n\t\r]/, '')
    end

  end
end
@@ -0,0 +1,45 @@
class MyObfuscate
  module CopyStatementParser

    # Postgres dumps use COPY statements instead of INSERTs. They look like:
    #
    #   COPY some_table (a, b, c, d) FROM stdin;
    #   1 2 3 4
    #   5 6 7 8
    #   \.
    #
    # so the table name and column names have to be remembered across the
    # data lines that follow the COPY header.
    #
    # The including class must provide parse_insert_statement(line) and
    # parse_copy_statement(line); the latter returns a Hash with :table_name
    # and :column_names (or a falsy value when the line is not a COPY header).
    #
    # obfuscator - object responding to
    #              obfuscate_bulk_insert_line(line, table_name, columns).
    # config     - table name => table definition; only used here to warn on
    #              $stderr about tables that have no entry.
    # input_io   - line-enumerable source of the dump.
    # output_io  - destination receiving the rewritten dump.
    #
    # Raises RuntimeError when a line is recognised as an INSERT statement.
    def parse(obfuscator, config, input_io, output_io)
      current_table_name = ""
      current_columns = nil
      inside_copy_statement = false

      input_io.each do |line|
        if parse_insert_statement(line)
          raise RuntimeError.new("Cannot obfuscate Postgres dumps containing INSERT statements. Please use COPY statments.")
        elsif table_data = parse_copy_statement(line)
          inside_copy_statement = true

          current_table_name = table_data[:table_name]
          current_columns = table_data[:column_names]

          if !config[current_table_name]
            $stderr.puts "Deprecated: #{current_table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
          end

          output_io.write line
        elsif line =~ /\A\\\.\s*\z/
          # End-of-data marker: a backslash and a period alone on the line.
          # The previous pattern (/\S*\.\n/) matched ANY line ending in a
          # period, so a data row such as "Hello world." switched COPY mode
          # off and the rows after it were written out un-obfuscated.
          inside_copy_statement = false

          output_io.write line
        elsif inside_copy_statement
          output_io.puts obfuscator.obfuscate_bulk_insert_line(line, current_table_name, current_columns)
        else
          output_io.write line
        end
      end
    end

  end
end
@@ -0,0 +1,22 @@
class MyObfuscate
  module InsertStatementParser

    # Streams a dump from input_io to output_io one line at a time.
    #
    # Lines that the including class's parse_insert_statement recognises are
    # replaced by obfuscator.obfuscate_bulk_insert_line(...) when the table
    # has an entry in config; recognised lines for unconfigured tables are
    # copied through after a deprecation warning on $stderr. Every other line
    # is copied through unchanged.
    def parse(obfuscator, config, input_io, output_io)
      input_io.each do |line|
        table_data = parse_insert_statement(line)

        unless table_data
          output_io.write line
          next
        end

        table_name = table_data[:table_name]
        columns = table_data[:column_names]

        unless config[table_name]
          $stderr.puts "Deprecated: #{table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
          output_io.write line
          next
        end

        output_io.puts obfuscator.obfuscate_bulk_insert_line(line, table_name, columns)
      end
    end

  end
end
@@ -1,6 +1,6 @@
1
1
  class MyObfuscate
2
2
  class Mysql
3
- include MyObfuscate::DatabaseHelperShared
3
+ include MyObfuscate::InsertStatementParser
4
4
 
5
5
  def parse_insert_statement(line)
6
6
  if regex_match = insert_regex.match(line)
@@ -11,7 +11,11 @@ class MyObfuscate
11
11
  end
12
12
  end
13
13
 
14
- def make_insert_statement(table_name, column_names, values_strings)
# Builds a MySQL bulk INSERT statement.
#
# values is an Array of rows, each row an Array of already-quoted value
# strings; every row becomes one parenthesised, comma-joined group.
def make_insert_statement(table_name, column_names, values)
  row_groups = values.map { |row_values| "(#{row_values.join(',')})" }
  values_strings = row_groups.join(",")

  "INSERT INTO `#{table_name}` (`#{column_names.join('`, `')}`) VALUES #{values_strings};"
end
17
21
 
@@ -19,5 +23,75 @@ class MyObfuscate
19
23
  /^\s*INSERT INTO `(.*?)` \((.*?)\) VALUES\s*/i
20
24
  end
21
25
 
# Strips the leading "INSERT INTO ... VALUES" header and the trailing
# semicolon from an INSERT line, then hands the remaining value groups to
# context_aware_mysql_string_split.
def rows_to_be_inserted(line)
  without_header = line.gsub(insert_regex, '')
  values_only = without_header.gsub(/\s*;\s*$/, '')
  context_aware_mysql_string_split(values_only)
end
30
+
# Renders one parsed value as a MySQL literal.
#
# nil becomes NULL, a value that is entirely a 0x-prefixed hex literal is
# passed through unquoted, and everything else is wrapped in single quotes.
def make_valid_value_string(value)
  if value.nil?
    "NULL"
  elsif value =~ /\A0x[0-9a-fA-F]+\z/
    # \A/\z anchor the whole string. With ^/$ a multi-line value whose first
    # line merely looked like hex was emitted unquoted into the statement.
    value
  else
    "'" + value + "'"
  end
end
40
+
# Splits the VALUES part of a MySQL INSERT, e.g. "('a','b'),(NULL,12)", into
# an Array of rows, each an Array of field strings (nil for an unquoted
# NULL). Quotes around strings are dropped; backslash escape sequences are
# kept verbatim, backslash included.
#
# Be aware, strings must be quoted in single quotes!
def context_aware_mysql_string_split(string)
  in_sub_insert = false     # between "(" and ")" of one row
  in_quoted_string = false  # between an opening and a closing single quote
  escaped = false           # previous character was a backslash
  current_field = nil
  fields = []
  output = []

  string.each_char do |char|
    if escaped
      # Character following a backslash is always taken literally.
      escaped = false
      current_field ||= ""
      current_field << char
    else
      if char == "\\"
        escaped = true
        current_field ||= ""
        current_field << char
      elsif char == "(" && !in_quoted_string && !in_sub_insert
        in_sub_insert = true
      elsif char == ")" && !in_quoted_string && in_sub_insert
        # End of a row: flush the pending field and the row itself.
        fields << current_field unless current_field.nil?
        output << fields unless fields.length == 0
        in_sub_insert = false
        fields = []
        current_field = nil
      elsif char == "'" && !in_quoted_string
        fields << current_field unless current_field.nil?
        current_field = ''
        in_quoted_string = true
      elsif char == "'" && in_quoted_string
        fields << current_field unless current_field.nil?
        current_field = nil
        in_quoted_string = false
      elsif char == "," && !in_quoted_string && in_sub_insert
        fields << current_field unless current_field.nil?
        current_field = nil
      elsif char == "L" && !in_quoted_string && in_sub_insert && current_field == "NUL"
        # Unquoted NULL completed: record it as nil rather than as text.
        current_field = nil
        fields << current_field
      elsif (char == " " || char == "\t") && !in_quoted_string
        # Don't add whitespace not in a string
      elsif in_sub_insert
        current_field ||= ""
        current_field << char
      end
    end
  end

  # Flush whatever is left when the input ends without a closing ")".
  fields << current_field unless current_field.nil?
  output << fields unless fields.length == 0
  output
end
22
96
  end
23
97
  end
@@ -1,9 +1,31 @@
1
1
  class MyObfuscate
2
2
  class Postgres
3
- include MyObfuscate::DatabaseHelperShared
3
+ include MyObfuscate::CopyStatementParser
4
4
 
5
- def parse_insert_statement(line)
6
- if regex_match = insert_regex.match(line)
# Copy statements contain the column values tab separated like so:
#   blah	blah	blah	blah
# which we want to turn into:
#   [['blah','blah','blah','blah']]
#
# We wrap it in an array to keep it consistent with MySql bulk
# obfuscation (multiple rows per insert statement)
#
# A field that is exactly \N becomes nil. The caller's string is not
# modified.
def rows_to_be_inserted(line)
  # sub (not gsub!) so the argument is left untouched and frozen strings work.
  # A limit of -1 keeps trailing empty fields; the default split drops them,
  # which shortened rows whose last columns were empty strings.
  row = line.sub(/\n\z/, "").split("\t", -1).map do |value|
    value == "\\N" ? nil : value
  end

  [row]
end
26
+
27
+ def parse_copy_statement(line)
28
+ if regex_match = /^\s*COPY (.*?) \((.*?)\) FROM\s*/i.match(line)
7
29
  {
8
30
  :table_name => regex_match[1].to_sym,
9
31
  :column_names => regex_match[2].split(/\s*,\s*/).map(&:to_sym)
@@ -11,12 +33,20 @@ class MyObfuscate
11
33
  end
12
34
  end
13
35
 
14
- def make_insert_statement(table_name, column_names, values_strings)
15
- "INSERT INTO #{table_name} (#{column_names.join(', ')}) VALUES #{values_strings};"
# Re-creates one COPY data line: the values joined by tab characters.
# table_name and column_names are accepted for interface parity with the
# other database helpers and are not used.
def make_insert_statement(table_name, column_names, values)
  field_separator = "\t"
  values.join(field_separator)
end
17
39
 
18
- def insert_regex
19
- /^\s*INSERT INTO (.*?) \((.*?)\) VALUES\s*/i
# Renders one value for a COPY data line: nil is written as \N, any other
# value is returned as-is.
def make_valid_value_string(value)
  return "\\N" if value.nil?

  value
end
47
+
# Returns a MatchData when the line begins (after optional whitespace) with
# INSERT INTO, case-insensitively; nil otherwise.
def parse_insert_statement(line)
  insert_prefix = /^\s*INSERT INTO/i
  insert_prefix.match(line)
end
21
51
 
22
52
  end
@@ -1,5 +1,6 @@
1
1
  class MyObfuscate
2
2
  class SqlServer
3
+ include MyObfuscate::InsertStatementParser
3
4
 
4
5
  def parse_insert_statement(line)
5
6
  if regex_match = insert_regex.match(line)
@@ -25,7 +26,11 @@ class MyObfuscate
25
26
  end
26
27
  end
27
28
 
28
- def make_insert_statement(table_name, column_names, values_strings)
# Builds a SQL Server bulk INSERT statement.
#
# values is an Array of rows, each row an Array of already-quoted value
# strings; every row becomes one parenthesised, comma-joined group.
def make_insert_statement(table_name, column_names, values)
  row_groups = values.map { |row_values| "(#{row_values.join(',')})" }
  values_strings = row_groups.join(",")

  "INSERT [dbo].[#{table_name}] ([#{column_names.join("], [")}]) VALUES #{values_strings};"
end
31
36
 
@@ -1,3 +1,3 @@
1
1
  class MyObfuscate
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/my_obfuscate.rb CHANGED
@@ -39,22 +39,7 @@ class MyObfuscate
39
39
  # Read an input stream and dump out an obfuscated output stream. These streams could be StringIO objects, Files,
40
40
  # or STDIN and STDOUT.
41
41
  def obfuscate(input_io, output_io)
42
-
43
- # We assume that every INSERT INTO line occupies one line in the file, with no internal linebreaks.
44
- input_io.each do |line|
45
- if table_data = database_helper.parse_insert_statement(line)
46
- table_name = table_data[:table_name]
47
- columns = table_data[:column_names]
48
- if config[table_name]
49
- output_io.puts obfuscate_bulk_insert_line(line, table_name, columns)
50
- else
51
- $stderr.puts "Deprecated: #{table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
52
- output_io.write line
53
- end
54
- else
55
- output_io.write line
56
- end
57
- end
42
+ database_helper.parse(self, config, input_io, output_io)
58
43
  end
59
44
 
60
45
  def reassembling_each_insert(line, table_name, columns)
@@ -63,146 +48,12 @@ class MyObfuscate
63
48
  result = result.map do |i|
64
49
  database_helper.make_valid_value_string(i)
65
50
  end
66
- result = result.join(",")
67
- "(" + result + ")"
68
- end.join(",")
69
- database_helper.make_insert_statement(table_name, columns, output)
70
- end
71
-
72
- def self.row_as_hash(row, columns)
73
- columns.zip(row).inject({}) {|m, (name, value)| m[name] = value; m}
74
- end
75
-
76
- def self.make_conditional_method(conditional_method, index, row)
77
- if conditional_method.is_a?(Symbol)
78
- if conditional_method == :blank
79
- conditional_method = lambda { |row_hash| row[index].nil? || row[index] == '' }
80
- elsif conditional_method == :nil
81
- conditional_method = lambda { |row_hash| row[index].nil? }
82
- end
83
51
  end
84
- conditional_method
85
- end
86
-
87
- def self.apply_table_config(row, table_config, columns)
88
- return row unless table_config.is_a?(Hash)
89
- row_hash = row_as_hash(row, columns)
90
-
91
- table_config.each do |column, definition|
92
- index = columns.index(column)
93
-
94
- definition = { :type => definition } if definition.is_a?(Symbol)
95
-
96
- if definition.has_key?(:unless)
97
- unless_check = make_conditional_method(definition[:unless], index, row)
98
-
99
- next if unless_check.call(row_hash)
100
- end
101
-
102
-
103
- if definition.has_key?(:if)
104
- if_check = make_conditional_method(definition[:if], index, row)
105
-
106
- next unless if_check.call(row_hash)
107
- end
108
-
109
- if definition[:skip_regexes]
110
- next if definition[:skip_regexes].any? {|regex| row[index] =~ regex}
111
- end
112
-
113
- row[index.to_i] = case definition[:type]
114
- when :email
115
- md5 = Digest::MD5.hexdigest(rand.to_s)[0...5]
116
- clean_quotes("#{Faker::Internet.email}.#{md5}.example.com")
117
- when :string
118
- random_string(definition[:length] || 30, definition[:chars] || SENSIBLE_CHARS)
119
- when :lorem
120
- clean_bad_whitespace(clean_quotes(Faker::Lorem.sentences(definition[:number] || 1).join(". ")))
121
- when :like_english
122
- clean_quotes random_english_sentences(definition[:number] || 1)
123
- when :name
124
- clean_quotes(Faker::Name.name)
125
- when :first_name
126
- clean_quotes(Faker::Name.first_name)
127
- when :last_name
128
- clean_quotes(Faker::Name.last_name)
129
- when :address
130
- clean_quotes("#{Faker::AddressUS.street_address}\\n#{Faker::AddressUS.city}, #{Faker::AddressUS.state_abbr} #{Faker::AddressUS.zip_code}")
131
- when :street_address
132
- clean_bad_whitespace(clean_quotes(Faker::AddressUS.street_address))
133
- when :city
134
- clean_quotes(Faker::AddressUS.city)
135
- when :state
136
- clean_quotes Faker::AddressUS.state_abbr
137
- when :zip_code
138
- Faker::AddressUS.zip_code
139
- when :phone
140
- clean_quotes Faker::PhoneNumber.phone_number
141
- when :company
142
- clean_bad_whitespace(clean_quotes(Faker::Company.name))
143
- when :ipv4
144
- Faker::Internet.ip_v4_address
145
- when :ipv6
146
- # Inlined from Faker because ffaker doesn't have ipv6.
147
- @@ip_v6_space ||= (0..65535).to_a
148
- container = (1..8).map{ |_| @@ip_v6_space.sample }
149
- container.map{ |n| n.to_s(16) }.join(':')
150
- when :url
151
- clean_bad_whitespace(Faker::Internet.http_url)
152
- when :integer
153
- random_integer(definition[:between] || (0..1000)).to_s
154
- when :fixed
155
- if definition[:one_of]
156
- definition[:one_of][(rand * definition[:one_of].length).to_i]
157
- else
158
- definition[:string].is_a?(Proc) ? definition[:string].call(row_hash) : definition[:string]
159
- end
160
- when :null
161
- nil
162
- when :keep
163
- row[index]
164
- else
165
- $stderr.puts "Keeping a column value by providing an unknown type (#{definition[:type]}) is deprecated. Use :keep instead."
166
- row[index]
167
- end
168
- end
169
- row
170
- end
171
-
172
- def self.random_integer(between)
173
- (between.min + (between.max - between.min) * rand).round
174
- end
175
-
176
- def self.random_string(length_or_range, chars)
177
- length_or_range = (length_or_range..length_or_range) if length_or_range.is_a?(Fixnum)
178
- times = random_integer(length_or_range)
179
- out = ""
180
- times.times { out << chars[rand * chars.length] }
181
- out
182
- end
183
-
184
- def self.random_english_sentences(num)
185
- @@walker_method ||= begin
186
- words, counts = [], []
187
- File.read(File.expand_path(File.join(File.dirname(__FILE__), 'my_obfuscate', 'data', 'en_50K.txt'))).each_line do |line|
188
- word, count = line.split(/\s+/)
189
- words << word
190
- counts << count.to_i
191
- end
192
- WalkerMethod.new(words, counts)
193
- end
194
-
195
- sentences = []
196
- num.times do
197
- words = []
198
- (5 + rand * 6).to_i.times { words << @@walker_method.random }
199
- sentences << words.join(" ") + "."
200
- sentences.last[0] = sentences.last[0].upcase
201
- end
202
- sentences.join(" ")
52
+ database_helper.make_insert_statement(table_name, columns, output)
203
53
  end
204
54
 
205
55
  def check_for_defined_columns_not_in_table(table_name, columns)
56
+ return unless config[table_name]
206
57
  missing_columns = config[table_name].keys - columns
207
58
  unless missing_columns.length == 0
208
59
  error_message = missing_columns.map do |missing_column|
@@ -233,23 +84,16 @@ class MyObfuscate
233
84
  check_for_table_columns_not_in_definition(table_name, columns) if fail_on_unspecified_columns?
234
85
  # Note: Remember to SQL escape strings in what you pass back.
235
86
  reassembling_each_insert(line, table_name, columns) do |row|
236
- MyObfuscate.apply_table_config(row, table_config, columns)
87
+ ConfigApplicator.apply_table_config(row, table_config, columns)
237
88
  end
238
89
  end
239
90
  end
240
91
 
241
- private
242
-
243
- def self.clean_quotes(value)
244
- value.gsub(/['"]/, '')
245
- end
246
-
247
- def self.clean_bad_whitespace(value)
248
- value.gsub(/[\n\t\r]/, '')
249
- end
250
92
  end
251
93
 
252
- require 'my_obfuscate/database_helper_shared'
94
+ require 'my_obfuscate/copy_statement_parser'
95
+ require 'my_obfuscate/insert_statement_parser'
253
96
  require 'my_obfuscate/mysql'
254
97
  require 'my_obfuscate/sql_server'
255
98
  require 'my_obfuscate/postgres'
99
+ require 'my_obfuscate/config_applicator'