my_obfuscate 0.4.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1 +1,2 @@
1
+ 7/17/2013 - Switch Postgres to use COPY statements and refactor internals. Thanks @samuelreh!
1
2
  3/4/2013 - Switch to the ffaker gem for speed. Add WalkerMethod and an English language frequency dictionary for generating random texts.
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in my_obfuscate.gemspec
4
4
  gemspec
5
+
6
+ gem 'rake'
data/README.rdoc CHANGED
@@ -1,5 +1,7 @@
1
1
  = MyObfuscate
2
2
 
3
+ {<img src="https://travis-ci.org/mavenlink/my_obfuscate.png">}[https://travis-ci.org/mavenlink/my_obfuscate]
4
+
3
5
  You want to develop against real production data, but you don't want to violate your users' privacy. Enter MyObfuscate: standalone Ruby code for the selective rewriting of SQL dumps in order to protect user privacy. It supports MySQL, Postgres, and SQL Server.
4
6
 
5
7
  = Install
@@ -64,6 +66,10 @@ builtin SQL Server support by specifying:
64
66
  obfuscator.database_type = :sql_server
65
67
  obfuscator.database_type = :postgres
66
68
 
69
+ If using Postgres, use pg_dump to get a dump:
70
+
71
+ pg_dump database | ruby obfuscator.rb > obfuscated_dump.sql
72
+
67
73
  == Types
68
74
 
69
75
  Available types include: email, string, lorem, name, first_name, last_name, address, street_address, city, state,
@@ -0,0 +1,146 @@
1
class MyObfuscate
  # Rewrites the values of a single row according to a table's obfuscation
  # config. All methods are class-level; the class holds no instance state
  # (only lazily-built class-variable caches).
  class ConfigApplicator

    # Obfuscates +row+ in place and returns it.
    #
    # row          - Array of column values, positionally matching +columns+.
    # table_config - Hash of column name => definition. A definition is either
    #                a Symbol (shorthand for { :type => symbol }) or a Hash with
    #                :type plus optional :if, :unless, :skip_regexes and
    #                type-specific options. Any non-Hash config (e.g. :keep)
    #                leaves the row untouched.
    # columns      - Array of column names for the row.
    #
    # Config entries naming a column that is not present in +columns+ are
    # skipped. (Previously the nil index was coerced with +to_i+, which
    # silently overwrote the first column of the row.)
    def self.apply_table_config(row, table_config, columns)
      return row unless table_config.is_a?(Hash)
      row_hash = row_as_hash(row, columns)

      table_config.each do |column, definition|
        index = columns.index(column)
        # Nothing to obfuscate when the configured column is not in this row.
        next if index.nil?

        definition = { :type => definition } if definition.is_a?(Symbol)

        if definition.has_key?(:unless)
          unless_check = make_conditional_method(definition[:unless], index, row)

          next if unless_check.call(row_hash)
        end

        if definition.has_key?(:if)
          if_check = make_conditional_method(definition[:if], index, row)

          next unless if_check.call(row_hash)
        end

        if definition[:skip_regexes]
          next if definition[:skip_regexes].any? { |regex| row[index] =~ regex }
        end

        row[index] = case definition[:type]
          when :email
            md5 = Digest::MD5.hexdigest(rand.to_s)[0...5]
            clean_quotes("#{Faker::Internet.email}.#{md5}.example.com")
          when :string
            random_string(definition[:length] || 30, definition[:chars] || SENSIBLE_CHARS)
          when :lorem
            clean_bad_whitespace(clean_quotes(Faker::Lorem.sentences(definition[:number] || 1).join(". ")))
          when :like_english
            clean_quotes random_english_sentences(definition[:number] || 1)
          when :name
            clean_quotes(Faker::Name.name)
          when :first_name
            clean_quotes(Faker::Name.first_name)
          when :last_name
            clean_quotes(Faker::Name.last_name)
          when :address
            clean_quotes("#{Faker::AddressUS.street_address}\\n#{Faker::AddressUS.city}, #{Faker::AddressUS.state_abbr} #{Faker::AddressUS.zip_code}")
          when :street_address
            clean_bad_whitespace(clean_quotes(Faker::AddressUS.street_address))
          when :city
            clean_quotes(Faker::AddressUS.city)
          when :state
            clean_quotes Faker::AddressUS.state_abbr
          when :zip_code
            Faker::AddressUS.zip_code
          when :phone
            clean_quotes Faker::PhoneNumber.phone_number
          when :company
            clean_bad_whitespace(clean_quotes(Faker::Company.name))
          when :ipv4
            Faker::Internet.ip_v4_address
          when :ipv6
            # Inlined from Faker because ffaker doesn't have ipv6.
            @@ip_v6_space ||= (0..65535).to_a
            container = (1..8).map { |_| @@ip_v6_space.sample }
            container.map { |n| n.to_s(16) }.join(':')
          when :url
            clean_bad_whitespace(Faker::Internet.http_url)
          when :integer
            random_integer(definition[:between] || (0..1000)).to_s
          when :fixed
            if definition[:one_of]
              definition[:one_of][(rand * definition[:one_of].length).to_i]
            else
              definition[:string].is_a?(Proc) ? definition[:string].call(row_hash) : definition[:string]
            end
          when :null
            nil
          when :keep
            row[index]
          else
            $stderr.puts "Keeping a column value by providing an unknown type (#{definition[:type]}) is deprecated. Use :keep instead."
            row[index]
        end
      end
      row
    end

    # Builds a Hash of column name => value from parallel arrays.
    def self.row_as_hash(row, columns)
      columns.zip(row).inject({}) { |m, (name, value)| m[name] = value; m }
    end

    # Turns the :blank / :nil shorthand symbols into lambdas that inspect
    # row[index]; anything else (e.g. a user-supplied Proc) is returned as is.
    # The returned callable is invoked with the row hash.
    def self.make_conditional_method(conditional_method, index, row)
      if conditional_method.is_a?(Symbol)
        if conditional_method == :blank
          conditional_method = lambda { |row_hash| row[index].nil? || row[index] == '' }
        elsif conditional_method == :nil
          conditional_method = lambda { |row_hash| row[index].nil? }
        end
      end
      conditional_method
    end

    # Returns a random Integer between between.min and between.max (inclusive).
    def self.random_integer(between)
      (between.min + (between.max - between.min) * rand).round
    end

    # Returns a random string built from +chars+. +length_or_range+ is either
    # an exact Integer length or a Range of acceptable lengths.
    def self.random_string(length_or_range, chars)
      # Integer rather than Fixnum: Fixnum was removed in Ruby 3.2, and
      # Integer matches on every Ruby version.
      length_or_range = (length_or_range..length_or_range) if length_or_range.is_a?(Integer)
      times = random_integer(length_or_range)
      out = ""
      times.times { out << chars[(rand * chars.length).to_i] }
      out
    end

    # Locates the English word-frequency dictionary. The historical location
    # (relative to this file's directory) is tried first; because this code now
    # lives one directory deeper than the file it was extracted from, a
    # sibling data/ directory is tried as a fallback.
    # NOTE(review): confirm where en_50K.txt actually ships in the gem.
    # Falls back to the first candidate so a missing file still raises
    # Errno::ENOENT for the original path.
    def self.english_dictionary_path
      base = File.dirname(__FILE__)
      candidates = [
        File.join(base, 'my_obfuscate', 'data', 'en_50K.txt'),
        File.join(base, 'data', 'en_50K.txt')
      ].map { |path| File.expand_path(path) }
      candidates.find { |path| File.exist?(path) } || candidates.first
    end

    # Returns +num+ pseudo-English sentences of 3 to 7 words each, with words
    # drawn from the frequency dictionary via WalkerMethod. The sampler is
    # built once and cached in a class variable.
    def self.random_english_sentences(num)
      @@walker_method ||= begin
        words, counts = [], []
        File.read(english_dictionary_path).each_line do |line|
          word, count = line.split(/\s+/)
          words << word
          counts << count.to_i
        end
        WalkerMethod.new(words, counts)
      end

      sentences = []
      num.times do
        words = []
        (3 + rand * 5).to_i.times { words << @@walker_method.random }
        sentences << words.join(" ") + "."
        sentences.last[0] = sentences.last[0].upcase
      end
      sentences.join(" ")
    end

    # Removes single and double quotes so the value cannot terminate a quoted
    # SQL string.
    def self.clean_quotes(value)
      value.gsub(/['"]/, '')
    end

    # Removes newlines, tabs and carriage returns.
    def self.clean_bad_whitespace(value)
      value.gsub(/[\n\t\r]/, '')
    end

  end
end
@@ -0,0 +1,45 @@
1
class MyObfuscate
  module CopyStatementParser

    # Postgres dumps use COPY statements instead of INSERT. They look like:
    #
    #   COPY some_table (a, b, c, d) FROM stdin;
    #   1	2	3	4
    #   5	6	7	8
    #   \.
    #
    # The table name and column names from the COPY header therefore have to
    # be remembered across the data lines that follow it.
    #
    # obfuscator - object responding to
    #              obfuscate_bulk_insert_line(line, table_name, columns).
    # config     - Hash keyed by table name; only used here to warn about
    #              tables that have no entry.
    # input_io   - line-enumerable input (responds to #each).
    # output_io  - output responding to #write and #puts.
    #
    # The including class must provide parse_insert_statement(line) and
    # parse_copy_statement(line); the latter returns a Hash with :table_name
    # and :column_names, or a falsy value.
    #
    # Raises RuntimeError if an INSERT statement is found outside COPY data.
    def parse(obfuscator, config, input_io, output_io)
      current_table_name = ""
      current_columns = nil
      inside_copy_statement = false

      input_io.each do |line|
        if inside_copy_statement
          # Only a line consisting solely of "\." ends the COPY data. Matching
          # any line that merely ends in a period would end obfuscation early
          # and pass the remaining rows through unobfuscated.
          if line =~ /\A\\\.\s*\z/
            inside_copy_statement = false

            output_io.write line
          else
            # Everything else inside the block is row data, even if it
            # happens to start with "INSERT INTO" or "COPY".
            output_io.puts obfuscator.obfuscate_bulk_insert_line(line, current_table_name, current_columns)
          end
        elsif parse_insert_statement(line)
          raise RuntimeError.new("Cannot obfuscate Postgres dumps containing INSERT statements. Please use COPY statments.")
        elsif table_data = parse_copy_statement(line)
          inside_copy_statement = true

          current_table_name = table_data[:table_name]
          current_columns = table_data[:column_names]

          if !config[current_table_name]
            $stderr.puts "Deprecated: #{current_table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
          end

          output_io.write line
        else
          output_io.write line
        end
      end
    end

  end
end
@@ -0,0 +1,22 @@
1
class MyObfuscate
  module InsertStatementParser

    # Streams +input_io+ to +output_io+ line by line. Lines recognised as
    # INSERT statements for a table present in +config+ are replaced by the
    # obfuscator's output; every other line is copied through unchanged. An
    # INSERT for a table missing from +config+ is copied through after a
    # deprecation warning on $stderr.
    #
    # The including class supplies parse_insert_statement(line), which returns
    # a Hash with :table_name and :column_names, or a falsy value.
    def parse(obfuscator, config, input_io, output_io)
      input_io.each do |line|
        statement = parse_insert_statement(line)

        # Not an INSERT: pass through verbatim.
        unless statement
          output_io.write line
          next
        end

        table_name = statement[:table_name]
        columns = statement[:column_names]

        # Unconfigured table: warn, then pass through verbatim.
        unless config[table_name]
          $stderr.puts "Deprecated: #{table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
          output_io.write line
          next
        end

        output_io.puts obfuscator.obfuscate_bulk_insert_line(line, table_name, columns)
      end
    end

  end
end
@@ -1,6 +1,6 @@
1
1
  class MyObfuscate
2
2
  class Mysql
3
- include MyObfuscate::DatabaseHelperShared
3
+ include MyObfuscate::InsertStatementParser
4
4
 
5
5
  def parse_insert_statement(line)
6
6
  if regex_match = insert_regex.match(line)
@@ -11,7 +11,11 @@ class MyObfuscate
11
11
  end
12
12
  end
13
13
 
14
- def make_insert_statement(table_name, column_names, values_strings)
14
+ def make_insert_statement(table_name, column_names, values)
15
+ values_strings = values.collect do |values|
16
+ "(" + values.join(",") + ")"
17
+ end.join(",")
18
+
15
19
  "INSERT INTO `#{table_name}` (`#{column_names.join('`, `')}`) VALUES #{values_strings};"
16
20
  end
17
21
 
@@ -19,5 +23,75 @@ class MyObfuscate
19
23
  /^\s*INSERT INTO `(.*?)` \((.*?)\) VALUES\s*/i
20
24
  end
21
25
 
26
# Strips the "INSERT INTO ... VALUES" prefix and the trailing semicolon from
# an INSERT line, then splits what is left into rows of field values.
def rows_to_be_inserted(line)
  values_section = line.gsub(insert_regex, '')
  values_section = values_section.gsub(/\s*;\s*$/, '')
  context_aware_mysql_string_split(values_section)
end
30
+
31
# Renders one field value as it should appear inside a MySQL VALUES tuple:
# nil becomes NULL, a hexadecimal literal (0x...) is emitted bare, and
# anything else is wrapped in single quotes.
#
# The hex check is anchored to the whole string with \A and \z. With ^ and $
# a multi-line value containing one hex-looking line would match and be
# written out unquoted.
def make_valid_value_string(value)
  if value.nil?
    "NULL"
  elsif value =~ /\A0x[0-9a-fA-F]+\z/
    value
  else
    "'" + value + "'"
  end
end
40
+
41
# Splits the VALUES portion of a MySQL bulk INSERT, e.g.
#
#   ('a','b'),('c',NULL)
#
# into an Array of rows, each an Array of field strings (nil for a bare NULL).
#
# Be aware, strings must be quoted in single quotes!
#
# A character-by-character state machine tracks whether it is inside a
# parenthesised tuple, inside a quoted string, and whether the previous
# character was a backslash. Backslashes and the characters they escape are
# kept in the field verbatim. Spaces and tabs outside quoted strings are
# dropped.
def context_aware_mysql_string_split(string)
  in_sub_insert = false
  in_quoted_string = false
  escaped = false
  current_field = nil
  fields = []
  output = []

  string.each_char do |char|
    if escaped
      # Character following a backslash: always literal.
      escaped = false
      current_field ||= ""
      current_field << char
    else
      if char == "\\"
        escaped = true
        current_field ||= ""
        current_field << char
      elsif char == "(" && !in_quoted_string && !in_sub_insert
        in_sub_insert = true
      elsif char == ")" && !in_quoted_string && in_sub_insert
        # End of a tuple: flush the pending field and the row.
        fields << current_field unless current_field.nil?
        output << fields unless fields.length == 0
        in_sub_insert = false
        fields = []
        current_field = nil
      elsif char == "'" && !in_quoted_string
        fields << current_field unless current_field.nil?
        # Start from '' (not nil) so an empty quoted string is still a field.
        current_field = ''
        in_quoted_string = true
      elsif char == "'" && in_quoted_string
        fields << current_field unless current_field.nil?
        current_field = nil
        in_quoted_string = false
      elsif char == "," && !in_quoted_string && in_sub_insert
        fields << current_field unless current_field.nil?
        current_field = nil
      elsif char == "L" && !in_quoted_string && in_sub_insert && current_field == "NUL"
        # Final letter of an unquoted NULL: record the field as nil.
        current_field = nil
        fields << current_field
      elsif (char == " " || char == "\t") && !in_quoted_string
        # Don't add whitespace not in a string
      elsif in_sub_insert
        current_field ||= ""
        current_field << char
      end
    end
  end

  # Flush anything left over when the input ends without a closing ")".
  fields << current_field unless current_field.nil?
  output << fields unless fields.length == 0
  output
end
22
96
  end
23
97
  end
@@ -1,9 +1,31 @@
1
1
  class MyObfuscate
2
2
  class Postgres
3
- include MyObfuscate::DatabaseHelperShared
3
+ include MyObfuscate::CopyStatementParser
4
4
 
5
- def parse_insert_statement(line)
6
- if regex_match = insert_regex.match(line)
5
# Copy statements contain the column values tab separated like so:
#   blah	blah	blah	blah
# which we want to turn into:
#   [['blah','blah','blah','blah']]
#
# We wrap it in an array to keep it consistent with MySql bulk
# obfuscation (multiple rows per insert statement)
#
# "\N" (the COPY marker for NULL) becomes nil. The split uses a negative
# limit so trailing empty columns are kept; without it a row ending in empty
# strings would come back with fewer values than the table has columns. The
# caller's string is not modified.
def rows_to_be_inserted(line)
  row = line.sub(/\n\z/, "").split("\t", -1).map do |value|
    value == "\\N" ? nil : value
  end

  [row]
end
26
+
27
+ def parse_copy_statement(line)
28
+ if regex_match = /^\s*COPY (.*?) \((.*?)\) FROM\s*/i.match(line)
7
29
  {
8
30
  :table_name => regex_match[1].to_sym,
9
31
  :column_names => regex_match[2].split(/\s*,\s*/).map(&:to_sym)
@@ -11,12 +33,20 @@ class MyObfuscate
11
33
  end
12
34
  end
13
35
 
14
- def make_insert_statement(table_name, column_names, values_strings)
15
- "INSERT INTO #{table_name} (#{column_names.join(', ')}) VALUES #{values_strings};"
36
+ def make_insert_statement(table_name, column_names, values)
37
+ values.join("\t")
16
38
  end
17
39
 
18
- def insert_regex
19
- /^\s*INSERT INTO (.*?) \((.*?)\) VALUES\s*/i
40
# Renders one field for a COPY data line: nil is written as the two
# characters \N, and every other value is returned unchanged.
def make_valid_value_string(value)
  return "\\N" if value.nil?

  value
end
47
+
48
+ def parse_insert_statement(line)
49
+ /^\s*INSERT INTO/i.match(line)
20
50
  end
21
51
 
22
52
  end
@@ -1,5 +1,6 @@
1
1
  class MyObfuscate
2
2
  class SqlServer
3
+ include MyObfuscate::InsertStatementParser
3
4
 
4
5
  def parse_insert_statement(line)
5
6
  if regex_match = insert_regex.match(line)
@@ -25,7 +26,11 @@ class MyObfuscate
25
26
  end
26
27
  end
27
28
 
28
- def make_insert_statement(table_name, column_names, values_strings)
29
+ def make_insert_statement(table_name, column_names, values)
30
+ values_strings = values.collect do |values|
31
+ "(" + values.join(",") + ")"
32
+ end.join(",")
33
+
29
34
  "INSERT [dbo].[#{table_name}] ([#{column_names.join("], [")}]) VALUES #{values_strings};"
30
35
  end
31
36
 
@@ -1,3 +1,3 @@
1
1
  class MyObfuscate
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/my_obfuscate.rb CHANGED
@@ -39,22 +39,7 @@ class MyObfuscate
39
39
  # Read an input stream and dump out an obfuscated output stream. These streams could be StringIO objects, Files,
40
40
  # or STDIN and STDOUT.
41
41
  def obfuscate(input_io, output_io)
42
-
43
- # We assume that every INSERT INTO line occupies one line in the file, with no internal linebreaks.
44
- input_io.each do |line|
45
- if table_data = database_helper.parse_insert_statement(line)
46
- table_name = table_data[:table_name]
47
- columns = table_data[:column_names]
48
- if config[table_name]
49
- output_io.puts obfuscate_bulk_insert_line(line, table_name, columns)
50
- else
51
- $stderr.puts "Deprecated: #{table_name} was not specified in the config. A future release will cause this to be an error. Please specify the table definition or set it to :keep."
52
- output_io.write line
53
- end
54
- else
55
- output_io.write line
56
- end
57
- end
42
+ database_helper.parse(self, config, input_io, output_io)
58
43
  end
59
44
 
60
45
  def reassembling_each_insert(line, table_name, columns)
@@ -63,146 +48,12 @@ class MyObfuscate
63
48
  result = result.map do |i|
64
49
  database_helper.make_valid_value_string(i)
65
50
  end
66
- result = result.join(",")
67
- "(" + result + ")"
68
- end.join(",")
69
- database_helper.make_insert_statement(table_name, columns, output)
70
- end
71
-
72
- def self.row_as_hash(row, columns)
73
- columns.zip(row).inject({}) {|m, (name, value)| m[name] = value; m}
74
- end
75
-
76
- def self.make_conditional_method(conditional_method, index, row)
77
- if conditional_method.is_a?(Symbol)
78
- if conditional_method == :blank
79
- conditional_method = lambda { |row_hash| row[index].nil? || row[index] == '' }
80
- elsif conditional_method == :nil
81
- conditional_method = lambda { |row_hash| row[index].nil? }
82
- end
83
51
  end
84
- conditional_method
85
- end
86
-
87
- def self.apply_table_config(row, table_config, columns)
88
- return row unless table_config.is_a?(Hash)
89
- row_hash = row_as_hash(row, columns)
90
-
91
- table_config.each do |column, definition|
92
- index = columns.index(column)
93
-
94
- definition = { :type => definition } if definition.is_a?(Symbol)
95
-
96
- if definition.has_key?(:unless)
97
- unless_check = make_conditional_method(definition[:unless], index, row)
98
-
99
- next if unless_check.call(row_hash)
100
- end
101
-
102
-
103
- if definition.has_key?(:if)
104
- if_check = make_conditional_method(definition[:if], index, row)
105
-
106
- next unless if_check.call(row_hash)
107
- end
108
-
109
- if definition[:skip_regexes]
110
- next if definition[:skip_regexes].any? {|regex| row[index] =~ regex}
111
- end
112
-
113
- row[index.to_i] = case definition[:type]
114
- when :email
115
- md5 = Digest::MD5.hexdigest(rand.to_s)[0...5]
116
- clean_quotes("#{Faker::Internet.email}.#{md5}.example.com")
117
- when :string
118
- random_string(definition[:length] || 30, definition[:chars] || SENSIBLE_CHARS)
119
- when :lorem
120
- clean_bad_whitespace(clean_quotes(Faker::Lorem.sentences(definition[:number] || 1).join(". ")))
121
- when :like_english
122
- clean_quotes random_english_sentences(definition[:number] || 1)
123
- when :name
124
- clean_quotes(Faker::Name.name)
125
- when :first_name
126
- clean_quotes(Faker::Name.first_name)
127
- when :last_name
128
- clean_quotes(Faker::Name.last_name)
129
- when :address
130
- clean_quotes("#{Faker::AddressUS.street_address}\\n#{Faker::AddressUS.city}, #{Faker::AddressUS.state_abbr} #{Faker::AddressUS.zip_code}")
131
- when :street_address
132
- clean_bad_whitespace(clean_quotes(Faker::AddressUS.street_address))
133
- when :city
134
- clean_quotes(Faker::AddressUS.city)
135
- when :state
136
- clean_quotes Faker::AddressUS.state_abbr
137
- when :zip_code
138
- Faker::AddressUS.zip_code
139
- when :phone
140
- clean_quotes Faker::PhoneNumber.phone_number
141
- when :company
142
- clean_bad_whitespace(clean_quotes(Faker::Company.name))
143
- when :ipv4
144
- Faker::Internet.ip_v4_address
145
- when :ipv6
146
- # Inlined from Faker because ffaker doesn't have ipv6.
147
- @@ip_v6_space ||= (0..65535).to_a
148
- container = (1..8).map{ |_| @@ip_v6_space.sample }
149
- container.map{ |n| n.to_s(16) }.join(':')
150
- when :url
151
- clean_bad_whitespace(Faker::Internet.http_url)
152
- when :integer
153
- random_integer(definition[:between] || (0..1000)).to_s
154
- when :fixed
155
- if definition[:one_of]
156
- definition[:one_of][(rand * definition[:one_of].length).to_i]
157
- else
158
- definition[:string].is_a?(Proc) ? definition[:string].call(row_hash) : definition[:string]
159
- end
160
- when :null
161
- nil
162
- when :keep
163
- row[index]
164
- else
165
- $stderr.puts "Keeping a column value by providing an unknown type (#{definition[:type]}) is deprecated. Use :keep instead."
166
- row[index]
167
- end
168
- end
169
- row
170
- end
171
-
172
- def self.random_integer(between)
173
- (between.min + (between.max - between.min) * rand).round
174
- end
175
-
176
- def self.random_string(length_or_range, chars)
177
- length_or_range = (length_or_range..length_or_range) if length_or_range.is_a?(Fixnum)
178
- times = random_integer(length_or_range)
179
- out = ""
180
- times.times { out << chars[rand * chars.length] }
181
- out
182
- end
183
-
184
- def self.random_english_sentences(num)
185
- @@walker_method ||= begin
186
- words, counts = [], []
187
- File.read(File.expand_path(File.join(File.dirname(__FILE__), 'my_obfuscate', 'data', 'en_50K.txt'))).each_line do |line|
188
- word, count = line.split(/\s+/)
189
- words << word
190
- counts << count.to_i
191
- end
192
- WalkerMethod.new(words, counts)
193
- end
194
-
195
- sentences = []
196
- num.times do
197
- words = []
198
- (5 + rand * 6).to_i.times { words << @@walker_method.random }
199
- sentences << words.join(" ") + "."
200
- sentences.last[0] = sentences.last[0].upcase
201
- end
202
- sentences.join(" ")
52
+ database_helper.make_insert_statement(table_name, columns, output)
203
53
  end
204
54
 
205
55
  def check_for_defined_columns_not_in_table(table_name, columns)
56
+ return unless config[table_name]
206
57
  missing_columns = config[table_name].keys - columns
207
58
  unless missing_columns.length == 0
208
59
  error_message = missing_columns.map do |missing_column|
@@ -233,23 +84,16 @@ class MyObfuscate
233
84
  check_for_table_columns_not_in_definition(table_name, columns) if fail_on_unspecified_columns?
234
85
  # Note: Remember to SQL escape strings in what you pass back.
235
86
  reassembling_each_insert(line, table_name, columns) do |row|
236
- MyObfuscate.apply_table_config(row, table_config, columns)
87
+ ConfigApplicator.apply_table_config(row, table_config, columns)
237
88
  end
238
89
  end
239
90
  end
240
91
 
241
- private
242
-
243
- def self.clean_quotes(value)
244
- value.gsub(/['"]/, '')
245
- end
246
-
247
- def self.clean_bad_whitespace(value)
248
- value.gsub(/[\n\t\r]/, '')
249
- end
250
92
  end
251
93
 
252
- require 'my_obfuscate/database_helper_shared'
94
+ require 'my_obfuscate/copy_statement_parser'
95
+ require 'my_obfuscate/insert_statement_parser'
253
96
  require 'my_obfuscate/mysql'
254
97
  require 'my_obfuscate/sql_server'
255
98
  require 'my_obfuscate/postgres'
99
+ require 'my_obfuscate/config_applicator'