groonga-delta 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,391 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "arrow"
17
+ require "mysql2"
18
+
19
+ require_relative "error"
20
+ require_relative "writer"
21
+
22
+ module GroongaDelta
23
+ class MySQLSource
24
# Prepares an importer from the application configuration.
#
# config: must respond to #logger, #delta_dir, #mysql and #mapping.
# status: must respond to #mysql (the MySQL part of the saved status).
def initialize(config, status)
  @logger = config.logger
  @config = config.mysql
  @mapping = config.mapping
  @status = status.mysql
  @binlog_dir = @config.binlog_dir
  @writer = Writer.new(@logger, config.delta_dir)
  # Column definitions that were already looked up, filled on demand.
  @tables = {}
end
33
+
34
# Imports changes from MySQL with the backend selected by the
# GROONGA_DELTA_IMPORT_MYSQL_SOURCE_BACKEND environment variable.
#
# "mysqlbinlog" and "mysql2-replication" select a backend explicitly.
# Any other value (including unset) tries mysql2-replication first and
# falls back to mysqlbinlog when that library can't be loaded.
#
# The backend libraries are required here on purpose so that only the
# backend that is really used has to be installed.
def import
  backend = ENV["GROONGA_DELTA_IMPORT_MYSQL_SOURCE_BACKEND"]
  if backend == "mysqlbinlog"
    require "mysql_binlog"
    return import_mysqlbinlog
  end
  if backend == "mysql2-replication"
    require "mysql2-replication"
    return import_mysql2_replication
  end

  begin
    require "mysql2-replication"
  rescue LoadError
    require "mysql_binlog"
    import_mysqlbinlog
  else
    import_mysql2_replication
  end
end
53
+
54
+ private
55
# Imports row changes by downloading a raw binlog file with the
# mysqlbinlog command and parsing it with the mysql_binlog library.
#
# Events located before the saved position are skipped. Rotate events
# only update the saved status; row events are passed to
# import_rows_event.
def import_mysqlbinlog
  file, position = read_current_status
  FileUtils.mkdir_p(@binlog_dir)
  local_file = File.join(@binlog_dir, file)
  # The download is skipped when the successor of the local file name
  # (String#succ, e.g. "...000001" -> "...000002") already exists.
  # NOTE(review): presumably because the current file is complete
  # then -- confirm.
  unless File.exist?(local_file.succ)
    command_line = [@config.mysqlbinlog].flatten
    connection_options = {
      "host" => @config.host,
      "port" => @config.port,
      "socket" => @config.socket,
      "user" => @config.replication_slave_user,
      "password" => @config.replication_slave_password,
    }
    connection_options.each do |name, value|
      command_line << "--#{name}=#{value}" if value
    end
    command_line.push("--read-from-remote-server",
                      "--raw",
                      "--result-file=#{@binlog_dir}/",
                      file)
    spawn_process(*command_line) do |_pid, _output_read, _error_read|
      # Nothing to do here: spawn_process waits for the command after
      # the block returns.
    end
  end

  reader = MysqlBinlog::BinlogFileReader.new(local_file)
  binlog = MysqlBinlog::Binlog.new(reader)
  binlog.checksum = @config.checksum
  binlog.ignore_rotate = true
  rows_event_types = [
    :write_rows_event_v1,
    :write_rows_event_v2,
    :update_rows_event_v1,
    :update_rows_event_v2,
    :delete_rows_event_v1,
    :delete_rows_event_v2,
  ]
  binlog.each_event do |event|
    next if event[:position] < position

    type = event[:type]
    if type == :rotate_event
      rotate = event[:event]
      @status.update("file" => rotate[:name],
                     "position" => rotate[:pos])
    elsif rows_event_types.include?(type)
      # Drop the "_v1"/"_v2" suffix: both versions are handled the same.
      normalized_type = type.to_s.sub(/_v\d\z/, "").to_sym
      table = event[:event][:table]
      next_position = event[:header][:next_position]
      import_rows_event(normalized_type,
                        table[:db],
                        table[:table],
                        file,
                        next_position) do
        # Deleted rows only have a "before" image; written and updated
        # rows are taken from the "after" image.
        image_key = normalized_type == :delete_rows_event ? :before : :after
        event[:event][:row_image].collect do |row_image|
          build_row(row_image[image_key][:image])
        end
      end
      position = next_position
    end
  end
end
115
+
116
# Imports row changes by streaming replication events with the
# mysql2-replication library, starting from the saved file/position.
#
# Rotate events switch the file name that is recorded with the
# following row events; row events are passed to import_rows_event.
def import_mysql2_replication
  file, position = read_current_status
  is_mysql_56_or_later =
    mysql(@config.select_user, @config.select_password) do |select_client|
      mysql_version(select_client) >= Gem::Version.new("5.6")
    end
  mysql(@config.replication_slave_user,
        @config.replication_slave_password) do |client|
    # Servers older than 5.6 get checksum: "NONE".
    replication_client =
      if is_mysql_56_or_later
        Mysql2Replication::Client.new(client)
      else
        Mysql2Replication::Client.new(client, checksum: "NONE")
      end
    replication_client.file_name = file
    replication_client.start_position = position
    replication_client.open do
      replication_client.each do |event|
        case event
        when Mysql2Replication::RotateEvent
          file = event.file_name
        when Mysql2Replication::RowsEvent
          # "WriteRowsEvent" -> :write_rows_event and so on.
          class_base_name = event.class.name.split("::").last
          words = class_base_name.scan(/[A-Z][a-z]+/).collect(&:downcase)
          normalized_type = words.join("_").to_sym
          table_map = event.table_map
          import_rows_event(normalized_type,
                            table_map.database,
                            table_map.table,
                            file,
                            event.next_position) do
            if normalized_type == :update_rows_event
              event.updated_rows
            else
              event.rows
            end
          end
        end
      end
    end
  end
end
161
+
162
# Converts the rows of one binlog rows event to Groonga records and
# writes them as a delta.
#
# type: :write_rows_event, :update_rows_event or :delete_rows_event.
# database_name, table_name: the MySQL table the event belongs to.
# file, next_position: saved as the new status after a successful write.
# block: returns the rows of the event; it is only called when the
#        table is part of the mapping.
#
# Nothing is written and the status is left untouched when the table
# isn't mapped or when the event produces no records.
def import_rows_event(type,
                      database_name,
                      table_name,
                      file,
                      next_position,
                      &block)
  source_table = @mapping[database_name, table_name]
  return if source_table.nil?

  table = find_table(database_name, table_name)
  groonga_table = source_table.groonga_table
  groonga_records = block.call.collect do |row|
    groonga_table.generate_record(build_record(table, row))
  end
  return if groonga_records.empty?

  if type == :delete_rows_event
    # Deletes only need the keys of the removed records.
    keys = groonga_records.collect { |record| record[:_key] }
    @writer.write_deletes(groonga_table.name, keys)
  elsif [:write_rows_event, :update_rows_event].include?(type)
    @writer.write_upserts(groonga_table.name, groonga_records)
  end
  @status.update("file" => file,
                 "position" => next_position)
end
194
+
195
# Waits for a spawned command and raises ProcessError when it failed.
#
# command_line: the command as an array; only used for the message.
# pid: the process to wait for.
# output_read, error_read: readable ends of the pipes connected to the
#   standard output/error of the process. They are only read on failure.
#
# Returns nil on success. Errors of the wait itself (SystemCallError)
# are ignored on purpose.
def wait_process(command_line, pid, output_read, error_read)
  begin
    _, status = Process.waitpid2(pid)
  rescue SystemCallError
    return nil
  end
  return nil if status.success?

  message = [
    "Failed to run: #{command_line.join(' ')}\n",
    "--- output ---\n",
    output_read.read,
    "--------------\n",
    "--- error ----\n",
    error_read.read,
    "--------------\n",
  ].join
  raise ProcessError, message
end
212
+
213
# Runs a command with LC_ALL=C and its standard output/error connected
# to pipes.
#
# Without a block, returns [pid, output_read, error_read] and leaves
# waiting and closing to the caller.
#
# With a block, yields (pid, output_read, error_read), then waits for
# the command with wait_process and closes the pipes. When the block
# raises, the command is sent TERM before waiting and the error is
# raised again.
def spawn_process(*command_line)
  output_read, output_write = IO.pipe
  error_read, error_write = IO.pipe
  pid = spawn({"LC_ALL" => "C"},
              *command_line,
              {:out => output_write, :err => error_write})
  # The write ends now belong to the child; keeping them open here
  # would prevent the read ends from ever reaching EOF.
  [output_write, error_write].each(&:close)
  return [pid, output_read, error_read] unless block_given?

  begin
    yield(pid, output_read, error_read)
  rescue
    begin
      Process.kill(:TERM, pid)
    rescue SystemCallError
      # The process may be gone already.
    end
    raise
  ensure
    wait_process(command_line, pid, output_read, error_read)
    [output_read, error_read].each do |input|
      input.close unless input.closed?
    end
  end
end
244
+
245
# Opens a MySQL connection as the given user, yields it and closes it
# afterwards (also when the block raises).
#
# Host, port and socket come from the configuration. Every option that
# is nil or false is left out.
#
# Returns the value of the block.
def mysql(user, password)
  candidates = {
    host: @config.host,
    port: @config.port,
    socket: @config.socket,
    username: user,
    password: password,
  }
  options = candidates.select { |_name, value| value }
  client = Mysql2::Client.new(**options)
  begin
    yield(client)
  ensure
    client.close unless client.closed?
  end
end
259
+
260
# Returns the server version as a Gem::Version.
#
# client: a connection that responds to #query(sql, as: :array).
#
# Only the leading dotted numeric part of "SELECT version()" is used.
# Real servers append build information (for example "5.7.33-log" or
# "10.5.9-MariaDB-1:10.5.9+maria~focal"). Gem::Version either rejects
# such strings as malformed or treats them as prereleases, which sort
# below the plain release.
#
# Raises ArgumentError (from Gem::Version) when the version doesn't
# start with a number.
def mysql_version(client)
  version = client.query("SELECT version()", as: :array).first.first
  numeric_version = version[/\A\d+(?:\.\d+)*/]
  # No numeric prefix: let Gem::Version report the malformed string.
  Gem::Version.new(numeric_version || version)
end
264
+
265
# Returns [file, position] of the binlog to continue from.
#
# When a file is already saved in the status, the saved values are
# returned as is. Otherwise the current master status is read, all
# existing data is imported with import_existing_data and the master
# status is saved as the new status.
def read_current_status
  return [@status.file, @status.position] if @status.file

  file = nil
  position = 0
  mysql(@config.replication_client_user,
        @config.replication_client_password) do |lock_client|
    # The statement order matters: the binlog position is read and the
    # snapshot transaction is started while the tables are locked.
    lock_client.query("FLUSH TABLES WITH READ LOCK")
    master_status = lock_client.query("SHOW MASTER STATUS").first
    file = master_status["File"]
    position = master_status["Position"]
    mysql(@config.select_user,
          @config.select_password) do |select_client|
      snapshot_sql = "START TRANSACTION WITH CONSISTENT SNAPSHOT"
      if mysql_version(select_client) >= Gem::Version.new("5.6")
        snapshot_sql += ", READ ONLY"
      end
      select_client.query(snapshot_sql)
      # NOTE(review): presumably closing this connection releases the
      # read lock so that writers aren't blocked during the (long)
      # initial import -- confirm against the MySQL manual.
      lock_client.close
      import_existing_data(select_client)
      select_client.query("ROLLBACK")
    end
  end
  @status.update("file" => file,
                 "position" => position)
  [file, position]
end
295
+
296
# Imports all current rows of every mapped table as upserts.
#
# client: the connection to read with (the caller controls the
#         transaction).
#
# Mapped tables that don't exist in MySQL are skipped. Rows are
# streamed and written in batches of
# @config.initial_import_batch_size rows.
def import_existing_data(client)
  @mapping.source_databases.each do |source_database|
    source_database.source_tables.each do |source_table|
      existence_check = client.prepare(<<~SQL)
        SELECT COUNT(*) AS n_tables
        FROM information_schema.tables
        WHERE
        table_schema = ? AND
        table_name = ?
      SQL
      n_tables = existence_check.execute(source_database.name,
                                         source_table.name).first["n_tables"]
      existence_check.close
      next if n_tables.zero?

      full_table_name = "#{source_database.name}.#{source_table.name}"
      # Column names and the filter come from the mapping configuration
      # and are put into the statement as is.
      column_list = source_table.source_column_names.join(", ")
      select = "SELECT #{column_list} FROM #{full_table_name}"
      filter = source_table.source_filter
      select << " WHERE #{filter}" if filter
      rows = client.query(select,
                          symbolize_keys: true,
                          cache_rows: false,
                          stream: true)
      groonga_table = source_table.groonga_table
      target_message = "#{full_table_name} -> #{groonga_table.name}"
      @logger.info("Start importing: #{target_message}")
      n_rows = 0
      batch_size = @config.initial_import_batch_size
      rows.to_enum(:each).each_slice(batch_size) do |batch|
        @logger.info("Generating records: #{target_message}")
        record_batch = groonga_table.generate_record_batch(batch)
        @logger.info("Generated records: #{target_message}")
        @writer.write_upserts(groonga_table.name, record_batch.to_table)
        n_rows += batch.size
        @logger.info("Importing: #{target_message}: " +
                     "#{n_rows}(+#{batch.size})")
      end
      @logger.info("Imported: #{target_message}: #{n_rows}")
    end
  end
end
342
+
343
# Returns the column definitions of a MySQL table as an array of
# {name:, ordinal_position:, data_type:, is_primary_key:} hashes
# sorted by ordinal position.
#
# The definitions are read from information_schema.columns on first
# use and cached afterwards.
def find_table(database_name, table_name)
  # The database is part of the cache key: tables with the same name
  # in different databases must not share column definitions.
  cache_key = [database_name, table_name]
  return @tables[cache_key] if @tables.key?(cache_key)

  mysql(@config.select_user,
        @config.select_password) do |client|
    # The aliases fix the letter case of the result keys: MySQL 8.0
    # reports information_schema columns in upper case unless they
    # are aliased.
    statement = client.prepare(<<~SQL)
      SELECT column_name AS column_name,
             ordinal_position AS ordinal_position,
             data_type AS data_type,
             column_key AS column_key
      FROM information_schema.columns
      WHERE
        table_schema = ? AND
        table_name = ?
    SQL
    begin
      result = statement.execute(database_name, table_name)
      columns = result.collect do |column|
        {
          name: column["column_name"],
          ordinal_position: column["ordinal_position"],
          data_type: column["data_type"],
          is_primary_key: column["column_key"] == "PRI",
        }
      end
    ensure
      # Release the server side statement once all rows are read.
      statement.close
    end
    @tables[cache_key] = columns.sort_by do |column|
      column[:ordinal_position]
    end
  end
end
372
+
373
# Merges a list of {column_index => value} pairs into one row hash.
# A later pair wins when the same column index appears twice.
def build_row(value_pairs)
  value_pairs.each_with_object({}) do |value_pair, row|
    value_pair.each do |column_index, value|
      row[column_index] = value
    end
  end
end
382
+
383
# Converts a row keyed by column index to a record keyed by column
# name (as a symbol).
#
# table: column definitions indexable by column index; each needs :name.
# row: {column_index => value}.
def build_record(table, row)
  row.each_with_object({}) do |(column_index, value), record|
    column_name = table[column_index][:name]
    record[column_name.to_sym] = value
  end
end
390
+ end
391
+ end
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+ require "yaml"
18
+
19
module GroongaDelta
  # Key/value state that is persisted as YAML in "<dir>/status.yaml".
  class Status
    # dir: the directory that contains (or will contain) status.yaml.
    def initialize(dir)
      @dir = dir
      @path = File.join(@dir, "status.yaml")
      @data = load_data
    end

    # Returns the saved value of key or nil.
    def [](key)
      @data[key]
    end

    # Merges data into the state and saves the whole state.
    #
    # The state is written to a temporary file that is renamed to
    # status.yaml afterwards, so an interrupted write can't leave a
    # truncated status.yaml behind.
    def update(data)
      @data.update(data)
      FileUtils.mkdir_p(@dir)
      temporary_path = "#{@path}.tmp"
      File.open(temporary_path, "w") do |output|
        output.puts(YAML.dump(@data))
      end
      FileUtils.mv(temporary_path, @path)
    end

    private
    # Reads the saved state. A missing file and a file without a
    # mapping (for example an empty file, for which YAML.load doesn't
    # return a Hash) both result in an empty state.
    def load_data
      return {} unless File.exist?(@path)

      data = YAML.load(File.read(@path))
      data.is_a?(Hash) ? data : {}
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module GroongaDelta
  # The version of groonga-delta.
  VERSION = "1.0.0"
end
@@ -0,0 +1,135 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+ require "json"
18
+
19
+ require "groonga/command"
20
+ require "parquet"
21
+
22
module GroongaDelta
  # Writes delta files (schema changes, upserts and deletes) below a
  # base directory. Every file is written under a temporary dot name
  # first and moved to its final name when it is complete.
  class Writer
    # logger: must respond to #info.
    # dir: the base directory of the delta files.
    def initialize(logger, dir)
      @logger, @dir = logger, dir
    end

    # Writes records that should be added to or updated in table.
    #
    # An Arrow::Table is saved as a Parquet file. Anything else must
    # yield hashes from #each and is written as a Groonga "load"
    # command; Time values are written as local
    # "YYYY-MM-DD HH:MM:SS" strings.
    def write_upserts(table, records, packed: false)
      if records.is_a?(Arrow::Table)
        write_data(table,
                   "upsert",
                   ".parquet",
                   packed: packed,
                   open_output: false) do |path|
          records.save(path, format: :parquet)
        end
      else
        write_data(table, "upsert", ".grn", packed: packed) do |output|
          n_written = 0
          records.each do |record|
            if n_written.zero?
              output.puts("load --table #{table}")
              output.print("[")
            else
              output.print(",")
            end
            output.puts
            output.print(record_to_json(record))
            n_written += 1
          end
          # Without records no command is written at all.
          unless n_written.zero?
            output.puts
            output.puts("]")
          end
        end
      end
    end

    # Writes one Groonga "delete" command per key.
    def write_deletes(table, keys)
      write_data(table, "delete", ".grn") do |output|
        command = Groonga::Command::Delete.new
        command[:table] = table
        keys.each do |key|
          command[:key] = key
          output.puts(command.to_command_format)
        end
      end
    end

    # Writes a schema changing command.
    def write_schema(command)
      write_entry("schema", ".grn") do |output|
        output.puts(command.to_command_format)
      end
    end

    private
    # Renders one record as a JSON object.
    def record_to_json(record)
      members = record.collect do |key, value|
        json_value =
          if value.is_a?(Time)
            value.dup.localtime.strftime("%Y-%m-%d %H:%M:%S").to_json
          else
            value.to_json
          end
        "#{key.to_s.to_json}:#{json_value}"
      end
      "{#{members.join(",")}}"
    end

    # Creates one delta file named after the current UTC time.
    #
    # The block receives an open File (open_output: true) or only the
    # temporary path (open_output: false). Unpacked entries go to
    # "<dir>/<prefix>/<day>/"; packed entries get a directory of their
    # own below "<dir>/<prefix>/packed/", which is moved as a whole.
    def write_entry(prefix, suffix, packed: false, open_output: true)
      timestamp = Time.now.utc
      base_name = timestamp.strftime("%Y-%m-%d-%H-%M-%S-%N#{suffix}")
      if packed
        parent = "#{@dir}/#{prefix}/packed"
        packed_name = timestamp.strftime("%Y-%m-%d-%H-%M-%S-%N")
        temporary_path = "#{parent}/.#{packed_name}/#{base_name}"
        path = "#{parent}/#{packed_name}/#{base_name}"
        move_source = File.dirname(temporary_path)
        move_destination = File.dirname(path)
      else
        parent = "#{@dir}/#{prefix}/#{timestamp.strftime("%Y-%m-%d")}"
        temporary_path = "#{parent}/.#{base_name}"
        path = "#{parent}/#{base_name}"
        move_source = temporary_path
        move_destination = path
      end
      @logger.info("Start writing: #{temporary_path}")
      FileUtils.mkdir_p(File.dirname(temporary_path))
      if open_output
        File.open(temporary_path, "w") { |output| yield(output) }
      else
        yield(temporary_path)
      end
      FileUtils.mv(move_source, move_destination)
      @logger.info("Wrote: #{path}")
    end

    # Writes an entry below "data/<table>" whose name ends with
    # "-<action><suffix>".
    def write_data(table,
                   action,
                   suffix,
                   packed: false,
                   open_output: true,
                   &block)
      write_entry("data/#{table}",
                  "-#{action}#{suffix}",
                  packed: packed,
                  open_output: open_output,
                  &block)
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "groonga-delta/apply-command"
17
+ require_relative "groonga-delta/import-command"
18
+ require_relative "groonga-delta/version"