groonga-delta 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "arrow"
17
+ require "mysql2"
18
+
19
+ require_relative "error"
20
+ require_relative "writer"
21
+
22
+ module GroongaDelta
23
+ class MySQLSource
24
# Sets up the MySQL import source.
#
# config: overall import configuration; #logger, #delta_dir, #mysql
#         and #mapping are read from it.
# status: import status; only its #mysql part is kept.
def initialize(config, status)
  @config = config.mysql
  @mapping = config.mapping
  @status = status.mysql
  @binlog_dir = @config.binlog_dir
  @logger = config.logger
  @writer = Writer.new(@logger, config.delta_dir)
  # Column metadata cache; starts empty.
  @tables = {}
end
33
+
34
# Imports changes from MySQL with the backend selected by the
# GROONGA_DELTA_IMPORT_MYSQL_SOURCE_BACKEND environment variable:
#
# * "mysqlbinlog": always use the mysql_binlog library.
# * "mysql2-replication": always use mysql2-replication (LoadError
#   is propagated when it can't be loaded).
# * anything else: prefer mysql2-replication and fall back to
#   mysql_binlog when mysql2-replication can't be loaded.
def import
  requested = ENV["GROONGA_DELTA_IMPORT_MYSQL_SOURCE_BACKEND"]
  use_mysqlbinlog = (requested == "mysqlbinlog")
  unless use_mysqlbinlog
    begin
      require "mysql2-replication"
    rescue LoadError
      # An explicitly requested backend must not silently fall back.
      raise if requested == "mysql2-replication"
      use_mysqlbinlog = true
    end
  end
  return import_mysql2_replication unless use_mysqlbinlog

  require "mysql_binlog"
  import_mysqlbinlog
end
53
+
54
+ private
55
# Imports row changes by fetching a raw binlog file with the
# mysqlbinlog command and parsing it with the mysql_binlog library.
def import_mysqlbinlog
  file, position = read_current_status
  FileUtils.mkdir_p(@binlog_dir)
  local_file = File.join(@binlog_dir, file)
  # The download is skipped when the file named after the current one
  # (String#succ) already exists locally.
  unless File.exist?(local_file.succ)
    command_line = [@config.mysqlbinlog].flatten
    # NOTE(review): the password is passed as a command line argument
    # and may be visible to other local users.
    connection_options = [
      ["host", @config.host],
      ["port", @config.port],
      ["socket", @config.socket],
      ["user", @config.replication_slave_user],
      ["password", @config.replication_slave_password],
    ]
    connection_options.each do |name, value|
      command_line << "--#{name}=#{value}" if value
    end
    command_line.push("--read-from-remote-server",
                      "--raw",
                      "--result-file=#{@binlog_dir}/",
                      file)
    # The block is intentionally empty: it is only given so that
    # spawn_process takes its block form.
    spawn_process(*command_line) do |_pid, _output_read, _error_read|
    end
  end

  reader = MysqlBinlog::BinlogFileReader.new(local_file)
  binlog = MysqlBinlog::Binlog.new(reader)
  binlog.checksum = @config.checksum
  binlog.ignore_rotate = true
  rows_event_types = [
    :write_rows_event_v1,
    :write_rows_event_v2,
    :update_rows_event_v1,
    :update_rows_event_v2,
    :delete_rows_event_v1,
    :delete_rows_event_v2,
  ]
  binlog.each_event do |event|
    # Skip events before the recorded position.
    next if event[:position] < position

    type = event[:type]
    if type == :rotate_event
      @status.update("file" => event[:event][:name],
                     "position" => event[:event][:pos])
    elsif rows_event_types.include?(type)
      # Drop the "_v1"/"_v2" suffix.
      normalized_type = type.to_s.gsub(/_v\d\z/, "").to_sym
      table = event[:event][:table]
      import_rows_event(normalized_type,
                        table[:db],
                        table[:table],
                        file,
                        event[:header][:next_position]) do
        # Deleted rows are read from the "before" image; written and
        # updated rows from the "after" image.
        image_side = nil
        case normalized_type
        when :write_rows_event, :update_rows_event
          image_side = :after
        when :delete_rows_event
          image_side = :before
        end
        if image_side
          event[:event][:row_image].collect do |row_image|
            build_row(row_image[image_side][:image])
          end
        end
      end
      position = event[:header][:next_position]
    end
  end
end
115
+
116
# Imports row changes by reading the replication stream with the
# mysql2-replication library.
def import_mysql2_replication
  file, position = read_current_status
  is_mysql_56_or_later = mysql(@config.select_user,
                               @config.select_password) do |select_client|
    mysql_version(select_client) >= Gem::Version.new("5.6")
  end
  mysql(@config.replication_slave_user,
        @config.replication_slave_password) do |client|
    # Servers older than 5.6 get an explicit checksum: "NONE".
    replication_client =
      if is_mysql_56_or_later
        Mysql2Replication::Client.new(client)
      else
        Mysql2Replication::Client.new(client, checksum: "NONE")
      end
    replication_client.file_name = file
    replication_client.start_position = position
    replication_client.open do
      replication_client.each do |event|
        case event
        when Mysql2Replication::RotateEvent
          # Following events belong to the new binlog file.
          file = event.file_name
        when Mysql2Replication::RowsEvent
          # Derive a snake_case symbol from the event class name by
          # joining its capitalized words with "_".
          words = event.class.name.split("::").last.scan(/[A-Z][a-z]+/)
          normalized_type = words.collect(&:downcase).join("_").to_sym
          import_rows_event(normalized_type,
                            event.table_map.database,
                            event.table_map.table,
                            file,
                            event.next_position) do
            if normalized_type == :update_rows_event
              event.updated_rows
            else
              event.rows
            end
          end
        end
      end
    end
  end
end
161
+
162
# Converts the rows of one rows event to Groonga records, writes them
# as a delta and records the new binlog position.
#
# Nothing is written and the status isn't touched when the table isn't
# mapped or when the event produces no records.
#
# type:          :write_rows_event, :update_rows_event or
#                :delete_rows_event.
# database_name: source database of the event.
# table_name:    source table of the event.
# file:          binlog file name to record in the status.
# next_position: binlog position to record in the status.
# block:         returns the rows of the event; only called for
#                mapped tables.
def import_rows_event(type,
                      database_name,
                      table_name,
                      file,
                      next_position,
                      &block)
  source_table = @mapping[database_name, table_name]
  return if source_table.nil?

  table = find_table(database_name, table_name)
  groonga_table = source_table.groonga_table
  groonga_records = block.call.collect do |row|
    groonga_table.generate_record(build_record(table, row))
  end
  return if groonga_records.empty?

  if type == :delete_rows_event
    # Deletes only need the keys of the generated records.
    groonga_record_keys = groonga_records.collect {|record| record[:_key]}
    @writer.write_deletes(groonga_table.name, groonga_record_keys)
  elsif [:write_rows_event, :update_rows_event].include?(type)
    @writer.write_upserts(groonga_table.name, groonga_records)
  end
  @status.update("file" => file,
                 "position" => next_position)
end
194
+
195
# Waits for a child process and raises when it failed.
#
# Both pipes are drained concurrently while waiting. Waiting first and
# reading afterwards can block forever: a child that writes more than
# the pipe buffer holds can't exit until somebody reads its output.
#
# command_line: the command line as an array; only used for the error
#               message.
# pid:          process ID of the child.
# output_read:  read end of the pipe connected to the child's
#               standard output.
# error_read:   read end of the pipe connected to the child's
#               standard error.
#
# Returns nil when the process succeeded or when it can't be waited
# for (SystemCallError from Process.waitpid2, e.g. it was already
# reaped).
#
# Raises ProcessError with the captured output when the process exited
# unsuccessfully.
def wait_process(command_line, pid, output_read, error_read)
  readers = [output_read, error_read].collect do |input|
    Thread.new do
      begin
        # A pipe closed by the caller has nothing left to report.
        input.closed? ? "" : input.read
      rescue IOError
        ""
      end
    end
  end
  begin
    _, status = Process.waitpid2(pid)
  rescue SystemCallError
    # Best effort: there is no exit status to inspect.
    status = nil
  end
  output, error = readers.collect(&:value)
  return if status.nil? || status.success?

  message = "Failed to run: #{command_line.join(' ')}\n"
  message << "--- output ---\n"
  message << output
  message << "--------------\n"
  message << "--- error ----\n"
  message << error
  message << "--------------\n"
  raise ProcessError, message
end
212
+
213
# Runs a command with LC_ALL=C and with its standard output and
# standard error connected to pipes.
#
# Without a block, returns [pid, output_read, error_read]; the caller
# is responsible for waiting for the process and closing the pipes.
#
# With a block, yields pid, output_read and error_read, waits for the
# process with wait_process, closes the pipes and returns the block's
# value. When the block raises, the process is sent TERM before the
# exception is re-raised.
#
# No pipe end is left open when the command can't be started or when
# wait_process raises.
def spawn_process(*command_line)
  env = {
    "LC_ALL" => "C",
  }
  output_read, output_write = IO.pipe
  error_read, error_write = IO.pipe
  options = {
    :out => output_write,
    :err => error_write,
  }
  begin
    pid = spawn(env, *command_line, options)
  rescue
    # Nobody will ever read from a process that didn't start.
    output_read.close
    error_read.close
    raise
  ensure
    # The parent never uses the write ends.
    output_write.close
    error_write.close
  end
  return [pid, output_read, error_read] unless block_given?

  begin
    yield(pid, output_read, error_read)
  rescue
    begin
      Process.kill(:TERM, pid)
    rescue SystemCallError
      # The process is already gone.
    end
    raise
  ensure
    begin
      wait_process(command_line, pid, output_read, error_read)
    ensure
      output_read.close unless output_read.closed?
      error_read.close unless error_read.closed?
    end
  end
end
244
+
245
# Connects to MySQL, yields the client and closes it afterwards.
#
# Host, port and socket come from the configuration. Every connection
# option that is nil or false is omitted.
#
# Returns the value of the block.
def mysql(user, password)
  candidates = {
    host: @config.host,
    port: @config.port,
    socket: @config.socket,
    username: user,
    password: password,
  }
  options = candidates.select {|_name, value| value}
  client = Mysql2::Client.new(**options)
  begin
    yield(client)
  ensure
    # The block may already have closed the client.
    client.close unless client.closed?
  end
end
259
+
260
# Returns the server version as a Gem::Version.
#
# Only the leading dotted number of "SELECT version()" is used. Servers
# append build information to it ("5.6.0-log",
# "10.5.8-MariaDB-1:10.5.8+maria~focal"). Gem::Version either rejects
# such a suffix or treats it as a pre-release marker, which makes
# "5.6.0-log" sort before "5.6".
#
# client: a connected client that responds to #query.
def mysql_version(client)
  version = client.query("SELECT version()", as: :array).first.first
  numeric_version = version[/\A\d+(?:\.\d+)*/]
  # Fall back to the raw string so that an unexpected format is still
  # reported by Gem::Version.
  Gem::Version.new(numeric_version || version)
end
264
+
265
# Returns the binlog file name and position to start reading from as
# [file, position].
#
# When the status already has a file, it is returned as is. Otherwise
# the current master status is read under a global read lock, existing
# data is imported from a consistent snapshot, and the file and
# position are stored in the status.
def read_current_status
  return [@status.file, @status.position] if @status.file

  file = nil
  position = 0
  mysql(@config.replication_client_user,
        @config.replication_client_password) do |replication_client|
    replication_client.query("FLUSH TABLES WITH READ LOCK")
    master_status = replication_client.query("SHOW MASTER STATUS").first
    file = master_status["File"]
    position = master_status["Position"]
    mysql(@config.select_user,
          @config.select_password) do |select_client|
      transaction_options = ["WITH CONSISTENT SNAPSHOT"]
      if mysql_version(select_client) >= Gem::Version.new("5.6")
        transaction_options << "READ ONLY"
      end
      select_client.query("START TRANSACTION #{transaction_options.join(', ')}")
      # The locking connection is closed as soon as the snapshot has
      # been started (presumably to release the read lock before the
      # long-running import; confirm).
      replication_client.close
      import_existing_data(select_client)
      select_client.query("ROLLBACK")
    end
  end
  @status.update("file" => file,
                 "position" => position)
  [file, position]
end
295
+
296
# Imports all existing rows of every mapped source table as upserts.
#
# Rows are streamed from the server and written in batches of
# @config.initial_import_batch_size rows. Mapped tables that don't
# exist on the server are skipped.
#
# client: a connected client used for all queries.
def import_existing_data(client)
  @mapping.source_databases.each do |source_database|
    source_database.source_tables.each do |source_table|
      exist_statement = client.prepare(<<~SQL)
        SELECT COUNT(*) AS n_tables
        FROM information_schema.tables
        WHERE
          table_schema = ? AND
          table_name = ?
      SQL
      exist_result = exist_statement.execute(source_database.name,
                                             source_table.name)
      n_tables = exist_result.first["n_tables"]
      exist_statement.close
      next if n_tables.zero?

      full_table_name = "#{source_database.name}.#{source_table.name}"
      column_list = source_table.source_column_names.join(", ")
      select = "SELECT #{column_list} FROM #{full_table_name}"
      source_filter = source_table.source_filter
      select << " WHERE #{source_filter}" if source_filter
      # Stream rows without caching them instead of loading the whole
      # result set.
      result = client.query(select,
                            symbolize_keys: true,
                            cache_rows: false,
                            stream: true)
      groonga_table = source_table.groonga_table
      target_message = "#{full_table_name} -> #{groonga_table.name}"
      @logger.info("Start importing: #{target_message}")
      n_rows = 0
      batches = result.to_enum(:each)
      batches.each_slice(@config.initial_import_batch_size) do |rows|
        @logger.info("Generating records: #{target_message}")
        groonga_record_batch = groonga_table.generate_record_batch(rows)
        @logger.info("Generated records: #{target_message}")
        @writer.write_upserts(groonga_table.name,
                              groonga_record_batch.to_table)
        n_rows += rows.size
        @logger.info("Importing: #{target_message}: #{n_rows}(+#{rows.size})")
      end
      @logger.info("Imported: #{target_message}: #{n_rows}")
    end
  end
end
342
+
343
# Returns the column definitions of a table sorted by ordinal position.
#
# Each column is a Hash with :name, :ordinal_position, :data_type and
# :is_primary_key. Results are cached per database and table, so two
# databases may have tables of the same name.
#
# database_name: name of the database (schema) of the table.
# table_name:    name of the table.
def find_table(database_name, table_name)
  cache_key = [database_name, table_name]
  return @tables[cache_key] if @tables.key?(cache_key)

  mysql(@config.select_user,
        @config.select_password) do |client|
    # The aliases pin the case of the result keys. MySQL 8.0 is
    # reported to return information_schema column names in upper case
    # otherwise (confirm against the MySQL reference manual).
    statement = client.prepare(<<~SQL)
      SELECT column_name AS column_name,
             ordinal_position AS ordinal_position,
             data_type AS data_type,
             column_key AS column_key
      FROM information_schema.columns
      WHERE
        table_schema = ? AND
        table_name = ?
    SQL
    begin
      result = statement.execute(database_name, table_name)
      columns = result.collect do |column|
        {
          name: column["column_name"],
          ordinal_position: column["ordinal_position"],
          data_type: column["data_type"],
          is_primary_key: column["column_key"] == "PRI",
        }
      end
    ensure
      statement.close
    end
    @tables[cache_key] = columns.sort_by do |column|
      column[:ordinal_position]
    end
  end
end
372
+
373
# Merges a list of { column_index => value } pairs into one Hash.
# A later pair wins when a column index appears more than once.
def build_row(value_pairs)
  value_pairs.each_with_object({}) do |value_pair, row|
    value_pair.each do |column_index, value|
      row[column_index] = value
    end
  end
end
382
+
383
# Converts a row keyed by column index to a record keyed by column
# name (as a Symbol).
#
# table: column definitions indexed by column index; each responds to
#        [:name].
# row:   column index => value pairs.
def build_record(table, row)
  row.each_with_object({}) do |(column_index, value), record|
    column_name = table[column_index][:name]
    record[column_name.to_sym] = value
  end
end
390
+ end
391
+ end
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+ require "yaml"
18
+
19
+ module GroongaDelta
20
# Key/value status persisted as "status.yaml" in a directory.
class Status
  # dir: directory that contains (or will contain) status.yaml. An
  #      existing file is loaded; a missing or empty file results in
  #      an empty status.
  def initialize(dir)
    @dir = dir
    @path = File.join(@dir, "status.yaml")
    @data = load_data
  end

  # Returns the value stored for key, or nil.
  def [](key)
    @data[key]
  end

  # Merges data into the status and saves it.
  #
  # The file is written to a temporary path and renamed into place so
  # that an interrupted write can't leave a truncated status.yaml.
  def update(data)
    @data.update(data)
    FileUtils.mkdir_p(@dir)
    temporary_path = "#{@path}.tmp"
    File.open(temporary_path, "w") do |output|
      output.puts(YAML.dump(@data))
    end
    File.rename(temporary_path, @path)
    nil
  end

  private
  def load_data
    return {} unless File.exist?(@path)

    # YAML.load returns false/nil for an empty document.
    YAML.load(File.read(@path)) || {}
  end
end
43
+ end
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module GroongaDelta
  # Version string of this groonga-delta release.
  VERSION = "1.0.0"
end
@@ -0,0 +1,135 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+ require "json"
18
+
19
+ require "groonga/command"
20
+ require "parquet"
21
+
22
+ module GroongaDelta
23
# Writes delta files (upserts, deletes and schema changes) under a
# delta directory.
#
# Every entry is first written to a hidden temporary path (a name that
# starts with ".") and then moved to its final path. The temporary
# path is removed when writing fails.
class Writer
  # logger: receives #info messages about written files.
  # dir:    the delta directory.
  def initialize(logger, dir)
    @logger = logger
    @dir = dir
  end

  # Writes records to upsert into table.
  #
  # records: an Arrow::Table, which is saved as a Parquet file, or an
  #          enumerable of Hash records, which is written as a Groonga
  #          "load" command.
  # packed:  write the entry below "packed/" instead of a per-day
  #          directory.
  def write_upserts(table, records, packed: false)
    if records.is_a?(Arrow::Table)
      write_data(table,
                 "upsert",
                 ".parquet",
                 packed: packed,
                 open_output: false) do |output|
        records.save(output, format: :parquet)
      end
    else
      write_data(table, "upsert", ".grn", packed: packed) do |output|
        first_record = true
        records.each do |record|
          if first_record
            output.puts("load --table #{table}")
            output.print("[")
            first_record = false
          else
            output.print(",")
          end
          output.puts
          output.print(record_to_json(record))
        end
        # Nothing is written for an empty record list, so there is no
        # array to terminate either.
        unless first_record
          output.puts
          output.puts("]")
        end
      end
    end
  end

  # Writes one Groonga "delete" command per key.
  def write_deletes(table, keys)
    write_data(table, "delete", ".grn") do |output|
      delete = Groonga::Command::Delete.new
      delete[:table] = table
      keys.each do |key|
        delete[:key] = key
        output.puts(delete.to_command_format)
      end
    end
  end

  # Writes a schema change command.
  def write_schema(command)
    write_entry("schema", ".grn") do |output|
      output.puts(command.to_command_format)
    end
  end

  private
  # Serializes a record as a JSON object. Time values are written as
  # "YYYY-MM-DD HH:MM:SS" in local time; other values use #to_json.
  def record_to_json(record)
    json = "{"
    record.each_with_index do |(key, value), i|
      json << "," unless i.zero?
      json << "#{key.to_s.to_json}:"
      case value
      when Time
        # Time#localtime modifies the receiver, so work on a copy.
        json << value.dup.localtime.strftime("%Y-%m-%d %H:%M:%S").to_json
      else
        json << value.to_json
      end
    end
    json << "}"
    json
  end

  # Writes one entry named after the current UTC time.
  #
  # prefix:      directory of the entry relative to the delta directory.
  # suffix:      appended to the timestamp to build the file name.
  # packed:      when true, the file is written into a hidden
  #              per-entry directory below "packed/" and the directory
  #              is moved into place; otherwise the file is written
  #              into a per-day directory and the file is moved.
  # open_output: when true, yields an opened File; otherwise yields
  #              the temporary path.
  def write_entry(prefix, suffix, packed: false, open_output: true)
    timestamp = Time.now.utc
    base_name = timestamp.strftime("%Y-%m-%d-%H-%M-%S-%N#{suffix}")
    if packed
      dir = "#{@dir}/#{prefix}/packed"
      packed_dir_base_name = timestamp.strftime("%Y-%m-%d-%H-%M-%S-%N")
      temporary_path = "#{dir}/.#{packed_dir_base_name}/#{base_name}"
      path = "#{dir}/#{packed_dir_base_name}/#{base_name}"
    else
      day = timestamp.strftime("%Y-%m-%d")
      dir = "#{@dir}/#{prefix}/#{day}"
      temporary_path = "#{dir}/.#{base_name}"
      path = "#{dir}/#{base_name}"
    end
    @logger.info("Start writing: #{temporary_path}")
    FileUtils.mkdir_p(File.dirname(temporary_path))
    completed = false
    begin
      if open_output
        File.open(temporary_path, "w") do |output|
          yield(output)
        end
      else
        yield(temporary_path)
      end
      completed = true
    ensure
      unless completed
        # Don't leave a half-written hidden file or directory behind.
        leftover = packed ? File.dirname(temporary_path) : temporary_path
        FileUtils.rm_rf(leftover)
      end
    end
    if packed
      FileUtils.mv(File.dirname(temporary_path),
                   File.dirname(path))
    else
      FileUtils.mv(temporary_path, path)
    end
    @logger.info("Wrote: #{path}")
  end

  # Writes a data entry for table; action ("upsert" or "delete") is
  # made part of the file name.
  def write_data(table,
                 action,
                 suffix,
                 packed: false,
                 open_output: true,
                 &block)
    write_entry("data/#{table}",
                "-#{action}#{suffix}",
                packed: packed,
                open_output: open_output,
                &block)
  end
end
135
+ end
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "groonga-delta/apply-command"
17
+ require_relative "groonga-delta/import-command"
18
+ require_relative "groonga-delta/version"