groonga-delta 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "config"
17
+ require_relative "mapping"
18
+
19
+ module GroongaDelta
20
+ class ImportConfig < Config
21
# Builds the import configuration for +dir+.
#
# The fixed configuration name "groonga-delta-import" and +dir+ are
# forwarded to the parent initializer.
def initialize(dir)
  config_name = "groonga-delta-import"
  super(config_name, dir)
end
24
+
25
# Returns the "delta_dir" setting passed through resolve_path.
#
# Falls back to "delta" when the setting is absent.
def delta_dir
  configured_dir = @data["delta_dir"]
  resolve_path(configured_dir || "delta")
end
28
+
29
# Returns a MySQL settings object built from the "mysql" section,
# or nil when the section is absent.
#
# The matching "mysql" section of the secret data is passed along;
# an empty Hash is used when there is none.
def mysql
  mysql_data = @data["mysql"]
  return nil unless mysql_data
  mysql_secret_data = @secret_data["mysql"] || {}
  MySQL.new(@dir, mysql_data, mysql_secret_data)
end
35
+
36
# Returns a Local settings object built from the "local" section,
# or nil when the section is absent.
def local
  local_data = @data["local"]
  return nil unless local_data
  Local.new(@dir, local_data)
end
40
+
41
# Returns a new Mapping built from the "mapping" section.
#
# An empty Hash is used when the section is absent.
def mapping
  mapping_data = @data["mapping"] || {}
  Mapping.new(mapping_data)
end
44
+
45
+ class MySQL
46
+ include Config::PathResolvable
47
+
48
# Stores the base directory, the "mysql" settings and the matching
# secret settings for later lookups.
def initialize(dir, data, secret_data)
  @dir, @data = dir, data
  @secret_data = secret_data
end
53
+
54
# Returns the "binlog_dir" setting passed through resolve_path.
#
# Falls back to "binlog" when the setting is absent.
def binlog_dir
  configured_dir = @data["binlog_dir"]
  resolve_path(configured_dir || "binlog")
end
57
+
58
# Returns the "mysqlbinlog" setting, or "mysqlbinlog" when it is absent.
def mysqlbinlog
  configured_command = @data["mysqlbinlog"]
  configured_command || "mysqlbinlog"
end
61
+
62
# Returns the "host" setting, or "localhost" when it is absent.
def host
  configured_host = @data["host"]
  configured_host || "localhost"
end
65
+
66
# Returns the "port" setting, or 3306 when it is absent.
def port
  configured_port = @data["port"]
  configured_port || 3306
end
69
+
70
# Returns the "socket" setting as is (nil when it is absent).
def socket
  socket_setting = @data["socket"]
  socket_setting
end
73
+
74
# Returns the "user" setting as is (nil when it is absent).
def user
  user_setting = @data["user"]
  user_setting
end
77
+
78
# Returns the password, preferring the secret data over the plain
# settings.
def password
  secret_password = @secret_data["password"]
  return secret_password if secret_password
  @data["password"]
end
81
+
82
# Returns the "replication_client" sub-section, or the whole settings
# Hash when there is no such sub-section.
def replication_client
  section = @data["replication_client"]
  section || @data
end
85
+
86
# Returns the "user" entry of the Hash returned by replication_client.
def replication_client_user
  settings = replication_client
  settings["user"]
end
89
+
90
# Returns the replication client password.
#
# The secret data is consulted first ("replication_client" sub-section
# when present, otherwise the secret data itself); the plain settings
# returned by replication_client are used only when no secret password
# is found.
def replication_client_password
  secret_section = @secret_data["replication_client"] || @secret_data
  secret_password = secret_section["password"]
  secret_password || replication_client["password"]
end
94
+
95
# Returns the "replication_slave" sub-section, or the whole settings
# Hash when there is no such sub-section.
def replication_slave
  section = @data["replication_slave"]
  section || @data
end
98
+
99
# Returns the "user" entry of the Hash returned by replication_slave.
def replication_slave_user
  settings = replication_slave
  settings["user"]
end
102
+
103
# Returns the replication slave password.
#
# The secret data is consulted first ("replication_slave" sub-section
# when present, otherwise the secret data itself); the plain settings
# returned by replication_slave are used only when no secret password
# is found.
def replication_slave_password
  secret_section = @secret_data["replication_slave"] || @secret_data
  secret_password = secret_section["password"]
  secret_password || replication_slave["password"]
end
107
+
108
# Returns the "select" sub-section, or the whole settings Hash when
# there is no such sub-section.
def select
  section = @data["select"]
  section || @data
end
111
+
112
# Returns the "user" entry of the Hash returned by select.
def select_user
  settings = select
  settings["user"]
end
115
+
116
# Returns the password used for the "select" role.
#
# The secret data is consulted first ("select" sub-section when
# present, otherwise the secret data itself); the plain settings
# returned by select are used only when no secret password is found.
def select_password
  secret_section = @secret_data["select"] || @secret_data
  secret_password = secret_section["password"]
  secret_password || select["password"]
end
120
+
121
# Returns the "checksum" setting converted with to_sym, or nil when
# the setting is absent.
def checksum
  @data["checksum"]&.to_sym
end
126
+
127
# Returns the "initial_import_batch_size" setting passed through
# resolve_size.
#
# Falls back to 1024 * 1024 when the setting is absent.
def initial_import_batch_size
  default_size = 1024 * 1024
  configured_size = @data["initial_import_batch_size"]
  resolve_size(configured_size || default_size)
end
130
+
131
+ private
132
# Converts a configured size into a plain number.
#
# * Non-String values are returned unchanged.
# * A String must be decimal digits optionally followed by one of the
#   suffixes k/K, m/M or g/G, which multiply the number by 1024,
#   1024**2 and 1024**3 respectively.
# * A String of digits without a suffix is accepted too and is
#   returned as the Integer it spells, so that a quoted number in the
#   configuration behaves like an unquoted one.
#
# Raises ConfigError for any other String.
def resolve_size(value)
  return value unless value.is_a?(String)

  match = /\A(\d+)([kKmMgG])?\z/.match(value)
  if match.nil?
    raise ConfigError, "invalid size value: #{value.inspect}"
  end

  suffix = match[2]
  # "k" -> 1024**1, "m" -> 1024**2, "g" -> 1024**3, no suffix -> 1024**0.
  exponent = suffix ? "kmg".index(suffix.downcase) + 1 : 0
  Integer(match[1], 10) * (1024**exponent)
end
149
+ end
150
+
151
+ class Local
152
+ include Config::PathResolvable
153
+
154
# Stores the base directory and the "local" settings for later lookups.
def initialize(dir, data)
  @dir, @data = dir, nil
  @data = data
end
158
+
159
# Returns the "dir" setting passed through resolve_path.
#
# Falls back to "local" when the setting is absent.
def dir
  configured_dir = @data["dir"]
  resolve_path(configured_dir || "local")
end
162
+
163
# Returns the "initial_max_number" setting, or Float::INFINITY when it
# is absent.
def initial_max_number
  configured_max = @data["initial_max_number"]
  configured_max || Float::INFINITY
end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,68 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "status"
17
+
18
+ module GroongaDelta
19
+ class ImportStatus < Status
20
# Returns a new MySQL view that wraps this status object.
def mysql
  status = self
  MySQL.new(status)
end
23
+
24
# Returns a new Local view that wraps this status object.
def local
  status = self
  Local.new(status)
end
27
+
28
# View over the "mysql" section of a status object.
#
# The wrapped object only needs to respond to [] and update.
class MySQL
  def initialize(status)
    @status = status
  end

  # Looks +key+ up in the "mysql" section; nil when the section or the
  # key is absent.
  def [](key)
    section = @status["mysql"] || {}
    section[key]
  end

  # Hands +new_data+ to the wrapped status as its "mysql" section.
  def update(new_data)
    section = {"mysql" => new_data}
    @status.update(section)
  end

  # The "file" entry of the section.
  def file
    self["file"]
  end

  # The "position" entry of the section.
  def position
    self["position"]
  end
end
49
+
50
# View over the "local" section of a status object.
#
# The wrapped object only needs to respond to [] and update.
class Local
  def initialize(status)
    @status = status
  end

  # Looks +key+ up in the "local" section; nil when the section or the
  # key is absent.
  def [](key)
    section = @status["local"] || {}
    section[key]
  end

  # Hands +new_data+ to the wrapped status as its "local" section.
  def update(new_data)
    section = {"local" => new_data}
    @status.update(section)
  end

  # The "number" entry of the section.
  def number
    self["number"]
  end
end
67
+ end
68
+ end
@@ -0,0 +1,386 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require "groonga/client"
19
+ require "parquet"
20
+
21
+ require_relative "error"
22
+
23
+ module GroongaDelta
24
+ class LocalDelta
25
# Keeps +config+ and +status+, and caches the logger and the local
# delta directory read from +config+.
def initialize(config, status)
  @config = config
  @status = status
  @logger = config.logger
  @delta_dir = config.local.delta_dir
end
31
+
32
# Applies every pending delta found under the delta directory.
#
# Targets newer than the recorded start time and not newer than the
# current time are listed, applied in timestamp order through one
# Groonga client, and the status "start_time" is updated after each
# target with that target's [seconds, nanoseconds].
def apply
  since = read_current_status
  now = Time.now.utc
  pending_targets = list_targets(@delta_dir, since, now)
  options = {
    url: @config.groonga.url,
    read_timeout: @config.groonga.read_timeout,
    backend: :synchronous,
  }
  Groonga::Client.open(options) do |client|
    processor = CommandProcessor.new(@config,
                                     client,
                                     target_commands: [],
                                     target_tables: [],
                                     target_columns: [])
    pending_targets.sort_by(&:timestamp).each do |target|
      target.apply(@logger, client, processor)
      applied_at = target.timestamp
      @status.update("start_time" => [applied_at.to_i, applied_at.nsec])
    end
  end
end
56
+
57
+ private
58
# Builds a UTC Time from calendar fields with nanosecond precision.
#
# The seconds and nanoseconds are combined into one Rational so that
# no precision is lost through Float conversion.
def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
  nanoseconds_per_second = 1_000_000_000
  total_nanoseconds = second * nanoseconds_per_second + nanosecond
  exact_second = Rational(total_nanoseconds, nanoseconds_per_second)
  Time.utc(year, month, day, hour, minute, exact_second)
end
67
+
68
# Returns the UTC time recorded as the status start time.
#
# The status stores a pair of [unix seconds, nanoseconds]. When no
# seconds value is recorded, the Unix epoch is returned.
def read_current_status
  unix_time, nanosecond = @status.start_time
  return Time.at(0).utc unless unix_time

  base = Time.at(unix_time).utc
  build_time(base.year,
             base.month,
             base.day,
             base.hour,
             base.min,
             base.sec,
             nanosecond)
end
83
+
84
# Yields every timestamped file under +dir+ whose timestamp is newer
# than +min_timestamp+ and not newer than +max_timestamp+ (either
# bound may be nil to disable it).
#
# Files are yielded as (path, timestamp, action, post_match), where
# the last three values come from parse_file_timestamp. When
# +accept_directory+ is true, directories accepted by
# parse_directory_timestamp are descended one level.
#
# A day directory only carries a date, so it is compared against the
# UTC day of +min_timestamp+ rather than the exact instant: the
# directory for the same day as +min_timestamp+ can still contain
# files written later that day and must be scanned. The per-file
# checks then do the exact filtering.
def each_target_path(dir,
                     min_timestamp,
                     max_timestamp,
                     accept_directory: true,
                     &block)
  if min_timestamp
    min_utc = min_timestamp.getutc
    min_day = Time.utc(min_utc.year, min_utc.month, min_utc.day)
  end
  Dir.glob("#{dir}/*") do |path|
    base_name = File.basename(path)
    if accept_directory && File.directory?(path)
      day = parse_directory_timestamp(base_name)
      next if day.nil?
      # Skip only days that are entirely before the lower bound.
      next if min_day && day < min_day
      # A day starting after the upper bound has no acceptable files.
      next if max_timestamp && day > max_timestamp
      each_target_path(path,
                       min_timestamp,
                       max_timestamp,
                       accept_directory: false,
                       &block)
    elsif File.file?(path)
      timestamp, action, post_match = parse_file_timestamp(base_name)
      next if timestamp.nil?
      next if min_timestamp && timestamp <= min_timestamp
      next if max_timestamp && timestamp > max_timestamp
      yield(path, timestamp, action, post_match)
    end
  end
end
110
+
111
# Yields (path, timestamp) for every packed directory under
# "#{dir}/packed".
#
# Nothing is yielded unless +min_timestamp+ converts to 0 with to_i,
# i.e. packed data is only considered when no start time is in
# effect. A packed directory must be named with a full file timestamp
# only: no action part and no trailing text. Entries whose name is not
# a timestamp at all are skipped instead of raising NoMethodError on
# the nil parse result.
#
# +max_timestamp+ is accepted for signature symmetry but is not used
# for filtering here.
def each_packed_target_path(dir, min_timestamp, max_timestamp)
  return unless min_timestamp.to_i.zero?
  Dir.glob("#{dir}/packed/*") do |path|
    next unless File.directory?(path)
    timestamp, action, post_match = parse_file_timestamp(File.basename(path))
    next if timestamp.nil?
    next if action
    next unless post_match.empty?
    yield(path, timestamp)
  end
end
121
+
122
# Collects every schema and table target under +dir+ and returns them
# in one Array.
#
# Schema targets are listed first; then each directory directly under
# "#{dir}/data" is treated as a table whose name is the directory's
# base name.
def list_targets(dir, start_time, current_timestamp)
  collected = []
  list_schema_targets(dir, start_time, current_timestamp, collected)
  table_dirs = Dir.glob("#{dir}/data/*").select do |path|
    File.directory?(path)
  end
  table_dirs.each do |table_dir|
    table_name = File.basename(table_dir)
    list_table_targets(table_dir,
                       table_name,
                       start_time,
                       current_timestamp,
                       collected)
  end
  collected
end
132
+
133
# Yields a SchemaTarget for every path reported by each_target_path
# that has no action part and whose remaining name is exactly ".grn".
def each_schema_target(dir, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, suffix|
    yield(SchemaTarget.new(path, timestamp)) if !action && suffix == ".grn"
  end
end
142
+
143
# Appends the schema targets found under "#{dir}/schema" to +targets+.
#
# The packed target with the newest timestamp (if any) is appended
# first and filled with every schema target inside it. Individual
# schema targets are then listed starting after that packed target's
# timestamp, or after +start_time+ when there is no packed target.
def list_schema_targets(dir, start_time, current_timestamp, targets)
  schema_dir = "#{dir}/schema"
  newest_packed = nil
  each_packed_target_path(schema_dir, start_time, current_timestamp) do |path, timestamp|
    # Keep the current candidate only when it is strictly newer.
    next if newest_packed && newest_packed.timestamp > timestamp
    newest_packed = PackedSchemaTarget.new(path, timestamp)
  end

  unless newest_packed.nil?
    targets << newest_packed
    each_schema_target(newest_packed.path, nil, nil) do |packed_member|
      newest_packed.targets << packed_member
    end
  end

  min_timestamp = (newest_packed && newest_packed.timestamp) || start_time
  each_schema_target(schema_dir, min_timestamp, current_timestamp) do |target|
    targets << target
  end
end
165
+
166
# File name suffixes (the text after the timestamp and action) that
# are accepted as table deltas.
TABLE_TARGET_SUFFIXES = [".grn", ".parquet"]

# Yields a TableTarget for every path reported by each_target_path
# that carries an action part and whose remaining name is one of
# TABLE_TARGET_SUFFIXES.
def each_table_target(dir, name, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, suffix|
    accepted = !action.nil? && TABLE_TARGET_SUFFIXES.include?(suffix)
    yield(TableTarget.new(path, timestamp, name, action)) if accepted
  end
end
176
+
177
# Appends the targets of the table +name+ found under +dir+ to
# +targets+.
#
# The packed target with the newest timestamp (if any) is appended
# first and filled with every table target inside it. Individual table
# targets are then listed starting after that packed target's
# timestamp, or after +start_time+ when there is no packed target.
def list_table_targets(dir, name, start_time, current_timestamp, targets)
  newest_packed = nil
  each_packed_target_path(dir, start_time, current_timestamp) do |path, timestamp|
    # Keep the current candidate only when it is strictly newer.
    next if newest_packed && newest_packed.timestamp > timestamp
    newest_packed = PackedTableTarget.new(path, timestamp, name)
  end

  unless newest_packed.nil?
    targets << newest_packed
    each_table_target(newest_packed.path, name, nil, nil) do |packed_member|
      newest_packed.targets << packed_member
    end
  end

  min_timestamp = (newest_packed && newest_packed.timestamp) || start_time
  each_table_target(dir, name, min_timestamp, current_timestamp) do |target|
    targets << target
  end
end
200
+
201
# Parses a directory name of the form YYYY-MM-DD.
#
# Returns the time built by build_time for that date, or nil when the
# name has any other shape.
def parse_directory_timestamp(base_name)
  match = /\A(\d{4})-(\d{2})-(\d{2})\z/.match(base_name)
  return nil if match.nil?

  year, month, day = match.captures.map(&:to_i)
  build_time(year, month, day)
end
213
+
214
# Parses a name that starts with
# YYYY-MM-DD-hh-mm-ss-nnnnnnnnn, optionally followed by "-ACTION".
#
# Returns [timestamp, action, post_match] where +timestamp+ is built
# by build_time, +action+ is the optional word after the timestamp
# (nil when absent) and +post_match+ is whatever follows the matched
# part. Returns nil when the name does not start with a timestamp.
def parse_file_timestamp(base_name)
  pattern = /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/
  match = pattern.match(base_name)
  return nil if match.nil?

  # The first seven captures are the numeric time fields; the eighth is
  # the optional action.
  *time_fields, action = match.captures
  timestamp = build_time(*time_fields.map(&:to_i))
  [timestamp, action, match.post_match]
end
238
+
239
# Mixin that brackets an apply step with info-level log messages.
module ApplyLoggable
  private

  # Logs "Start applying: <path>", runs the given block, logs
  # "Applied: <path>" and returns the block's value. The second message
  # is not logged when the block raises.
  def apply_log(logger, path)
    logger.info("Start applying: #{path}")
    yield.tap { logger.info("Applied: #{path}") }
  end
end
248
+
249
# A single schema delta file identified by its path and timestamp.
class SchemaTarget
  include ApplyLoggable

  attr_reader :path, :timestamp

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
  end

  # Loads the file through +processor+, logging before and after.
  # +client+ is accepted for interface parity with the other targets
  # and is not used here.
  def apply(logger, client, processor)
    apply_log(logger, @path) { processor.load(@path) }
  end
end
265
+
266
# A packed schema directory: a container for the schema targets found
# inside it, identified by its path and timestamp.
class PackedSchemaTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :targets

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
    @targets = []
  end

  # Applies the contained targets in timestamp order, logging before
  # and after the whole batch.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      ordered_targets = @targets.sort_by(&:timestamp)
      ordered_targets.each do |member|
        member.apply(logger, client, processor)
      end
    end
  end
end
286
+
287
# A single table delta file identified by its path, timestamp, table
# name and action.
class TableTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :name, :action

  def initialize(path, timestamp, name, action)
    @path, @timestamp = path, timestamp
    @name, @action = name, action
  end

  # Applies the file, logging before and after.
  #
  # A ".grn" file is loaded through +processor+. Any other file is read
  # as an Arrow table and sent to +client+ as a load command for the
  # table +name+; the response is then handed to +processor+.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      if @path.end_with?(".grn")
        processor.load(@path)
      else
        load_arrow_table(client, processor)
      end
    end
  end

  private

  # Sends the Arrow-readable file at @path as a load command.
  def load_arrow_table(client, processor)
    # TODO: Add support for @action == "delete"
    records = Arrow::Table.load(@path)
    command = Groonga::Command::Load.new(table: @name,
                                         values: records,
                                         command_version: "3")
    response = client.load(command.arguments)
    processor.process_response(response, command)
  end
end
317
+
318
# A packed table directory: a container for the table targets found
# inside it, identified by its path, timestamp and table name.
class PackedTableTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :name, :targets

  def initialize(path, timestamp, name)
    @path, @timestamp = path, timestamp
    @name = name
    @targets = []
  end

  # Applies the contained targets in timestamp order, logging before
  # and after the whole batch.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      ordered_targets = @targets.sort_by(&:timestamp)
      ordered_targets.each do |member|
        member.apply(logger, client, processor)
      end
    end
  end
end
340
+
341
+ class CommandProcessor < Groonga::Client::CommandProcessor
342
# Keeps +config+ for logging and error policy, and forwards every
# remaining argument to the parent command processor.
def initialize(config, *processor_args)
  @config = config
  super(*processor_args)
end
346
+
347
# Reports the outcome of +command+ according to +response+.
#
# For "load" commands the :values argument is removed from the
# command's arguments before the command is formatted, and the number
# of loaded records is included in the message.
#
# On success an info message is logged. On failure the configured
# on_error policy decides what happens: "warning" logs the message as
# a warning, "error" raises ExecutionError with it, and "ignore" (or
# any other value) does nothing.
def process_response(response, command)
  message = ""
  if command.command_name == "load"
    command.arguments.delete(:values)
    counted_response =
      if response.success?
        response
      else
        # Re-wrap the failed response so that the loaded-record count
        # can still be read from it.
        Groonga::Client::Response::Load.new(command,
                                            response.header,
                                            response.body)
      end
    message = "#{counted_response.n_loaded_records}: "
  end

  if response.success?
    return @config.logger.info("Processed: " \
                               "#{response.elapsed_time}: " \
                               "#{command.command_name}: " \
                               "#{message}" \
                               "#{command.to_command_format}")
  end

  failed_message = "Failed to process: " \
                   "#{response.return_code}: " \
                   "#{response.elapsed_time}: " \
                   "#{response.error_message}: " \
                   "#{command.command_name}: " \
                   "#{message}" \
                   "#{command.to_command_format}"
  case @config.on_error
  when "warning"
    @config.logger.warn(failed_message)
  when "error"
    raise ExecutionError, failed_message
  end
end
384
+ end
385
+ end
386
+ end