groonga-delta 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "config"
17
+ require_relative "mapping"
18
+
19
+ module GroongaDelta
20
+ class ImportConfig < Config
21
# Builds the import configuration for +dir+ by handing the fixed
# "groonga-delta-import" name and +dir+ to the superclass initializer.
def initialize(dir)
  name = "groonga-delta-import"
  super(name, dir)
end
24
+
25
# Returns the delta directory: the "delta_dir" setting, or "delta" when
# that setting is missing, passed through resolve_path.
def delta_dir
  configured = @data["delta_dir"]
  resolve_path(configured || "delta")
end
28
+
29
# Returns the MySQL settings wrapper, or nil when the configuration has no
# "mysql" section. Secrets default to an empty hash when none are stored.
def mysql
  mysql_data = @data["mysql"]
  return nil unless mysql_data

  mysql_secrets = @secret_data["mysql"] || {}
  MySQL.new(@dir, mysql_data, mysql_secrets)
end
35
+
36
# Returns the local-source settings wrapper, or nil when the configuration
# has no "local" section.
def local
  local_data = @data["local"]
  return nil unless local_data

  Local.new(@dir, local_data)
end
40
+
41
# Wraps the "mapping" section of the configuration in a Mapping, using an
# empty hash when the section is absent.
def mapping
  mapping_data = @data["mapping"] || {}
  Mapping.new(mapping_data)
end
44
+
45
# MySQL-related import settings.
#
# Values are read from the "mysql" configuration hash (+data+) and from the
# matching secret hash (+secret_data+). Every password lookup consults the
# secret hash first and falls back to the plain configuration.
class MySQL
  include Config::PathResolvable

  # dir::         base directory, stored as @dir
  #               (presumably read by resolve_path -- confirm in
  #               Config::PathResolvable)
  # data::        the "mysql" configuration section
  # secret_data:: the "mysql" secret section (may be an empty hash)
  def initialize(dir, data, secret_data)
    @dir = dir
    @data = data
    @secret_data = secret_data
  end

  # Binary log directory; "binlog" when "binlog_dir" is not configured.
  def binlog_dir
    configured = @data["binlog_dir"]
    resolve_path(configured || "binlog")
  end

  # The "mysqlbinlog" setting; defaults to "mysqlbinlog".
  def mysqlbinlog
    configured = @data["mysqlbinlog"]
    configured || "mysqlbinlog"
  end

  # Server host; defaults to "localhost".
  def host
    configured = @data["host"]
    configured || "localhost"
  end

  # Server port; defaults to 3306.
  def port
    configured = @data["port"]
    configured || 3306
  end

  # The "socket" setting, or nil when not configured.
  def socket
    @data["socket"]
  end

  # The "user" setting, or nil when not configured.
  def user
    @data["user"]
  end

  # Password from the secret data, falling back to the configuration.
  def password
    secret = @secret_data["password"]
    secret || @data["password"]
  end

  # The "replication_client" subsection when present, otherwise the whole
  # MySQL section.
  def replication_client
    section = @data["replication_client"]
    section || @data
  end

  # User taken from #replication_client.
  def replication_client_user
    replication_client["user"]
  end

  # Password for #replication_client: the secret entry wins over the
  # configuration entry.
  def replication_client_password
    secrets = @secret_data["replication_client"] || @secret_data
    secrets["password"] || replication_client["password"]
  end

  # The "replication_slave" subsection when present, otherwise the whole
  # MySQL section.
  def replication_slave
    section = @data["replication_slave"]
    section || @data
  end

  # User taken from #replication_slave.
  def replication_slave_user
    replication_slave["user"]
  end

  # Password for #replication_slave: the secret entry wins over the
  # configuration entry.
  def replication_slave_password
    secrets = @secret_data["replication_slave"] || @secret_data
    secrets["password"] || replication_slave["password"]
  end

  # The "select" subsection when present, otherwise the whole MySQL section.
  def select
    section = @data["select"]
    section || @data
  end

  # User taken from #select.
  def select_user
    select["user"]
  end

  # Password for #select: the secret entry wins over the configuration
  # entry.
  def select_password
    secrets = @secret_data["select"] || @secret_data
    secrets["password"] || select["password"]
  end

  # The "checksum" setting converted to a Symbol, or nil when it is unset.
  def checksum
    @data["checksum"]&.to_sym
  end

  # Batch size for the initial import; defaults to 1024 * 1024. String values
  # with a k/m/g suffix are expanded by #resolve_size.
  def initial_import_batch_size
    configured = @data["initial_import_batch_size"]
    resolve_size(configured || 1024 * 1024)
  end

  private

  # Converts a size such as "16k", "4M" or "1g" (case-insensitive suffix,
  # powers of 1024) into an Integer. Non-String values are returned as is.
  # Raises ConfigError for any other String.
  def resolve_size(value)
    return value unless value.is_a?(String)

    match = /\A(\d+)([kKmMgG])\z/.match(value)
    raise ConfigError, "invalid size value: #{value.inspect}" if match.nil?

    # "k" => 1024**1, "m" => 1024**2, "g" => 1024**3
    exponent = "kmg".index(match[2].downcase) + 1
    Integer(match[1], 10) * (1024**exponent)
  end
end
150
+
151
# Settings for importing from a local source, read from the "local"
# configuration hash.
class Local
  include Config::PathResolvable

  # dir::  base directory, stored as @dir (presumably read by resolve_path
  #        -- confirm in Config::PathResolvable)
  # data:: the "local" configuration section
  def initialize(dir, data)
    @dir, @data = dir, data
  end

  # Source directory; "local" when "dir" is not configured.
  def dir
    configured = @data["dir"]
    resolve_path(configured || "local")
  end

  # The "initial_max_number" setting; unbounded (Float::INFINITY) when unset.
  def initial_max_number
    limit = @data["initial_max_number"]
    limit || Float::INFINITY
  end
end
167
+ end
168
+ end
@@ -0,0 +1,68 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "status"
17
+
18
+ module GroongaDelta
19
+ class ImportStatus < Status
20
# Returns a view over the "mysql" section of this status.
def mysql
  owner = self
  MySQL.new(owner)
end
23
+
24
# Returns a view over the "local" section of this status.
def local
  owner = self
  Local.new(owner)
end
27
+
28
# View over the "mysql" section of a status object.
#
# The wrapped +status+ must respond to #[] and #update.
class MySQL
  def initialize(status)
    @status = status
  end

  # Reads +key+ from the "mysql" section; nil when the section or the key
  # is missing.
  def [](key)
    section = @status["mysql"] || {}
    section[key]
  end

  # Stores +new_data+ as the "mysql" section via the wrapped status.
  def update(new_data)
    @status.update("mysql" => new_data)
  end

  # The recorded "file" value.
  def file
    self["file"]
  end

  # The recorded "position" value.
  def position
    self["position"]
  end
end
49
+
50
# View over the "local" section of a status object.
#
# The wrapped +status+ must respond to #[] and #update.
class Local
  def initialize(status)
    @status = status
  end

  # Reads +key+ from the "local" section; nil when the section or the key
  # is missing.
  def [](key)
    section = @status["local"] || {}
    section[key]
  end

  # Stores +new_data+ as the "local" section via the wrapped status.
  def update(new_data)
    @status.update("local" => new_data)
  end

  # The recorded "number" value.
  def number
    self["number"]
  end
end
67
+ end
68
+ end
@@ -0,0 +1,386 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require "groonga/client"
19
+ require "parquet"
20
+
21
+ require_relative "error"
22
+
23
+ module GroongaDelta
24
+ class LocalDelta
25
# Keeps the configuration and status objects and caches the logger and the
# delta directory that the configuration exposes.
def initialize(config, status)
  @config, @status = config, status
  @logger = @config.logger
  @delta_dir = @config.local.delta_dir
end
31
+
32
# Applies every pending delta to Groonga in timestamp order.
#
# Targets newer than the recorded start time and not newer than "now" are
# collected first. After each target is applied, its timestamp is stored in
# the status as [unix seconds, nanoseconds] under "start_time".
def apply
  since = read_current_status
  now = Time.now.utc
  pending = list_targets(@delta_dir, since, now)

  options = {}
  options[:url] = @config.groonga.url
  options[:read_timeout] = @config.groonga.read_timeout
  options[:backend] = :synchronous

  Groonga::Client.open(options) do |client|
    processor = CommandProcessor.new(@config,
                                     client,
                                     target_commands: [],
                                     target_tables: [],
                                     target_columns: [])
    pending.sort_by(&:timestamp).each do |target|
      target.apply(@logger, client, processor)
      applied_at = [target.timestamp.to_i, target.timestamp.nsec]
      @status.update("start_time" => applied_at)
    end
  end
end
56
+
57
+ private
58
# Builds a UTC Time from calendar fields with nanosecond precision.
#
# The seconds field is passed to Time.utc as an exact Rational so that
# +nanosecond+ is preserved without floating point rounding.
def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
  total_nanoseconds = second * 1_000_000_000 + nanosecond
  exact_second = Rational(total_nanoseconds, 1_000_000_000)
  Time.utc(year, month, day, hour, minute, exact_second)
end
67
+
68
# Returns the UTC time of the last applied delta.
#
# The status stores it as a [unix seconds, nanoseconds] pair; when no
# seconds value is recorded, the Unix epoch is returned.
def read_current_status
  unix_time, nanosecond = @status.start_time
  return Time.at(0).utc unless unix_time

  base = Time.at(unix_time).utc
  build_time(base.year,
             base.month,
             base.day,
             base.hour,
             base.min,
             base.sec,
             nanosecond)
end
83
+
84
# Yields every delta file under +dir+ whose timestamp is in the range
# (min_timestamp, max_timestamp].
#
# dir::              directory to scan (one level, plus day directories)
# min_timestamp::    exclusive lower bound, or nil for no lower bound
# max_timestamp::    inclusive upper bound, or nil for no upper bound
# accept_directory:: when true, sub directories named like a day are
#                    scanned too (one level deep only)
#
# The block receives (path, timestamp, action, post_match) as produced by
# parse_file_timestamp. Entries whose names cannot be parsed are ignored.
def each_target_path(dir,
                     min_timestamp,
                     max_timestamp,
                     accept_directory: true,
                     &block)
  # A day directory's timestamp is the midnight that starts the day, while
  # it holds files for the whole day. Comparing that midnight with
  # min_timestamp itself would skip the directory of the day min_timestamp
  # falls in, and with it every not-yet-applied file of that day. So
  # directories are filtered by day; files are still filtered exactly.
  min_day = nil
  if min_timestamp
    min_utc = min_timestamp.getutc
    min_day = Time.utc(min_utc.year, min_utc.month, min_utc.day)
  end
  Dir.glob("#{dir}/*") do |path|
    base_name = File.basename(path)
    if accept_directory && File.directory?(path)
      timestamp = parse_directory_timestamp(base_name)
      next if timestamp.nil?
      next if min_day && timestamp < min_day
      next if max_timestamp && timestamp > max_timestamp
      each_target_path(path,
                       min_timestamp,
                       max_timestamp,
                       accept_directory: false,
                       &block)
    elsif File.file?(path)
      timestamp, action, post_match = parse_file_timestamp(base_name)
      next if timestamp.nil?
      next if min_timestamp && timestamp <= min_timestamp
      next if max_timestamp && timestamp > max_timestamp
      yield(path, timestamp, action, post_match)
    end
  end
end
110
+
111
# Yields (path, timestamp) for every packed directory in "#{dir}/packed".
#
# Packed directories are only considered when nothing has been applied yet,
# i.e. when +min_timestamp+ converts to 0 (nil or the Unix epoch). A packed
# directory must be named with a bare timestamp: no action part and no
# trailing suffix. +max_timestamp+ is accepted for signature symmetry with
# each_target_path but is not consulted.
def each_packed_target_path(dir, min_timestamp, max_timestamp)
  return unless min_timestamp.to_i.zero?
  Dir.glob("#{dir}/packed/*") do |path|
    next unless File.directory?(path)
    timestamp, action, post_match = parse_file_timestamp(File.basename(path))
    # parse_file_timestamp returns nil for names that are not timestamps;
    # skip them instead of failing on nil.empty? below.
    next if timestamp.nil?
    next if action
    next unless post_match.empty?
    yield(path, timestamp)
  end
end
121
+
122
# Collects every applicable target below +dir+: schema targets first, then
# the targets of each table directory found in "#{dir}/data".
#
# Returns the collected targets as an Array (not sorted here).
def list_targets(dir, start_time, current_timestamp)
  collected = []
  list_schema_targets(dir, start_time, current_timestamp, collected)
  Dir.glob("#{dir}/data/*") do |table_dir|
    if File.directory?(table_dir)
      table_name = File.basename(table_dir)
      list_table_targets(table_dir,
                         table_name,
                         start_time,
                         current_timestamp,
                         collected)
    end
  end
  collected
end
132
+
133
# Yields a SchemaTarget for every schema delta found by each_target_path:
# files without an action part whose remainder after the timestamp is
# exactly ".grn".
def each_schema_target(dir, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, suffix|
    if !action && suffix == ".grn"
      yield(SchemaTarget.new(path, timestamp))
    end
  end
end
142
+
143
# Appends the schema targets under "#{dir}/schema" to +targets+.
#
# When packed schema directories are available, only the one with the
# greatest timestamp is used: it is appended first with its contained
# schema targets attached, and regular schema targets are then limited to
# those newer than that packed timestamp. Without a packed directory the
# lower bound is +start_time+.
def list_schema_targets(dir, start_time, current_timestamp, targets)
  packed = nil
  each_packed_target_path("#{dir}/schema",
                          start_time,
                          current_timestamp) do |path, timestamp|
    # Keep the current candidate only when it is strictly newer.
    next if packed && packed.timestamp > timestamp
    packed = PackedSchemaTarget.new(path, timestamp)
  end

  unless packed.nil?
    targets << packed
    each_schema_target(packed.path, nil, nil) do |packed_child|
      packed.targets << packed_child
    end
  end

  lower_bound = packed&.timestamp || start_time
  each_schema_target("#{dir}/schema", lower_bound, current_timestamp) do |target|
    targets << target
  end
end
165
+
166
# File suffixes (the text after the timestamp and action) that are treated
# as table deltas. Frozen so the shared constant cannot be mutated.
TABLE_TARGET_SUFFIXES = [".grn", ".parquet"].freeze

# Yields a TableTarget for every table delta found by each_target_path:
# files that carry an action part and whose suffix is listed in
# TABLE_TARGET_SUFFIXES. +name+ is the table name given to each target.
def each_table_target(dir, name, min_timestamp, max_timestamp)
  each_target_path(dir,
                   min_timestamp,
                   max_timestamp) do |path, timestamp, action, post_match|
    next if action.nil?
    next unless TABLE_TARGET_SUFFIXES.include?(post_match)
    yield(TableTarget.new(path, timestamp, name, action))
  end
end
176
+
177
# Appends the targets of table +name+, stored under +dir+, to +targets+.
#
# When packed directories are available, only the one with the greatest
# timestamp is used: it is appended first with its contained table targets
# attached, and regular table targets are then limited to those newer than
# that packed timestamp. Without a packed directory the lower bound is
# +start_time+.
def list_table_targets(dir, name, start_time, current_timestamp, targets)
  packed = nil
  each_packed_target_path(dir, start_time, current_timestamp) do |path, timestamp|
    # Keep the current candidate only when it is strictly newer.
    next if packed && packed.timestamp > timestamp
    packed = PackedTableTarget.new(path, timestamp, name)
  end

  unless packed.nil?
    targets << packed
    each_table_target(packed.path, name, nil, nil) do |packed_child|
      packed.targets << packed_child
    end
  end

  lower_bound = packed&.timestamp || start_time
  each_table_target(dir, name, lower_bound, current_timestamp) do |target|
    targets << target
  end
end
200
+
201
# Parses a day directory name of the exact form "YYYY-MM-DD" into the UTC
# midnight of that day. Returns nil for any other name.
def parse_directory_timestamp(base_name)
  match = /\A(\d{4})-(\d{2})-(\d{2})\z/.match(base_name)
  return nil if match.nil?

  year, month, day = match.captures.map(&:to_i)
  build_time(year, month, day)
end
213
+
214
# Parses a delta file (or packed directory) name that starts with
# "YYYY-MM-DD-hh-mm-ss-nnnnnnnnn", optionally followed by "-ACTION".
#
# Returns [timestamp, action, rest] where +timestamp+ is a UTC Time with
# nanosecond precision, +action+ is the optional action word (nil when
# absent) and +rest+ is whatever follows the matched prefix (for example
# ".grn"). Returns nil when the name does not start with a timestamp.
def parse_file_timestamp(base_name)
  match = /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/.match(base_name)
  return nil unless match

  # The first seven captures are the time fields; the eighth is the action.
  *time_fields, action = match.captures
  timestamp = build_time(*time_fields.map(&:to_i))
  [timestamp, action, match.post_match]
end
238
+
239
# Mixin that brackets an apply operation with start/finish log messages.
module ApplyLoggable
  private

  # Logs "Start applying: <path>", runs the block, logs "Applied: <path>"
  # and returns the block's value. The second message is not logged when
  # the block raises.
  def apply_log(logger, path)
    logger.info("Start applying: #{path}")
    outcome = yield
    logger.info("Applied: #{path}")
    outcome
  end
end
248
+
249
# A single schema delta file together with its timestamp.
class SchemaTarget
  include ApplyLoggable

  attr_reader :path, :timestamp

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
  end

  # Loads the file through +processor+, logging before and after.
  # +client+ is accepted for interface parity with the other targets and is
  # not used here.
  def apply(logger, client, processor)
    apply_log(logger, @path) { processor.load(@path) }
  end
end
265
+
266
# A packed schema directory: a timestamped container whose schema targets
# are collected in #targets and applied together.
class PackedSchemaTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :targets

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
    @targets = []
  end

  # Applies the contained targets in ascending timestamp order, logging
  # the start and end of the whole pack.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      ordered = @targets.sort_by(&:timestamp)
      ordered.each { |target| target.apply(logger, client, processor) }
    end
  end
end
286
+
287
# A single table delta file (".grn" commands or a Parquet data file) for
# table +name+, tagged with the action parsed from its file name.
class TableTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :name, :action

  def initialize(path, timestamp, name, action)
    @path, @timestamp = path, timestamp
    @name, @action = name, action
  end

  # Applies the delta, logging before and after: ".grn" files are loaded
  # through +processor+; every other file is read as an Arrow table and
  # sent to Groonga with a load command.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      if @path.end_with?(".grn")
        processor.load(@path)
      else
        load_arrow_table(client, processor)
      end
    end
  end

  private

  # Sends the file's records via "load" and lets +processor+ report the
  # response.
  def load_arrow_table(client, processor)
    # TODO: Add support for @action == "delete"
    records = Arrow::Table.load(@path)
    load_command = Groonga::Command::Load.new(table: @name,
                                              values: records,
                                              command_version: "3")
    load_response = client.load(load_command.arguments)
    processor.process_response(load_response, load_command)
  end
end
317
+
318
# A packed directory of table +name+: a timestamped container whose table
# targets are collected in #targets and applied together.
class PackedTableTarget
  include ApplyLoggable

  attr_reader :path, :timestamp, :name, :targets

  def initialize(path, timestamp, name)
    @path, @timestamp, @name = path, timestamp, name
    @targets = []
  end

  # Applies the contained targets in ascending timestamp order, logging
  # the start and end of the whole pack.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      ordered = @targets.sort_by(&:timestamp)
      ordered.each { |target| target.apply(logger, client, processor) }
    end
  end
end
340
+
341
# Command processor that reports every response through the configured
# logger and handles failures according to config.on_error.
class CommandProcessor < Groonga::Client::CommandProcessor
  # Keeps +config+ and forwards all remaining arguments to the parent
  # initializer unchanged.
  def initialize(config, *args)
    @config = config
    super(*args)
  end

  # Reports the outcome of +command+ given its +response+.
  #
  # Successful responses are logged at info level. For failures the
  # message is handled per config.on_error: "ignore" drops it, "warning"
  # logs it as a warning, "error" raises ExecutionError. Any other value
  # is treated like "ignore". For "load" commands the number of loaded
  # records is included in the message.
  def process_response(response, command)
    loaded_prefix = ""
    if command.command_name == "load"
      # Remove :values before the command is rendered into the message.
      command.arguments.delete(:values)
      counted =
        if response.success?
          response
        else
          # Re-wrap the failed response as a load response to read the
          # loaded record count from it.
          Groonga::Client::Response::Load.new(command,
                                              response.header,
                                              response.body)
        end
      loaded_prefix = "#{counted.n_loaded_records}: "
    end

    if response.success?
      @config.logger.info("Processed: #{response.elapsed_time}: " \
                          "#{command.command_name}: #{loaded_prefix}" \
                          "#{command.to_command_format}")
    else
      failed_message = "Failed to process: #{response.return_code}: " \
                       "#{response.elapsed_time}: " \
                       "#{response.error_message}: " \
                       "#{command.command_name}: #{loaded_prefix}" \
                       "#{command.to_command_format}"
      # "ignore" (and unknown values) intentionally do nothing.
      case @config.on_error
      when "warning"
        @config.logger.warn(failed_message)
      when "error"
        raise ExecutionError, failed_message
      end
    end
  end
end
385
+ end
386
+ end