groonga-delta 1.0.0 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +42 -0
- data/lib/groonga-delta/command.rb +10 -0
- data/lib/groonga-delta/config.rb +1 -1
- data/lib/groonga-delta/error.rb +20 -0
- data/lib/groonga-delta/import-command.rb +7 -2
- data/lib/groonga-delta/import-config.rb +43 -4
- data/lib/groonga-delta/import-status.rb +12 -4
- data/lib/groonga-delta/local-delta.rb +12 -282
- data/lib/groonga-delta/local-reader.rb +353 -0
- data/lib/groonga-delta/local-source.rb +3 -3
- data/lib/groonga-delta/local-vacuumer.rb +39 -0
- data/lib/groonga-delta/{writer.rb → local-writer.rb} +17 -5
- data/lib/groonga-delta/mapping.rb +16 -3
- data/lib/groonga-delta/mysql-source.rb +84 -131
- data/lib/groonga-delta/status.rb +3 -1
- data/lib/groonga-delta/version.rb +1 -1
- metadata +5 -3
@@ -0,0 +1,353 @@
|
|
1
|
+
# Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "fileutils"
|
17
|
+
|
18
|
+
require "groonga/command"
|
19
|
+
require "parquet"
|
20
|
+
|
21
|
+
require_relative "error"
|
22
|
+
|
23
|
+
module GroongaDelta
|
24
|
+
class LocalReader
|
25
|
+
# Remembers the logger and the delta directory this reader works on.
#
# logger:: stored in @logger for later use.
# dir::    root directory; list_targets looks for "schema" and "data/*"
#          entries below it.
def initialize(logger, dir)
  @logger, @dir = logger, dir
end
|
29
|
+
|
30
|
+
# Yields every target found under @dir, ordered by ascending timestamp.
#
# min_timestamp and max_timestamp are handed to list_targets unchanged.
# When no block is given, an Enumerator for the same call is returned.
def each(min_timestamp=nil, max_timestamp=nil, &block)
  return to_enum(__method__, min_timestamp, max_timestamp) unless block_given?

  found = list_targets(@dir, min_timestamp, max_timestamp)
  in_time_order = found.sort_by(&:timestamp)
  in_time_order.each(&block)
end
|
38
|
+
|
39
|
+
# Builds a UTC Time from calendar fields plus a nanosecond component.
#
# The seconds and nanoseconds are merged into one Rational so that the
# sub-second part is handed to Time.utc without floating-point rounding.
def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
  nanoseconds_per_second = 1_000_000_000
  total_nanoseconds = second * nanoseconds_per_second + nanosecond
  exact_second = Rational(total_nanoseconds, nanoseconds_per_second)
  Time.utc(year, month, day, hour, minute, exact_second)
end
|
48
|
+
|
49
|
+
private
|
50
|
+
# Walks the entries directly under +dir+ and yields
# (path, timestamp, action, post_match) for each timestamped file.
#
# * A directory whose name parses as a date is descended into exactly one
#   level (the recursive call passes accept_directory: false). Date
#   directories are pre-filtered at day granularity against the bounds.
# * A file is yielded when its timestamp is strictly greater than
#   min_timestamp and not greater than max_timestamp; a nil bound is
#   not applied.
# * Entries whose names do not parse are ignored.
def each_target_path(dir,
                     min_timestamp,
                     max_timestamp,
                     accept_directory: true,
                     &block)
  first_day = min_timestamp &&
              Time.utc(min_timestamp.year, min_timestamp.month, min_timestamp.day)
  last_day = max_timestamp &&
             Time.utc(max_timestamp.year, max_timestamp.month, max_timestamp.day)

  Dir.glob("#{dir}/*") do |entry|
    entry_name = File.basename(entry)
    if accept_directory && File.directory?(entry)
      day = parse_directory_timestamp(entry_name)
      next if day.nil?
      next if (first_day && day < first_day) || (last_day && day > last_day)
      each_target_path(entry,
                       min_timestamp,
                       max_timestamp,
                       accept_directory: false,
                       &block)
    elsif File.file?(entry)
      timestamp, action, post_match = parse_file_timestamp(entry_name)
      next if timestamp.nil?
      too_old = min_timestamp && timestamp <= min_timestamp
      too_new = max_timestamp && timestamp > max_timestamp
      next if too_old || too_new
      yield(entry, timestamp, action, post_match)
    end
  end
end
|
86
|
+
|
87
|
+
# Yields (path, timestamp) for every packed directory under "#{dir}/packed".
#
# Nothing is yielded unless min_timestamp.to_i is zero (i.e. it is nil or
# the epoch). Only directories whose whole name is a timestamp are
# accepted: names with an action part or with trailing text are skipped.
#
# Directories whose names do not parse as a timestamp at all are skipped
# too; previously they caused a NoMethodError because post_match was nil.
#
# NOTE(review): max_timestamp is accepted but never consulted here —
# confirm with the callers whether that is intended.
def each_packed_target_path(dir, min_timestamp, max_timestamp)
  return unless min_timestamp.to_i.zero?
  Dir.glob("#{dir}/packed/*") do |path|
    next unless File.directory?(path)
    timestamp, action, post_match = parse_file_timestamp(File.basename(path))
    # parse_file_timestamp returns nil for names without a timestamp prefix.
    next if timestamp.nil?
    next if action
    next unless post_match.empty?
    yield(path, timestamp)
  end
end
|
97
|
+
|
98
|
+
# Collects all targets below +dir+ into one array and returns it.
#
# Schema targets are gathered first via list_schema_targets, followed by
# the targets of each sub-directory of "#{dir}/data"; the directory name
# is passed to list_table_targets as the table name.
def list_targets(dir, min_timestamp, max_timestamp)
  collected = []
  list_schema_targets(dir, min_timestamp, max_timestamp, collected)
  table_dirs = Dir.glob("#{dir}/data/*").select { |path| File.directory?(path) }
  table_dirs.each do |table_dir|
    list_table_targets(table_dir,
                       File.basename(table_dir),
                       min_timestamp,
                       max_timestamp,
                       collected)
  end
  collected
end
|
108
|
+
|
109
|
+
# Yields a SchemaTarget for each schema file reported by each_target_path.
#
# A schema file is one with no action part in its name and whose text
# after the timestamp is exactly ".grn".
def each_schema_target(dir, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, post_match|
    schema_file = !action && post_match == ".grn"
    yield(SchemaTarget.new(path, timestamp)) if schema_file
  end
end
|
118
|
+
|
119
|
+
# Appends the schema targets found under "#{dir}/schema" to +targets+.
#
# The packed directory with the greatest timestamp (the later one wins on
# a tie) is appended first as a PackedSchemaTarget, filled with the schema
# files it contains. The loose schema files are appended afterwards; when
# a packed target was found its timestamp replaces min_timestamp as the
# lower bound for them.
def list_schema_targets(dir, min_timestamp, max_timestamp, targets)
  schema_dir = "#{dir}/schema"

  packed = nil
  each_packed_target_path(schema_dir, min_timestamp, max_timestamp) do |path, timestamp|
    superseded = packed && packed.timestamp > timestamp
    packed = PackedSchemaTarget.new(path, timestamp) unless superseded
  end

  lower_bound = min_timestamp
  if packed
    targets << packed
    each_schema_target(packed.path, nil, nil) { |target| packed.targets << target }
    lower_bound = packed.timestamp || min_timestamp
  end

  each_schema_target(schema_dir, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
|
141
|
+
|
142
|
+
# File-name suffixes (the text after the timestamp and action) accepted as
# table data. Frozen so the shared constant cannot be modified by accident.
TABLE_TARGET_SUFFIXES = [".grn", ".parquet"].freeze

# Yields a TableTarget for each table data file reported by
# each_target_path.
#
# A file qualifies when its name carries an action part and the text after
# it is one of TABLE_TARGET_SUFFIXES. +name+ is passed through to the
# TableTarget as the table name.
def each_table_target(dir, name, min_timestamp, max_timestamp)
  each_target_path(dir,
                   min_timestamp,
                   max_timestamp) do |path, timestamp, action, post_match|
    next if action.nil?
    next unless TABLE_TARGET_SUFFIXES.include?(post_match)
    yield(TableTarget.new(path, timestamp, name, action))
  end
end
|
152
|
+
|
153
|
+
# Appends the targets of the table stored in +dir+ to +targets+.
#
# The packed directory with the greatest timestamp (the later one wins on
# a tie) is appended first as a PackedTableTarget, filled with the table
# files it contains. The loose table files are appended afterwards; when a
# packed target was found its timestamp replaces min_timestamp as the
# lower bound for them.
def list_table_targets(dir, name, min_timestamp, max_timestamp, targets)
  packed = nil
  each_packed_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp|
    superseded = packed && packed.timestamp > timestamp
    packed = PackedTableTarget.new(path, timestamp, name) unless superseded
  end

  lower_bound = min_timestamp
  if packed
    targets << packed
    each_table_target(packed.path, name, nil, nil) { |target| packed.targets << target }
    lower_bound = packed.timestamp || min_timestamp
  end

  each_table_target(dir, name, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
|
176
|
+
|
177
|
+
# Parses a directory name of the form "YYYY-MM-DD".
#
# Returns the corresponding UTC Time (via build_time), or nil when the
# name is not a date. Names that match the pattern but that Time rejects
# as out of range (e.g. "2022-13-01") also yield nil instead of letting
# ArgumentError escape, so a stray directory cannot abort the whole scan.
def parse_directory_timestamp(base_name)
  match = /\A(\d{4})-(\d{2})-(\d{2})\z/.match(base_name)
  return nil if match.nil?

  year, month, day = match.captures.map(&:to_i)
  build_time(year, month, day)
rescue ArgumentError
  # Time.utc raises ArgumentError for an out-of-range month or day.
  nil
end
|
189
|
+
|
190
|
+
# Parses a name starting with "YYYY-MM-DD-hh-mm-ss-nnnnnnnnn", optionally
# followed by "-<action>".
#
# Returns [timestamp, action, post_match]:
# * timestamp  - UTC Time with nanosecond precision (via build_time)
# * action     - the word after the timestamp, or nil when absent
# * post_match - whatever follows the matched prefix (e.g. ".grn")
#
# Returns nil when the name does not start with a timestamp. Names that
# match the pattern but that Time rejects as out of range (e.g. month 13)
# also yield nil instead of letting ArgumentError escape.
def parse_file_timestamp(base_name)
  pattern =
    /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/
  match = pattern.match(base_name)
  return nil if match.nil?

  # The first seven captures are the time fields, the eighth the action.
  *time_fields, action = match.captures
  timestamp = build_time(*time_fields.map(&:to_i))
  [timestamp, action, match.post_match]
rescue ArgumentError
  # Time.utc raises ArgumentError for out-of-range fields.
  nil
end
|
214
|
+
|
215
|
+
# Mixin that brackets an operation on a path with two info log lines.
module Loggable
  private

  # Logs "<before_message>: <path>", runs the block, then logs
  # "<after_message>: <path>" and returns the block's value. If the block
  # raises, the second line is not logged.
  def log(logger, path, before_message, after_message)
    logger.info("#{before_message}: #{path}")
    yield.tap { logger.info("#{after_message}: #{path}") }
  end

  # Wraps the block in "Start applying" / "Applied" log lines.
  def apply_log(logger, path, &block)
    log(logger, path, "Start applying", "Applied", &block)
  end

  # Wraps the block in "Start vacuuming" / "Vacuumed" log lines.
  def vacuum_log(logger, path, &block)
    log(logger, path, "Start vacuuming", "Vacuumed", &block)
  end
end
|
232
|
+
|
233
|
+
# A single schema file together with the timestamp parsed from its name.
class SchemaTarget
  include Loggable

  attr_reader :path, :timestamp

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
  end

  # Loads the file through processor.load, bracketed by apply log lines.
  # +client+ is accepted for interface parity with the other targets and
  # is not used here.
  def apply(logger, client, processor)
    apply_log(logger, @path) { processor.load(@path) }
  end

  # Deletes the file, bracketed by vacuum log lines.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end
end
|
255
|
+
|
256
|
+
# A packed schema directory; the schema targets found inside it are
# collected in #targets by the reader.
class PackedSchemaTarget
  include Loggable

  attr_reader :path, :timestamp, :targets

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
    @targets = []
  end

  # Applies the contained targets in ascending timestamp order.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      @targets.sort_by(&:timestamp).each { |target| target.apply(logger, client, processor) }
    end
  end

  # Vacuums the contained targets in ascending timestamp order. The packed
  # directory itself is not removed by this method.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      @targets.sort_by(&:timestamp).each { |target| target.vacuum(logger) }
    end
  end
end
|
284
|
+
|
285
|
+
# A single table data file: its path, the timestamp and action parsed from
# its name, and the name of the table it belongs to.
class TableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :action

  def initialize(path, timestamp, name, action)
    @path, @timestamp = path, timestamp
    @name, @action = name, action
  end

  # Applies the file, bracketed by apply log lines: ".grn" files go
  # through processor.load, anything else is loaded as an Arrow table and
  # sent with a load command.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      if @path.end_with?(".grn")
        processor.load(@path)
      else
        load_arrow_table(client, processor)
      end
    end
  end

  # Deletes the file, bracketed by vacuum log lines.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end

  private

  # Reads @path with Arrow::Table.load, sends it to @name through a load
  # command and hands the response to the processor.
  def load_arrow_table(client, processor)
    # TODO: Add support for @action == "delete"
    table = Arrow::Table.load(@path)
    command = Groonga::Command::Load.new(table: @name,
                                         values: table,
                                         command_version: "3")
    response = client.load(command.arguments)
    processor.process_response(response, command)
  end
end
|
321
|
+
|
322
|
+
# A packed directory of one table; the table targets found inside it are
# collected in #targets by the reader.
class PackedTableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :targets

  def initialize(path, timestamp, name)
    @path, @timestamp, @name = path, timestamp, name
    @targets = []
  end

  # Applies the contained targets in ascending timestamp order.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      @targets.sort_by(&:timestamp).each { |target| target.apply(logger, client, processor) }
    end
  end

  # Vacuums the contained targets in ascending timestamp order. The packed
  # directory itself is not removed by this method.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      @targets.sort_by(&:timestamp).each { |target| target.vacuum(logger) }
    end
  end
end
|
352
|
+
end
|
353
|
+
end
|
@@ -15,15 +15,15 @@
|
|
15
15
|
|
16
16
|
require "groonga/command/parser"
|
17
17
|
|
18
|
-
require_relative "writer"
|
18
|
+
require_relative "local-writer"
|
19
19
|
|
20
20
|
module GroongaDelta
|
21
21
|
class LocalSource
|
22
|
-
def initialize(config, status)
|
22
|
+
def initialize(config, status, writer)
|
23
23
|
@logger = config.logger
|
24
|
-
@writer = Writer.new(@logger, config.delta_dir)
|
25
24
|
@config = config.local
|
26
25
|
@status = status.local
|
26
|
+
@writer = writer
|
27
27
|
end
|
28
28
|
|
29
29
|
def import
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "fileutils"
|
17
|
+
|
18
|
+
require_relative "local-reader"
|
19
|
+
|
20
|
+
module GroongaDelta
|
21
|
+
# Removes delta targets that are older than the configured keep span.
class LocalVacuumer
  # Reads the logger, the delta directory and the vacuum settings from
  # +config+.
  def initialize(config)
    @logger = config.logger
    @delta_dir = config.delta_dir
    @config = config.vacuum
  end

  # Vacuums every target whose timestamp is not later than
  # "now - keep_span". Does nothing when keep_span is nil or negative.
  def vacuum
    keep_span = @config.keep_span
    return if keep_span.nil? || keep_span < 0

    newest_removable = Time.now.utc - keep_span
    LocalReader.new(@logger, @delta_dir).each(nil, newest_removable) do |target|
      target.vacuum(@logger)
    end
  end
end
|
39
|
+
end
|
@@ -20,10 +20,11 @@ require "groonga/command"
|
|
20
20
|
require "parquet"
|
21
21
|
|
22
22
|
module GroongaDelta
|
23
|
-
class
|
24
|
-
def initialize(
|
25
|
-
@
|
26
|
-
@
|
23
|
+
class LocalWriter
|
24
|
+
# Keeps +config+ and caches its logger and delta directory.
def initialize(config)
  @config = config
  @logger, @dir = config.logger, config.delta_dir
end
|
28
29
|
|
29
30
|
def write_upserts(table, records, packed: false)
|
@@ -74,7 +75,7 @@ module GroongaDelta
|
|
74
75
|
delete = Groonga::Command::Delete.new
|
75
76
|
delete[:table] = table
|
76
77
|
keys.each do |key|
|
77
|
-
delete[:key] = key
|
78
|
+
delete[:key] = format_key(key)
|
78
79
|
output.puts(delete.to_command_format)
|
79
80
|
end
|
80
81
|
end
|
@@ -131,5 +132,16 @@ module GroongaDelta
|
|
131
132
|
open_output: open_output,
|
132
133
|
&block)
|
133
134
|
end
|
135
|
+
|
136
|
+
# Converts a record key to the form written into a delete command.
#
# Integer and Float keys become their to_s text, Time keys are rendered
# as "YYYY-MM-DD hh:mm:ss.uuuuuu" (microsecond precision); any other key
# is returned unchanged.
def format_key(key)
  case key
  when Time then key.strftime("%Y-%m-%d %H:%M:%S.%6N")
  when Integer, Float then key.to_s
  else key
  end
end
|
134
146
|
end
|
135
147
|
end
|
@@ -142,8 +142,12 @@ module GroongaDelta
|
|
142
142
|
# Builds a Groonga record (Hash keyed by column name as Symbol) from
# +source_record+ by asking each entry of @groonga_columns for its value.
#
# Any StandardError raised while producing a column is re-raised as a
# GenerationError carrying the source record, the column and the original
# error.
def generate_record(source_record)
  @groonga_columns.each_with_object({}) do |groonga_column, record|
    begin
      value = groonga_column.generate_value(source_record)
      record[groonga_column.name.to_sym] = value
    rescue => error
      raise GenerationError.new(source_record, groonga_column, error)
    end
  end
end
|
@@ -255,7 +259,16 @@ module GroongaDelta
|
|
255
259
|
end
|
256
260
|
|
257
261
|
def normalize_value(value)
|
258
|
-
case type
|
262
|
+
case @type
|
263
|
+
when nil, "ShortText", "Text", "LongText"
|
264
|
+
encoding = value.encoding
|
265
|
+
if encoding == Encoding::ASCII_8BIT
|
266
|
+
value.force_encoding(Encoding::UTF_8)
|
267
|
+
return value if value.valid_encoding?
|
268
|
+
value.encode(Encoding::UTF_8, encoding)
|
269
|
+
else
|
270
|
+
value.encode(Encoding::UTF_8)
|
271
|
+
end
|
259
272
|
when "Time"
|
260
273
|
time_max = @restriction.time_max
|
261
274
|
time_min = @restriction.time_min
|