groonga-delta 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +42 -0
- data/lib/groonga-delta/command.rb +10 -0
- data/lib/groonga-delta/config.rb +1 -1
- data/lib/groonga-delta/error.rb +20 -0
- data/lib/groonga-delta/import-command.rb +7 -2
- data/lib/groonga-delta/import-config.rb +43 -4
- data/lib/groonga-delta/import-status.rb +12 -4
- data/lib/groonga-delta/local-delta.rb +12 -282
- data/lib/groonga-delta/local-reader.rb +353 -0
- data/lib/groonga-delta/local-source.rb +3 -3
- data/lib/groonga-delta/local-vacuumer.rb +39 -0
- data/lib/groonga-delta/{writer.rb → local-writer.rb} +17 -5
- data/lib/groonga-delta/mapping.rb +16 -3
- data/lib/groonga-delta/mysql-source.rb +84 -131
- data/lib/groonga-delta/status.rb +3 -1
- data/lib/groonga-delta/version.rb +1 -1
- metadata +5 -3
@@ -0,0 +1,353 @@
|
|
1
|
+
# Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "fileutils"
|
17
|
+
|
18
|
+
require "groonga/command"
|
19
|
+
require "parquet"
|
20
|
+
|
21
|
+
require_relative "error"
|
22
|
+
|
23
|
+
module GroongaDelta
|
24
|
+
class LocalReader
|
25
|
+
# Creates a reader rooted at +dir+.
#
# logger:: logger object stored on the instance.
# dir::    directory whose "schema" and "data" sub-directories are scanned
#          when targets are listed.
def initialize(logger, dir)
  @logger, @dir = logger, dir
end
|
29
|
+
|
30
|
+
# Yields every target found under the reader's directory, ordered by
# ascending timestamp.
#
# min_timestamp:: optional lower bound handed to the target listing.
# max_timestamp:: optional upper bound handed to the target listing.
#
# Without a block, returns an Enumerator bound to the same arguments.
def each(min_timestamp=nil, max_timestamp=nil, &block)
  return to_enum(__method__, min_timestamp, max_timestamp) if block.nil?

  found = list_targets(@dir, min_timestamp, max_timestamp)
  in_order = found.sort_by do |target|
    target.timestamp
  end
  in_order.each(&block)
end
|
38
|
+
|
39
|
+
# Builds a UTC Time from calendar fields.
#
# The seconds are passed to Time.utc as a single Rational that combines
# +second+ and +nanosecond+, so the nanosecond part is kept exactly.
def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
  nanoseconds_per_second = 1_000_000_000
  total_nanoseconds = second * nanoseconds_per_second + nanosecond
  exact_second = Rational(total_nanoseconds, nanoseconds_per_second)
  Time.utc(year, month, day, hour, minute, exact_second)
end
|
48
|
+
|
49
|
+
private
|
50
|
+
# Walks the entries directly under +dir+ and yields
# (path, timestamp, action, post_match) for each regular file whose name
# parses as a file timestamp and lies inside the requested window.
#
# * Files are kept when min_timestamp < timestamp <= max_timestamp
#   (either bound may be nil, meaning "unbounded").
# * When +accept_directory+ is true, sub-directories whose name parses as a
#   day are descended into once (the recursive call disables further
#   descent). A day directory is skipped when it lies outside the window
#   truncated to whole days.
def each_target_path(dir,
                     min_timestamp,
                     max_timestamp,
                     accept_directory: true,
                     &block)
  truncate_to_day = lambda do |time|
    time && Time.utc(time.year, time.month, time.day)
  end
  first_day = truncate_to_day.call(min_timestamp)
  last_day = truncate_to_day.call(max_timestamp)

  Dir.glob("#{dir}/*") do |path|
    base_name = File.basename(path)
    if accept_directory && File.directory?(path)
      day = parse_directory_timestamp(base_name)
      next if day.nil?
      next if first_day && day < first_day
      next if last_day && day > last_day
      each_target_path(path,
                       min_timestamp,
                       max_timestamp,
                       accept_directory: false,
                       &block)
    elsif File.file?(path)
      timestamp, action, post_match = parse_file_timestamp(base_name)
      next if timestamp.nil?
      # The lower bound is exclusive, the upper bound is inclusive.
      next if min_timestamp && timestamp <= min_timestamp
      next if max_timestamp && timestamp > max_timestamp
      yield(path, timestamp, action, post_match)
    end
  end
end
|
86
|
+
|
87
|
+
# Yields (path, timestamp) for every packed directory under "#{dir}/packed".
#
# Packed directories are only considered when no lower bound is in effect,
# i.e. when +min_timestamp+ is nil or converts to 0 with #to_i.
# +max_timestamp+ is accepted but not consulted here.
# NOTE(review): confirm that ignoring max_timestamp is intended for callers
# that pass an upper bound.
#
# A directory qualifies when its name is a bare file timestamp: it has no
# action suffix and nothing follows the timestamp. Directories whose name
# does not parse as a timestamp at all are skipped.
def each_packed_target_path(dir, min_timestamp, max_timestamp)
  return unless min_timestamp.to_i.zero?
  Dir.glob("#{dir}/packed/*") do |path|
    next unless File.directory?(path)
    timestamp, action, post_match = parse_file_timestamp(File.basename(path))
    # parse_file_timestamp returns nil for unrelated names; without this
    # guard post_match would be nil and #empty? would raise NoMethodError.
    next if timestamp.nil?
    next if action
    next unless post_match.empty?
    yield(path, timestamp)
  end
end
|
97
|
+
|
98
|
+
# Collects all targets under +dir+ into a new array and returns it.
#
# Schema targets are listed first, then the targets of each directory
# found directly under "#{dir}/data"; the directory's base name is passed
# along as the table name. The result is not sorted here.
def list_targets(dir, min_timestamp, max_timestamp)
  targets = []
  list_schema_targets(dir, min_timestamp, max_timestamp, targets)
  table_dirs = Dir.glob("#{dir}/data/*").select do |path|
    File.directory?(path)
  end
  table_dirs.each do |table_dir|
    list_table_targets(table_dir,
                       File.basename(table_dir),
                       min_timestamp,
                       max_timestamp,
                       targets)
  end
  targets
end
|
108
|
+
|
109
|
+
# Yields a SchemaTarget for each timestamped file under +dir+ that carries
# no action in its name and whose remainder after the timestamp is exactly
# ".grn".
def each_schema_target(dir, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, post_match|
    if !action && post_match == ".grn"
      yield(SchemaTarget.new(path, timestamp))
    end
  end
end
|
118
|
+
|
119
|
+
# Appends the schema targets found under "#{dir}/schema" to +targets+.
#
# If packed schema directories are reported, only the one with the greatest
# timestamp is used (on equal timestamps the one reported later wins). It
# is appended first and filled with every schema target inside it. Its
# timestamp then replaces +min_timestamp+ as the lower bound for the
# regular schema files that are appended afterwards.
def list_schema_targets(dir, min_timestamp, max_timestamp, targets)
  schema_dir = "#{dir}/schema"
  newest_packed = nil
  each_packed_target_path(schema_dir, min_timestamp, max_timestamp) do |path, timestamp|
    superseded = newest_packed && newest_packed.timestamp > timestamp
    newest_packed = PackedSchemaTarget.new(path, timestamp) unless superseded
  end

  unless newest_packed.nil?
    targets << newest_packed
    each_schema_target(newest_packed.path, nil, nil) do |target|
      newest_packed.targets << target
    end
  end

  lower_bound = (newest_packed && newest_packed.timestamp) || min_timestamp
  each_schema_target(schema_dir, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
|
141
|
+
|
142
|
+
TABLE_TARGET_SUFFIXES = [".grn", ".parquet"]
|
143
|
+
# Yields a TableTarget for each timestamped file under +dir+ that carries
# an action in its name and whose remainder after the action is one of
# TABLE_TARGET_SUFFIXES. +name+ is passed through as the table name.
def each_table_target(dir, name, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, post_match|
    unless action.nil?
      if TABLE_TARGET_SUFFIXES.include?(post_match)
        yield(TableTarget.new(path, timestamp, name, action))
      end
    end
  end
end
|
152
|
+
|
153
|
+
# Appends the targets of the table stored in +dir+ to +targets+.
#
# If packed directories are reported for the table, only the one with the
# greatest timestamp is used (on equal timestamps the one reported later
# wins). It is appended first and filled with every table target inside
# it. Its timestamp then replaces +min_timestamp+ as the lower bound for
# the regular table files that are appended afterwards.
def list_table_targets(dir, name, min_timestamp, max_timestamp, targets)
  newest_packed = nil
  each_packed_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp|
    superseded = newest_packed && newest_packed.timestamp > timestamp
    newest_packed = PackedTableTarget.new(path, timestamp, name) unless superseded
  end

  unless newest_packed.nil?
    targets << newest_packed
    each_table_target(newest_packed.path, name, nil, nil) do |target|
      newest_packed.targets << target
    end
  end

  lower_bound = (newest_packed && newest_packed.timestamp) || min_timestamp
  each_table_target(dir, name, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
|
176
|
+
|
177
|
+
# Parses a directory name of the exact form "YYYY-MM-DD".
#
# Returns the result of build_time for that year, month and day, or nil
# when the name has any other shape.
def parse_directory_timestamp(base_name)
  match = /\A(\d{4})-(\d{2})-(\d{2})\z/.match(base_name)
  return nil if match.nil?

  year, month, day = match.captures.map(&:to_i)
  build_time(year, month, day)
end
|
189
|
+
|
190
|
+
# Parses a name that starts with
# "YYYY-MM-DD-hh-mm-ss-nnnnnnnnn" optionally followed by "-<action>".
#
# Returns [timestamp, action, rest] where +timestamp+ comes from
# build_time, +action+ is the word after the timestamp (nil when absent)
# and +rest+ is whatever follows the matched prefix (e.g. ".grn").
# Returns nil when the name does not start with such a timestamp.
def parse_file_timestamp(base_name)
  pattern = /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/
  match = pattern.match(base_name)
  return nil unless match

  # The first seven captures are numeric time fields; the last is the action.
  *time_fields, action = match.captures
  timestamp = build_time(*time_fields.map(&:to_i))
  [timestamp, action, match.post_match]
end
|
214
|
+
|
215
|
+
# Mixin that brackets an operation on a path with two info-level log lines.
module Loggable
  private

  # Logs "<before_message>: <path>", runs the block, logs
  # "<after_message>: <path>" and returns the block's value. If the block
  # raises, the second line is not logged.
  def log(logger, path, before_message, after_message)
    logger.info("#{before_message}: #{path}")
    yield.tap do
      logger.info("#{after_message}: #{path}")
    end
  end

  # Runs the block between "Start applying" and "Applied" log lines.
  def apply_log(logger, path, &block)
    log(logger, path, "Start applying", "Applied", &block)
  end

  # Runs the block between "Start vacuuming" and "Vacuumed" log lines.
  def vacuum_log(logger, path, &block)
    log(logger, path, "Start vacuuming", "Vacuumed", &block)
  end
end
|
232
|
+
|
233
|
+
# A single schema file identified by its path and timestamp.
class SchemaTarget
  include Loggable

  attr_reader :path, :timestamp

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
  end

  # Loads the file through +processor+, logging before and after.
  # +client+ is accepted but not used by this target.
  def apply(logger, client, processor)
    apply_log(logger, @path) { processor.load(@path) }
  end

  # Removes the file from disk, logging before and after.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end
end
|
255
|
+
|
256
|
+
# A packed schema directory together with the schema targets collected
# from inside it (appended to #targets by the code that creates it).
class PackedSchemaTarget
  include Loggable

  attr_reader :path, :timestamp, :targets

  def initialize(path, timestamp)
    @path, @timestamp = path, timestamp
    @targets = []
  end

  # Applies every contained target in ascending timestamp order.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      each_target_by_timestamp do |target|
        target.apply(logger, client, processor)
      end
    end
  end

  # Vacuums every contained target in ascending timestamp order.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      each_target_by_timestamp do |target|
        target.vacuum(logger)
      end
    end
  end

  private

  # Iterates over the contained targets sorted by timestamp.
  def each_target_by_timestamp(&block)
    @targets.sort_by(&:timestamp).each(&block)
  end
end
|
284
|
+
|
285
|
+
# A single data file for the table +name+, tagged with the action taken
# from its file name.
class TableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :action

  def initialize(path, timestamp, name, action)
    @path, @timestamp = path, timestamp
    @name, @action = name, action
  end

  # Applies the file, logging before and after. Paths ending in ".grn" are
  # loaded through +processor+; any other path is read with Arrow and sent
  # through +client+ as a load command.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      if @path.end_with?(".grn")
        processor.load(@path)
      else
        load_with_arrow(client, processor)
      end
    end
  end

  # Removes the file from disk, logging before and after.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end

  private

  # Reads the file as an Arrow table, sends it to +client+ as a "load"
  # command for the table and hands the response to +processor+.
  def load_with_arrow(client, processor)
    # TODO: Add support for @action == "delete"
    table = Arrow::Table.load(@path)
    command = Groonga::Command::Load.new(table: @name,
                                         values: table,
                                         command_version: "3")
    response = client.load(command.arguments)
    processor.process_response(response, command)
  end
end
|
321
|
+
|
322
|
+
# A packed directory of the table +name+ together with the table targets
# collected from inside it (appended to #targets by the code that creates
# it).
class PackedTableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :targets

  def initialize(path, timestamp, name)
    @path, @timestamp = path, timestamp
    @name = name
    @targets = []
  end

  # Applies every contained target in ascending timestamp order.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      each_target_by_timestamp do |target|
        target.apply(logger, client, processor)
      end
    end
  end

  # Vacuums every contained target in ascending timestamp order.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      each_target_by_timestamp do |target|
        target.vacuum(logger)
      end
    end
  end

  private

  # Iterates over the contained targets sorted by timestamp.
  def each_target_by_timestamp(&block)
    @targets.sort_by(&:timestamp).each(&block)
  end
end
|
352
|
+
end
|
353
|
+
end
|
@@ -15,15 +15,15 @@
|
|
15
15
|
|
16
16
|
require "groonga/command/parser"
|
17
17
|
|
18
|
-
require_relative "writer"
|
18
|
+
require_relative "local-writer"
|
19
19
|
|
20
20
|
module GroongaDelta
|
21
21
|
class LocalSource
|
22
|
-
def initialize(config, status)
|
22
|
+
def initialize(config, status, writer)
|
23
23
|
@logger = config.logger
|
24
|
-
@writer = Writer.new(@logger, config.delta_dir)
|
25
24
|
@config = config.local
|
26
25
|
@status = status.local
|
26
|
+
@writer = writer
|
27
27
|
end
|
28
28
|
|
29
29
|
def import
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "fileutils"
|
17
|
+
|
18
|
+
require_relative "local-reader"
|
19
|
+
|
20
|
+
module GroongaDelta
|
21
|
+
# Vacuums delta targets that are older than the configured keep span.
class LocalVacuumer
  # config:: object providing #logger, #delta_dir and #vacuum (the latter
  #          must respond to #keep_span).
  def initialize(config)
    @logger = config.logger
    @delta_dir = config.delta_dir
    @config = config.vacuum
  end

  # Calls #vacuum on every target the reader yields when given
  # "now (UTC) minus keep_span" as its upper timestamp bound.
  # Does nothing when keep_span is nil or negative.
  def vacuum
    keep_span = @config.keep_span
    return if keep_span.nil? || keep_span < 0

    reader = LocalReader.new(@logger, @delta_dir)
    cutoff = Time.now.utc - keep_span
    reader.each(nil, cutoff) do |target|
      target.vacuum(@logger)
    end
  end
end
|
39
|
+
end
|
@@ -20,10 +20,11 @@ require "groonga/command"
|
|
20
20
|
require "parquet"
|
21
21
|
|
22
22
|
module GroongaDelta
|
23
|
-
class
|
24
|
-
def initialize(
|
25
|
-
@
|
26
|
-
@
|
23
|
+
class LocalWriter
|
24
|
+
def initialize(config)
|
25
|
+
@config = config
|
26
|
+
@logger = @config.logger
|
27
|
+
@dir = @config.delta_dir
|
27
28
|
end
|
28
29
|
|
29
30
|
def write_upserts(table, records, packed: false)
|
@@ -74,7 +75,7 @@ module GroongaDelta
|
|
74
75
|
delete = Groonga::Command::Delete.new
|
75
76
|
delete[:table] = table
|
76
77
|
keys.each do |key|
|
77
|
-
delete[:key] = key
|
78
|
+
delete[:key] = format_key(key)
|
78
79
|
output.puts(delete.to_command_format)
|
79
80
|
end
|
80
81
|
end
|
@@ -131,5 +132,16 @@ module GroongaDelta
|
|
131
132
|
open_output: open_output,
|
132
133
|
&block)
|
133
134
|
end
|
135
|
+
|
136
|
+
# Converts a record key to the value written into a delete command.
#
# Integer and Float keys become their #to_s form, Time keys are formatted
# as "YYYY-MM-DD hh:mm:ss.uuuuuu" (six fractional digits), and any other
# key is returned unchanged.
def format_key(key)
  return key.to_s if key.is_a?(Integer) || key.is_a?(Float)
  return key.strftime("%Y-%m-%d %H:%M:%S.%6N") if key.is_a?(Time)

  key
end
|
134
146
|
end
|
135
147
|
end
|
@@ -142,8 +142,12 @@ module GroongaDelta
|
|
142
142
|
# Builds a Groonga record from +source_record+.
#
# Each configured Groonga column generates its value from the source
# record; the result is a Hash keyed by the column name as a Symbol.
# Any error raised while handling a column is re-raised as a
# GenerationError carrying the source record, the column and the
# original error.
def generate_record(source_record)
  @groonga_columns.each_with_object({}) do |groonga_column, record|
    begin
      value = groonga_column.generate_value(source_record)
      record[groonga_column.name.to_sym] = value
    rescue => error
      raise GenerationError.new(source_record, groonga_column, error)
    end
  end
end
|
@@ -255,7 +259,16 @@ module GroongaDelta
|
|
255
259
|
end
|
256
260
|
|
257
261
|
def normalize_value(value)
|
258
|
-
case type
|
262
|
+
case @type
|
263
|
+
when nil, "ShortText", "Text", "LongText"
|
264
|
+
encoding = value.encoding
|
265
|
+
if encoding == Encoding::ASCII_8BIT
|
266
|
+
value.force_encoding(Encoding::UTF_8)
|
267
|
+
return value if value.valid_encoding?
|
268
|
+
value.encode(Encoding::UTF_8, encoding)
|
269
|
+
else
|
270
|
+
value.encode(Encoding::UTF_8)
|
271
|
+
end
|
259
272
|
when "Time"
|
260
273
|
time_max = @restriction.time_max
|
261
274
|
time_min = @restriction.time_min
|