groonga-delta 1.0.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require "groonga/command"
19
+ require "parquet"
20
+
21
+ require_relative "error"
22
+
23
+ module GroongaDelta
24
+ class LocalReader
25
# Creates a reader.
#
# logger: stored as-is in @logger; this method does not use it further.
# dir: stored as-is in @dir; `each` passes it to `list_targets`.
def initialize(logger, dir)
  @logger, @dir = logger, dir
end
29
+
30
# Passes every target found under @dir to the block, ordered by ascending
# timestamp.
#
# min_timestamp, max_timestamp: optional bounds, forwarded unchanged to
# `list_targets`.
#
# Without a block, returns an Enumerator over the same call.
def each(min_timestamp=nil, max_timestamp=nil, &block)
  return to_enum(__method__, min_timestamp, max_timestamp) if block.nil?

  found = list_targets(@dir, min_timestamp, max_timestamp)
  ordered = found.sort_by { |target| target.timestamp }
  ordered.each(&block)
end
38
+
39
# Builds a UTC Time from calendar fields.
#
# second and nanosecond are combined into one exact Rational number of
# seconds, so the nanosecond part is not passed through a Float.
def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
  nanoseconds_per_second = 1_000_000_000
  total_nanoseconds = second * nanoseconds_per_second + nanosecond
  exact_seconds = Rational(total_nanoseconds, nanoseconds_per_second)
  Time.utc(year, month, day, hour, minute, exact_seconds)
end
48
+
49
+ private
50
# Yields (path, timestamp, action, post_match) for each file under dir whose
# name parses as a timestamp inside the requested range.
#
# dir: directory to scan (only its direct children are globbed).
# min_timestamp: exclusive lower bound (files with timestamp <= it are
#   skipped), or nil for no lower bound.
# max_timestamp: inclusive upper bound (files with timestamp > it are
#   skipped), or nil for no upper bound.
# accept_directory: when true, child directories whose names parse as a day
#   are scanned one level deeper; the recursive call passes false, so only
#   one directory level is ever descended.
#
# Day directories are compared against the bounds truncated to midnight.
# The truncation is done on the UTC view of each bound (getutc): taking
# year/month/day in the bound's own zone and feeding them to Time.utc could
# skip a whole day directory when the local date and the UTC date differ.
def each_target_path(dir,
                     min_timestamp,
                     max_timestamp,
                     accept_directory: true,
                     &block)
  if min_timestamp
    min_utc = min_timestamp.getutc
    min_timestamp_day = Time.utc(min_utc.year, min_utc.month, min_utc.day)
  end
  if max_timestamp
    max_utc = max_timestamp.getutc
    max_timestamp_day = Time.utc(max_utc.year, max_utc.month, max_utc.day)
  end
  Dir.glob("#{dir}/*") do |path|
    base_name = File.basename(path)
    if accept_directory && File.directory?(path)
      timestamp = parse_directory_timestamp(base_name)
      # Directories that are not named like a day are ignored.
      next if timestamp.nil?
      next if min_timestamp_day && timestamp < min_timestamp_day
      next if max_timestamp_day && timestamp > max_timestamp_day
      each_target_path(path,
                       min_timestamp,
                       max_timestamp,
                       accept_directory: false,
                       &block)
    elsif File.file?(path)
      timestamp, action, post_match = parse_file_timestamp(base_name)
      # Files whose names do not start with a timestamp are ignored.
      next if timestamp.nil?
      next if min_timestamp && timestamp <= min_timestamp
      next if max_timestamp && timestamp > max_timestamp
      yield(path, timestamp, action, post_match)
    end
  end
end
86
+
87
# Yields (path, timestamp) for each directory directly under "#{dir}/packed"
# whose name is exactly a timestamp (no action part, nothing after it).
#
# Nothing is yielded unless min_timestamp.to_i is zero (nil also gives zero).
# max_timestamp is accepted but not used by this method.
#
# Directories whose names do not parse as a timestamp are skipped; without
# that guard post_match would be nil and calling empty? on it would raise
# NoMethodError.
def each_packed_target_path(dir, min_timestamp, max_timestamp)
  return unless min_timestamp.to_i.zero?
  Dir.glob("#{dir}/packed/*") do |path|
    next unless File.directory?(path)
    timestamp, action, post_match = parse_file_timestamp(File.basename(path))
    next if timestamp.nil?
    next if action
    next unless post_match.empty?
    yield(path, timestamp)
  end
end
97
+
98
# Collects schema targets, then the targets of every directory directly
# under "#{dir}/data", into one new array and returns it.
#
# Each data directory's base name is passed to `list_table_targets` as the
# table name.
def list_targets(dir, min_timestamp, max_timestamp)
  collected = []
  list_schema_targets(dir, min_timestamp, max_timestamp, collected)
  Dir.glob("#{dir}/data/*").each do |table_dir|
    if File.directory?(table_dir)
      table_name = File.basename(table_dir)
      list_table_targets(table_dir,
                         table_name,
                         min_timestamp,
                         max_timestamp,
                         collected)
    end
  end
  collected
end
108
+
109
# Yields a SchemaTarget for each path reported by `each_target_path` that
# has no action part and whose text after the timestamp is exactly ".grn".
def each_schema_target(dir, min_timestamp, max_timestamp)
  each_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp, action, post_match|
    schema_file = !action && post_match == ".grn"
    yield(SchemaTarget.new(path, timestamp)) if schema_file
  end
end
118
+
119
# Appends the schema targets found under "#{dir}/schema" to targets.
#
# Among the packed directories yielded by `each_packed_target_path`, the one
# with the greatest timestamp is kept (on a tie the later-yielded one wins).
# If there is one, it is appended first, filled with every schema target
# inside it, and its timestamp replaces min_timestamp as the lower bound for
# the remaining, non-packed schema targets.
def list_schema_targets(dir, min_timestamp, max_timestamp, targets)
  schema_dir = "#{dir}/schema"

  newest_pack = nil
  each_packed_target_path(schema_dir, min_timestamp, max_timestamp) do |path, timestamp|
    unless newest_pack && newest_pack.timestamp > timestamp
      newest_pack = PackedSchemaTarget.new(path, timestamp)
    end
  end

  lower_bound = min_timestamp
  if newest_pack
    targets << newest_pack
    each_schema_target(newest_pack.path, nil, nil) do |target|
      newest_pack.targets << target
    end
    lower_bound = newest_pack.timestamp || min_timestamp
  end

  each_schema_target(schema_dir, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
141
+
142
# Text allowed after the timestamp/action part of a table delta file name.
# Frozen so the shared list cannot be changed by accident.
TABLE_TARGET_SUFFIXES = [".grn", ".parquet"].freeze

# Yields a TableTarget for each path reported by `each_target_path` that
# carries an action part and ends with one of TABLE_TARGET_SUFFIXES.
#
# name: passed through to every TableTarget that is built.
def each_table_target(dir, name, min_timestamp, max_timestamp)
  each_target_path(dir,
                   min_timestamp,
                   max_timestamp) do |path, timestamp, action, post_match|
    next if action.nil?
    next unless TABLE_TARGET_SUFFIXES.include?(post_match)
    yield(TableTarget.new(path, timestamp, name, action))
  end
end
152
+
153
# Appends the targets of one table directory to targets.
#
# Among the packed directories yielded by `each_packed_target_path`, the one
# with the greatest timestamp is kept (on a tie the later-yielded one wins).
# If there is one, it is appended first, filled with every table target
# inside it, and its timestamp replaces min_timestamp as the lower bound for
# the remaining, non-packed table targets.
#
# name: passed through to every target that is built.
def list_table_targets(dir, name, min_timestamp, max_timestamp, targets)
  newest_pack = nil
  each_packed_target_path(dir, min_timestamp, max_timestamp) do |path, timestamp|
    unless newest_pack && newest_pack.timestamp > timestamp
      newest_pack = PackedTableTarget.new(path, timestamp, name)
    end
  end

  lower_bound = min_timestamp
  if newest_pack
    targets << newest_pack
    each_table_target(newest_pack.path, name, nil, nil) do |target|
      newest_pack.targets << target
    end
    lower_bound = newest_pack.timestamp || min_timestamp
  end

  each_table_target(dir, name, lower_bound, max_timestamp) do |target|
    targets << target
  end
end
176
+
177
# Converts a directory name of the exact form "YYYY-MM-DD" into the value
# `build_time` returns for that year, month and day.
#
# Returns nil for any other name.
def parse_directory_timestamp(base_name)
  case base_name
  when /\A(\d{4})-(\d{2})-(\d{2})\z/
    date_fields = Regexp.last_match.captures.map(&:to_i)
    build_time(*date_fields)
  end
end
189
+
190
# Parses a name starting with "YYYY-MM-DD-hh-mm-ss-nnnnnnnnn", optionally
# followed by "-" and a word (the action).
#
# Returns [timestamp, action, post_match]: timestamp is what `build_time`
# returns for the seven numeric fields, action is the word or nil, and
# post_match is whatever follows the matched part of the name.
# Returns nil when the name does not start with that pattern.
def parse_file_timestamp(base_name)
  case base_name
  when /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/
    matched = Regexp.last_match
    # The first seven captures are numeric fields; the eighth is the action.
    *time_fields, action = matched.captures
    timestamp = build_time(*time_fields.map(&:to_i))
    [timestamp, action, matched.post_match]
  end
end
214
+
215
# Mixin with private helpers that wrap a block between two info-level log
# lines.
module Loggable
  private

  # Logs "#{before_message}: #{path}", runs the block, logs
  # "#{after_message}: #{path}" and returns the block's value. The second
  # line is not logged when the block raises.
  def log(logger, path, before_message, after_message)
    logger.info("#{before_message}: #{path}")
    yield.tap { logger.info("#{after_message}: #{path}") }
  end

  # Wraps the block in "Start applying" / "Applied" log lines.
  def apply_log(logger, path, &block)
    log(logger, path, "Start applying", "Applied", &block)
  end

  # Wraps the block in "Start vacuuming" / "Vacuumed" log lines.
  def vacuum_log(logger, path, &block)
    log(logger, path, "Start vacuuming", "Vacuumed", &block)
  end
end
232
+
233
# A single schema delta file.
class SchemaTarget
  include Loggable

  attr_reader :path, :timestamp

  def initialize(path, timestamp)
    @path = path
    @timestamp = timestamp
  end

  # Hands the file path to processor.load, wrapped in apply log lines.
  # client is accepted but not used here.
  def apply(logger, client, processor)
    apply_log(logger, @path) { processor.load(@path) }
  end

  # Deletes the file, wrapped in vacuum log lines.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end
end
255
+
256
# A packed schema directory together with the targets collected from it.
# The targets list starts empty and is filled by whoever builds the object.
class PackedSchemaTarget
  include Loggable

  attr_reader :path, :timestamp, :targets

  def initialize(path, timestamp)
    @path = path
    @timestamp = timestamp
    @targets = []
  end

  # Applies every contained target in ascending timestamp order, wrapped in
  # apply log lines for the packed path.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      targets_by_timestamp.each do |target|
        target.apply(logger, client, processor)
      end
    end
  end

  # Vacuums every contained target in ascending timestamp order, wrapped in
  # vacuum log lines for the packed path.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      targets_by_timestamp.each do |target|
        target.vacuum(logger)
      end
    end
  end

  private

  # Contained targets as a new array sorted by timestamp.
  def targets_by_timestamp
    @targets.sort_by(&:timestamp)
  end
end
284
+
285
# A single table delta file (".grn" commands or another format that is read
# through Arrow::Table.load).
class TableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :action

  def initialize(path, timestamp, name, action)
    @path = path
    @timestamp = timestamp
    @name = name
    @action = action
  end

  # Applies the file, wrapped in apply log lines.
  #
  # Paths ending with ".grn" are handed to processor.load. Any other path is
  # read as an Arrow table and sent through client.load as a load command
  # for table @name; the response is passed to processor.process_response.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      next processor.load(@path) if @path.end_with?(".grn")

      # TODO: Add support for @action == "delete"
      arrow_table = Arrow::Table.load(@path)
      load_command = Groonga::Command::Load.new(table: @name,
                                                values: arrow_table,
                                                command_version: "3")
      response = client.load(load_command.arguments)
      processor.process_response(response, load_command)
    end
  end

  # Deletes the file, wrapped in vacuum log lines.
  def vacuum(logger)
    vacuum_log(logger, @path) { FileUtils.rm(@path) }
  end
end
321
+
322
# A packed directory of one table together with the targets collected from
# it. The targets list starts empty and is filled by whoever builds the
# object.
class PackedTableTarget
  include Loggable

  attr_reader :path, :timestamp, :name, :targets

  def initialize(path, timestamp, name)
    @path = path
    @timestamp = timestamp
    @name = name
    @targets = []
  end

  # Applies every contained target in ascending timestamp order, wrapped in
  # apply log lines for the packed path.
  def apply(logger, client, processor)
    apply_log(logger, @path) do
      targets_by_timestamp.each do |target|
        target.apply(logger, client, processor)
      end
    end
  end

  # Vacuums every contained target in ascending timestamp order, wrapped in
  # vacuum log lines for the packed path.
  def vacuum(logger)
    vacuum_log(logger, @path) do
      targets_by_timestamp.each do |target|
        target.vacuum(logger)
      end
    end
  end

  private

  # Contained targets as a new array sorted by timestamp.
  def targets_by_timestamp
    @targets.sort_by(&:timestamp)
  end
end
352
+ end
353
+ end
@@ -15,15 +15,15 @@
15
15
 
16
16
  require "groonga/command/parser"
17
17
 
18
- require_relative "writer"
18
+ require_relative "local-writer"
19
19
 
20
20
  module GroongaDelta
21
21
  class LocalSource
22
- def initialize(config, status)
22
# Creates a local source.
#
# config: supplies the logger (config.logger) and the settings kept in
#   @config (config.local).
# status: its `local` part is kept in @status.
# writer: stored as-is in @writer; it is passed in rather than built here.
def initialize(config, status, writer)
  @writer = writer
  @logger = config.logger
  @config = config.local
  @status = status.local
end
28
28
 
29
29
  def import
@@ -0,0 +1,39 @@
1
+ # Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require_relative "local-reader"
19
+
20
+ module GroongaDelta
21
# Vacuums the delta targets that are older than the configured keep span.
class LocalVacuumer
  # config: supplies the logger, the delta directory and the vacuum settings
  # (config.vacuum).
  def initialize(config)
    @logger = config.logger
    @delta_dir = config.delta_dir
    @config = config.vacuum
  end

  # Calls vacuum on every target a LocalReader reports with a timestamp up
  # to "now (UTC) minus keep_span".
  #
  # Does nothing when keep_span is nil or negative.
  def vacuum
    span = @config.keep_span
    return if span.nil? || span < 0

    reader = LocalReader.new(@logger, @delta_dir)
    cutoff = Time.now.utc - span
    reader.each(nil, cutoff) { |target| target.vacuum(@logger) }
  end
end
39
+ end
@@ -20,10 +20,11 @@ require "groonga/command"
20
20
  require "parquet"
21
21
 
22
22
  module GroongaDelta
23
- class Writer
24
- def initialize(logger, dir)
25
- @logger = logger
26
- @dir = dir
23
+ class LocalWriter
24
# Creates a local writer.
#
# config: kept in @config; its logger and delta_dir are also cached in
# @logger and @dir.
def initialize(config)
  @config = config
  @logger = config.logger
  @dir = config.delta_dir
end
28
29
 
29
30
  def write_upserts(table, records, packed: false)
@@ -74,7 +75,7 @@ module GroongaDelta
74
75
  delete = Groonga::Command::Delete.new
75
76
  delete[:table] = table
76
77
  keys.each do |key|
77
- delete[:key] = key
78
+ delete[:key] = format_key(key)
78
79
  output.puts(delete.to_command_format)
79
80
  end
80
81
  end
@@ -131,5 +132,16 @@ module GroongaDelta
131
132
  open_output: open_output,
132
133
  &block)
133
134
  end
135
+
136
# Converts a record key into the form written into delete commands.
#
# Integer and Float keys become their to_s text, Time keys are formatted as
# "YYYY-MM-DD hh:mm:ss.uuuuuu" (six fractional digits), and every other key
# is returned unchanged.
def format_key(key)
  return key.to_s if key.is_a?(Integer) || key.is_a?(Float)
  return key.strftime("%Y-%m-%d %H:%M:%S.%6N") if key.is_a?(Time)

  key
end
134
146
  end
135
147
  end
@@ -142,8 +142,12 @@ module GroongaDelta
142
142
# Builds a Hash for one source record: each entry of @groonga_columns
# contributes its name (as a Symbol) mapped to the value it generates.
#
# Any StandardError raised while handling a column is re-raised as a
# GenerationError built from the source record, the column and the original
# error.
def generate_record(source_record)
  @groonga_columns.each_with_object({}) do |column, record|
    begin
      generated = column.generate_value(source_record)
      record[column.name.to_sym] = generated
    rescue => error
      raise GenerationError.new(source_record, column, error)
    end
  end
end
@@ -255,7 +259,16 @@ module GroongaDelta
255
259
  end
256
260
 
257
261
  def normalize_value(value)
258
- case type
262
+ case @type
263
+ when nil, "ShortText", "Text", "LongText"
264
+ encoding = value.encoding
265
+ if encoding == Encoding::ASCII_8BIT
266
+ value.force_encoding(Encoding::UTF_8)
267
+ return value if value.valid_encoding?
268
+ value.encode(Encoding::UTF_8, encoding)
269
+ else
270
+ value.encode(Encoding::UTF_8)
271
+ end
259
272
  when "Time"
260
273
  time_max = @restriction.time_max
261
274
  time_min = @restriction.time_min