groonga-delta 1.0.0 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,353 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require "groonga/command"
19
+ require "parquet"
20
+
21
+ require_relative "error"
22
+
23
+ module GroongaDelta
24
+ class LocalReader
25
+ def initialize(logger, dir)
26
+ @logger = logger
27
+ @dir = dir
28
+ end
29
+
30
+ def each(min_timestamp=nil, max_timestamp=nil, &block)
31
+ unless block_given?
32
+ return to_enum(__method__, min_timestamp, max_timestamp)
33
+ end
34
+
35
+ targets = list_targets(@dir, min_timestamp, max_timestamp)
36
+ targets.sort_by(&:timestamp).each(&block)
37
+ end
38
+
39
+ def build_time(year, month, day, hour=0, minute=0, second=0, nanosecond=0)
40
+ Time.utc(year,
41
+ month,
42
+ day,
43
+ hour,
44
+ minute,
45
+ Rational(second * 1_000_000_000 + nanosecond,
46
+ 1_000_000_000))
47
+ end
48
+
49
+ private
50
+ def each_target_path(dir,
51
+ min_timestamp,
52
+ max_timestamp,
53
+ accept_directory: true,
54
+ &block)
55
+ if min_timestamp
56
+ min_timestamp_day = Time.utc(min_timestamp.year,
57
+ min_timestamp.month,
58
+ min_timestamp.day)
59
+ end
60
+ if max_timestamp
61
+ max_timestamp_day = Time.utc(max_timestamp.year,
62
+ max_timestamp.month,
63
+ max_timestamp.day)
64
+ end
65
+ Dir.glob("#{dir}/*") do |path|
66
+ base_name = File.basename(path)
67
+ if accept_directory and File.directory?(path)
68
+ timestamp = parse_directory_timestamp(base_name)
69
+ next if timestamp.nil?
70
+ next if min_timestamp_day and timestamp < min_timestamp_day
71
+ next if max_timestamp_day and timestamp > max_timestamp_day
72
+ each_target_path(path,
73
+ min_timestamp,
74
+ max_timestamp,
75
+ accept_directory: false,
76
+ &block)
77
+ elsif File.file?(path)
78
+ timestamp, action, post_match = parse_file_timestamp(base_name)
79
+ next if timestamp.nil?
80
+ next if min_timestamp and timestamp <= min_timestamp
81
+ next if max_timestamp and timestamp > max_timestamp
82
+ yield(path, timestamp, action, post_match)
83
+ end
84
+ end
85
+ end
86
+
87
+ def each_packed_target_path(dir, min_timestamp, max_timestamp)
88
+ return unless min_timestamp.to_i.zero?
89
+ Dir.glob("#{dir}/packed/*") do |path|
90
+ next unless File.directory?(path)
91
+ timestamp, action, post_match = parse_file_timestamp(File.basename(path))
92
+ next if action
93
+ next unless post_match.empty?
94
+ yield(path, timestamp)
95
+ end
96
+ end
97
+
98
+ def list_targets(dir, min_timestamp, max_timestamp)
99
+ targets = []
100
+ list_schema_targets(dir, min_timestamp, max_timestamp, targets)
101
+ Dir.glob("#{dir}/data/*") do |path|
102
+ next unless File.directory?(path)
103
+ name = File.basename(path)
104
+ list_table_targets(path, name, min_timestamp, max_timestamp, targets)
105
+ end
106
+ targets
107
+ end
108
+
109
+ def each_schema_target(dir, min_timestamp, max_timestamp)
110
+ each_target_path(dir,
111
+ min_timestamp,
112
+ max_timestamp) do |path, timestamp, action, post_match|
113
+ next if action
114
+ next unless post_match == ".grn"
115
+ yield(SchemaTarget.new(path, timestamp))
116
+ end
117
+ end
118
+
119
+ def list_schema_targets(dir, min_timestamp, max_timestamp, targets)
120
+ latest_packed_target = nil
121
+ each_packed_target_path("#{dir}/schema",
122
+ min_timestamp,
123
+ max_timestamp) do |path, timestamp|
124
+ if latest_packed_target and latest_packed_target.timestamp > timestamp
125
+ next
126
+ end
127
+ latest_packed_target = PackedSchemaTarget.new(path, timestamp)
128
+ end
129
+ if latest_packed_target
130
+ targets << latest_packed_target
131
+ each_schema_target(latest_packed_target.path, nil, nil) do |target|
132
+ latest_packed_target.targets << target
133
+ end
134
+ end
135
+ each_schema_target("#{dir}/schema",
136
+ latest_packed_target&.timestamp || min_timestamp,
137
+ max_timestamp) do |target|
138
+ targets << target
139
+ end
140
+ end
141
+
142
+ TABLE_TARGET_SUFFIXES = [".grn", ".parquet"]
143
+ def each_table_target(dir, name, min_timestamp, max_timestamp)
144
+ each_target_path(dir,
145
+ min_timestamp,
146
+ max_timestamp) do |path, timestamp, action, post_match|
147
+ next if action.nil?
148
+ next unless TABLE_TARGET_SUFFIXES.include?(post_match)
149
+ yield(TableTarget.new(path, timestamp, name, action))
150
+ end
151
+ end
152
+
153
+ def list_table_targets(dir, name, min_timestamp, max_timestamp, targets)
154
+ latest_packed_target = nil
155
+ each_packed_target_path(dir,
156
+ min_timestamp,
157
+ max_timestamp) do |path, timestamp|
158
+ if latest_packed_target and latest_packed_target.timestamp > timestamp
159
+ next
160
+ end
161
+ latest_packed_target = PackedTableTarget.new(path, timestamp, name)
162
+ end
163
+ if latest_packed_target
164
+ targets << latest_packed_target
165
+ each_table_target(latest_packed_target.path, name, nil, nil) do |target|
166
+ latest_packed_target.targets << target
167
+ end
168
+ end
169
+ each_table_target(dir,
170
+ name,
171
+ latest_packed_target&.timestamp || min_timestamp,
172
+ max_timestamp) do |target|
173
+ targets << target
174
+ end
175
+ end
176
+
177
+ def parse_directory_timestamp(base_name)
178
+ case base_name
179
+ when /\A(\d{4})-(\d{2})-(\d{2})\z/
180
+ match = Regexp.last_match
181
+ year = match[1].to_i
182
+ month = match[2].to_i
183
+ day = match[3].to_i
184
+ build_time(year, month, day)
185
+ else
186
+ nil
187
+ end
188
+ end
189
+
190
+ def parse_file_timestamp(base_name)
191
+ case base_name
192
+ when /\A(\d{4})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{9})(?:-(\w+))?/
193
+ match = Regexp.last_match
194
+ year = match[1].to_i
195
+ month = match[2].to_i
196
+ day = match[3].to_i
197
+ hour = match[4].to_i
198
+ minute = match[5].to_i
199
+ second = match[6].to_i
200
+ nanosecond = match[7].to_i
201
+ action = match[8]
202
+ timestamp = build_time(year,
203
+ month,
204
+ day,
205
+ hour,
206
+ minute,
207
+ second,
208
+ nanosecond)
209
+ [timestamp, action, match.post_match]
210
+ else
211
+ nil
212
+ end
213
+ end
214
+
215
+ module Loggable
216
+ private
217
+ def log(logger, path, before_message, after_message)
218
+ logger.info("#{before_message}: #{path}")
219
+ result = yield
220
+ logger.info("#{after_message}: #{path}")
221
+ result
222
+ end
223
+
224
+ def apply_log(logger, path, &block)
225
+ log(logger, path, "Start applying", "Applied", &block)
226
+ end
227
+
228
+ def vacuum_log(logger, path, &block)
229
+ log(logger, path, "Start vacuuming", "Vacuumed", &block)
230
+ end
231
+ end
232
+
233
+ class SchemaTarget
234
+ include Loggable
235
+
236
+ attr_reader :path
237
+ attr_reader :timestamp
238
+ def initialize(path, timestamp)
239
+ @path = path
240
+ @timestamp = timestamp
241
+ end
242
+
243
+ def apply(logger, client, processor)
244
+ apply_log(logger, @path) do
245
+ processor.load(@path)
246
+ end
247
+ end
248
+
249
+ def vacuum(logger)
250
+ vacuum_log(logger, @path) do
251
+ FileUtils.rm(@path)
252
+ end
253
+ end
254
+ end
255
+
256
+ class PackedSchemaTarget
257
+ include Loggable
258
+
259
+ attr_reader :path
260
+ attr_reader :timestamp
261
+ attr_reader :targets
262
+ def initialize(path, timestamp)
263
+ @path = path
264
+ @timestamp = timestamp
265
+ @targets = []
266
+ end
267
+
268
+ def apply(logger, client, processor)
269
+ apply_log(logger, @path) do
270
+ @targets.sort_by(&:timestamp).each do |target|
271
+ target.apply(logger, client, processor)
272
+ end
273
+ end
274
+ end
275
+
276
+ def vacuum(logger)
277
+ vacuum_log(logger, @path) do
278
+ @targets.sort_by(&:timestamp).each do |target|
279
+ target.vacuum(logger)
280
+ end
281
+ end
282
+ end
283
+ end
284
+
285
+ class TableTarget
286
+ include Loggable
287
+
288
+ attr_reader :path
289
+ attr_reader :timestamp
290
+ attr_reader :name
291
+ attr_reader :action
292
+ def initialize(path, timestamp, name, action)
293
+ @path = path
294
+ @timestamp = timestamp
295
+ @name = name
296
+ @action = action
297
+ end
298
+
299
+ def apply(logger, client, processor)
300
+ apply_log(logger, @path) do
301
+ if @path.end_with?(".grn")
302
+ processor.load(@path)
303
+ else
304
+ # TODO: Add support for @action == "delete"
305
+ table = Arrow::Table.load(@path)
306
+ command = Groonga::Command::Load.new(table: @name,
307
+ values: table,
308
+ command_version: "3")
309
+ response = client.load(command.arguments)
310
+ processor.process_response(response, command)
311
+ end
312
+ end
313
+ end
314
+
315
+ def vacuum(logger)
316
+ vacuum_log(logger, @path) do
317
+ FileUtils.rm(@path)
318
+ end
319
+ end
320
+ end
321
+
322
+ class PackedTableTarget
323
+ include Loggable
324
+
325
+ attr_reader :path
326
+ attr_reader :timestamp
327
+ attr_reader :name
328
+ attr_reader :targets
329
+ def initialize(path, timestamp, name)
330
+ @path = path
331
+ @timestamp = timestamp
332
+ @name = name
333
+ @targets = []
334
+ end
335
+
336
+ def apply(logger, client, processor)
337
+ apply_log(logger, @path) do
338
+ @targets.sort_by(&:timestamp).each do |target|
339
+ target.apply(logger, client, processor)
340
+ end
341
+ end
342
+ end
343
+
344
+ def vacuum(logger)
345
+ vacuum_log(logger, @path) do
346
+ @targets.sort_by(&:timestamp).each do |target|
347
+ target.vacuum(logger)
348
+ end
349
+ end
350
+ end
351
+ end
352
+ end
353
+ end
@@ -15,15 +15,15 @@
15
15
 
16
16
  require "groonga/command/parser"
17
17
 
18
- require_relative "writer"
18
+ require_relative "local-writer"
19
19
 
20
20
  module GroongaDelta
21
21
  class LocalSource
22
- def initialize(config, status)
22
+ def initialize(config, status, writer)
23
23
  @logger = config.logger
24
- @writer = Writer.new(@logger, config.delta_dir)
25
24
  @config = config.local
26
25
  @status = status.local
26
+ @writer = writer
27
27
  end
28
28
 
29
29
  def import
@@ -0,0 +1,39 @@
1
+ # Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "fileutils"
17
+
18
+ require_relative "local-reader"
19
+
20
module GroongaDelta
  # Removes delta files that are older than the configured keep span.
  class LocalVacuumer
    # @param config an object that responds to #logger, #delta_dir and
    #   #vacuum (whose result responds to #keep_span).
    def initialize(config)
      @logger = config.logger
      @delta_dir = config.delta_dir
      @config = config.vacuum
    end

    # Vacuums every target whose timestamp is at most "now - keep_span".
    #
    # Does nothing when keep_span is nil or negative.
    def vacuum
      span = @config.keep_span
      return if span.nil? or span < 0

      LocalReader.new(@logger, @delta_dir).each(nil, Time.now.utc - span) do |target|
        target.vacuum(@logger)
      end
    end
  end
end
@@ -20,10 +20,11 @@ require "groonga/command"
20
20
  require "parquet"
21
21
 
22
22
  module GroongaDelta
23
- class Writer
24
- def initialize(logger, dir)
25
- @logger = logger
26
- @dir = dir
23
+ class LocalWriter
24
+ def initialize(config)
25
+ @config = config
26
+ @logger = @config.logger
27
+ @dir = @config.delta_dir
27
28
  end
28
29
 
29
30
  def write_upserts(table, records, packed: false)
@@ -74,7 +75,7 @@ module GroongaDelta
74
75
  delete = Groonga::Command::Delete.new
75
76
  delete[:table] = table
76
77
  keys.each do |key|
77
- delete[:key] = key
78
+ delete[:key] = format_key(key)
78
79
  output.puts(delete.to_command_format)
79
80
  end
80
81
  end
@@ -131,5 +132,16 @@ module GroongaDelta
131
132
  open_output: open_output,
132
133
  &block)
133
134
  end
135
+
136
# Converts a record key to the form used for the "key" argument of a
# delete command: numbers become their string representation, times become
# "YYYY-mm-dd HH:MM:SS.uuuuuu" and any other value is returned as is.
def format_key(key)
  return key.to_s if key.is_a?(Integer) || key.is_a?(Float)
  return key.strftime("%Y-%m-%d %H:%M:%S.%6N") if key.is_a?(Time)

  key
end
134
146
  end
135
147
  end
@@ -142,8 +142,12 @@ module GroongaDelta
142
142
  def generate_record(source_record)
143
143
  record = {}
144
144
  @groonga_columns.each do |groonga_column|
145
- value = groonga_column.generate_value(source_record)
146
- record[groonga_column.name.to_sym] = value
145
+ begin
146
+ value = groonga_column.generate_value(source_record)
147
+ record[groonga_column.name.to_sym] = value
148
+ rescue => error
149
+ raise GenerationError.new(source_record, groonga_column, error)
150
+ end
147
151
  end
148
152
  record
149
153
  end
@@ -255,7 +259,16 @@ module GroongaDelta
255
259
  end
256
260
 
257
261
  def normalize_value(value)
258
- case type
262
+ case @type
263
+ when nil, "ShortText", "Text", "LongText"
264
+ encoding = value.encoding
265
+ if encoding == Encoding::ASCII_8BIT
266
+ value.force_encoding(Encoding::UTF_8)
267
+ return value if value.valid_encoding?
268
+ value.encode(Encoding::UTF_8, encoding)
269
+ else
270
+ value.encode(Encoding::UTF_8)
271
+ end
259
272
  when "Time"
260
273
  time_max = @restriction.time_max
261
274
  time_min = @restriction.time_min