groonga-delta 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +26 -0
- data/LICENSE.txt +674 -0
- data/README.md +43 -0
- data/Rakefile +36 -0
- data/bin/groonga-delta-apply +21 -0
- data/bin/groonga-delta-import +21 -0
- data/doc/text/news.md +5 -0
- data/groonga-delta.gemspec +54 -0
- data/lib/groonga-delta/apply-command.rb +38 -0
- data/lib/groonga-delta/apply-config.rb +78 -0
- data/lib/groonga-delta/apply-status.rb +24 -0
- data/lib/groonga-delta/command.rb +75 -0
- data/lib/groonga-delta/config.rb +99 -0
- data/lib/groonga-delta/error.rb +28 -0
- data/lib/groonga-delta/import-command.rb +43 -0
- data/lib/groonga-delta/import-config.rb +168 -0
- data/lib/groonga-delta/import-status.rb +68 -0
- data/lib/groonga-delta/local-delta.rb +386 -0
- data/lib/groonga-delta/local-source.rb +134 -0
- data/lib/groonga-delta/ltsv-log-formatter.rb +50 -0
- data/lib/groonga-delta/mapping.rb +314 -0
- data/lib/groonga-delta/mysql-source.rb +391 -0
- data/lib/groonga-delta/status.rb +43 -0
- data/lib/groonga-delta/version.rb +18 -0
- data/lib/groonga-delta/writer.rb +135 -0
- data/lib/groonga-delta.rb +18 -0
- metadata +114 -0
@@ -0,0 +1,134 @@
|
|
1
|
+
# Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "groonga/command/parser"
|
17
|
+
|
18
|
+
require_relative "writer"
|
19
|
+
|
20
|
+
module GroongaDelta
|
21
|
+
class LocalSource
|
22
|
+
# Sets up a local source importer.
#
# config must provide #logger, #delta_dir and #local; status must provide
# #local. The logger is shared with the Writer that emits delta files into
# config.delta_dir.
def initialize(config, status)
  logger = config.logger
  @logger = logger
  @writer = Writer.new(logger, config.delta_dir)
  @config, @status = config.local, status.local
end
|
28
|
+
|
29
|
+
# Imports the numbered "*.grn" files found directly under @config.dir.
#
# Only files whose base name starts with digits are considered; the leading
# digits are the file's number. Files whose number is not greater than the
# number recorded in @status are skipped, and the rest are fed to the
# command parser line by line in ascending number order. After each file
# the status is updated with that file's number.
#
# When no number has been recorded yet (initial import), importing stops at
# the first file whose number exceeds @config.initial_max_number.
def import
  latest_number = @status.number || -1
  targets = []
  Dir.glob("#{@config.dir}/*.grn") do |path|
    leading_digits = /\A\d+/.match(File.basename(path))
    next if leading_digits.nil?
    number = Integer(leading_digits[0], 10)
    next if number <= latest_number
    targets << [number, path]
  end
  targets.sort_by! {|number, _path| number}
  parser = create_command_parser
  targets.each do |number, path|
    if latest_number == -1 and number > @config.initial_max_number
      @logger.info("Stopped initial import")
      break
    end
    @logger.info("Start importing: #{path}")
    File.open(path) do |input|
      last_line = nil
      input.each_line do |line|
        last_line = line
        parser << line
      end
      # A file that doesn't end with a newline leaves its last line
      # unterminated; terminate it explicitly. (The block parameter "line"
      # isn't visible here, so referencing it would raise NameError.)
      if last_line and not last_line.end_with?("\n")
        parser << "\n"
      end
    end
    @logger.info("Imported: #{path}")
    @status.update("number" => number)
  end
end
|
65
|
+
|
66
|
+
private
|
67
|
+
# Builds a Groonga command parser whose callbacks forward parsed commands
# to #write_command and #write_load_command.
#
# Values of a load command that has no inline "values" argument are
# buffered and written in chunks of 10000 records; the remainder is written
# when the load command completes.
def create_command_parser
  command_parser = Groonga::Command::Parser.new

  command_parser.on_command {|command| write_command(command)}

  command_parser.on_load_columns do |command, columns|
    command[:columns] ||= columns.join(",")
  end

  chunk_size = 10000
  pending_values = []
  flush_pending = lambda do |command|
    write_load_command(command, pending_values)
    pending_values.clear
  end

  command_parser.on_load_value do |command, value|
    if not command[:values]
      pending_values << value
      flush_pending.call(command) if pending_values.size == chunk_size
    end
    # Drop the accumulated source text of the command after every value.
    command.original_source.clear
  end

  command_parser.on_load_complete do |command|
    if command[:values]
      write_load_command(command)
    elsif not pending_values.empty?
      flush_pending.call(command)
    end
  end

  command_parser
end
|
104
|
+
|
105
|
+
# Writes a parsed non-load command through the writer.
#
# A "delete" command with a key becomes a one-key delete for its table; a
# "delete" without a key raises NotImplementedError. Every other command is
# written as schema.
def write_command(command)
  unless "delete" == command.command_name
    return @writer.write_schema(command)
  end

  unless command[:key]
    raise NotImplementedError,
          "delete by not _key isn't supported yet: #{command.to_s}"
  end
  @writer.write_deletes(command[:table], [command[:key]])
end
|
118
|
+
|
119
|
+
# Writes the records of a load command as upserts for command.table.
#
# values defaults to command.values. When the command has an explicit
# column list, the records are wrapped in an Enumerator that yields the
# column list first and then each record.
def write_load_command(command, values=nil)
  columns = command.columns
  values ||= command.values
  if columns
    # Capture the records under a separate name: the Enumerator block runs
    # lazily, after "values" has been rebound to the Enumerator itself, so
    # iterating "values" inside the block would recurse endlessly.
    records = values
    values = Enumerator.new do |yielder|
      yielder << columns
      records.each do |record|
        yielder << record
      end
    end
  end
  # NOTE(review): the Enumerator is lazy and callers in this file clear the
  # passed array right after this call, so the writer presumably consumes
  # the values before returning -- confirm against Writer#write_upserts.
  @writer.write_upserts(command.table, values)
end
|
133
|
+
end
|
134
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
module GroongaDelta
|
17
|
+
# Logger formatter that renders log entries as tab-separated
# "label:value" fields (timestamp, severity, pid, message).
class LTSVLogFormatter
  # Formats one log entry.
  #
  # A String message is used as is, an Exception is rendered as
  # "Class: message" followed by its backtrace (when it has one), and
  # anything else is rendered with #inspect. Every line of the message and
  # every backtrace entry becomes its own output line, each carrying the
  # same timestamp/severity/pid prefix. program_name is not used.
  #
  # Returns the formatted text; it is empty when the message has no lines.
  def call(severity, time, program_name, message)
    prefix = format("timestamp:%{timestamp}\tseverity:%{severity}\tpid:%{pid}",
                    severity: severity,
                    timestamp: time.strftime("%Y-%m-%dT%H:%M:%S.%N"),
                    pid: Process.pid)
    backtrace = nil
    text =
      if String === message
        message
      elsif Exception === message
        backtrace = message.backtrace
        "#{message.class}: #{message}"
      else
        message.inspect
      end
    entries = text.each_line(chomp: true).to_a
    entries.concat(backtrace) if backtrace
    entries.each_with_object(+"") do |entry, formatted|
      formatted << "#{prefix}\tmessage:#{escape_value(entry)}\n"
    end
  end

  private
  # Replaces tabs, carriage returns and newlines with spaces so that a
  # value can't break the one-entry-per-line, tab-separated layout.
  def escape_value(value)
    value.gsub(/[\t\r\n]/, " ")
  end
end
|
50
|
+
end
|
@@ -0,0 +1,314 @@
|
|
1
|
+
# Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "time"

require_relative "error"
|
17
|
+
|
18
|
+
module GroongaDelta
|
19
|
+
class Mapping
|
20
|
+
# Keeps the raw mapping data (a Hash keyed by Groonga table name) and
# immediately builds the source database list and lookup indexes from it.
def initialize(data)
  @data = data
  build_source_databases
end
|
24
|
+
|
25
|
+
# Returns the list of SourceDatabase objects built from the mapping data.
def source_databases
  @source_databases
end
|
28
|
+
|
29
|
+
# Looks up mapping entries by source name.
#
# With only a database name, returns the entry of the source database
# index for that name. With a database name and a table name, returns the
# entry of the source table index for that pair. Unknown names give nil.
def [](source_database_name, source_table_name=nil)
  unless source_table_name.nil?
    table_key = [source_database_name, source_table_name]
    return @source_tables_index[table_key]
  end
  @source_databases_index[source_database_name]
end
|
36
|
+
|
37
|
+
private
|
38
|
+
# Converts the raw mapping data into SourceDatabase/SourceTable objects and
# fills @source_databases, @source_databases_index (by database name) and
# @source_tables_index (by [database name, table name]).
#
# Each entry of @data maps a Groonga table name to details with optional
# "restriction" and "sources". A source names its "database", "table",
# optional "filter" and its "columns"; a column definition is either a
# template string or a Hash with "template", "expression", "type" and
# "source_column_names".
def build_source_databases
  tables_by_database = {}
  @data.each do |groonga_table_name, details|
    restriction = Restriction.new(details["restriction"])
    sources = details["sources"] || []
    sources.each do |source|
      groonga_columns = []
      source_column_names = []
      source["columns"].each do |name, definition|
        template = definition
        expression = nil
        type = nil
        if definition.is_a?(Hash)
          template = definition["template"]
          expression = definition["expression"]
          type = definition["type"]
          # Explicitly listed names are kept as given; names extracted from
          # a template below are converted to symbols.
          source_column_names.concat(definition["source_column_names"] || [])
        end
        groonga_columns << GroongaColumn.new(name,
                                             template,
                                             expression,
                                             type,
                                             restriction)
        next unless template
        # Every %{...} placeholder of the template is a source column.
        template.scan(/%{(.*?)}/) do |(source_column_name)|
          source_column_names << source_column_name.to_sym
        end
      end
      source_column_names.uniq!
      groonga_table = GroongaTable.new(groonga_table_name, groonga_columns)
      source_table = SourceTable.new(source["table"],
                                     source_column_names,
                                     source["filter"],
                                     groonga_table)
      (tables_by_database[source["database"]] ||= []) << source_table
    end
  end

  @source_databases = []
  @source_databases_index = {}
  @source_tables_index = {}
  tables_by_database.each do |database_name, source_tables|
    source_database = SourceDatabase.new(database_name, source_tables)
    @source_databases << source_database
    @source_databases_index[source_database.name] = source_database
    source_tables.each do |source_table|
      table_key = [source_database.name, source_table.name]
      @source_tables_index[table_key] = source_table
    end
  end
end
|
93
|
+
|
94
|
+
# Value restrictions of a Groonga table, read from the "restriction"
# section of its mapping. Only the "time" bounds ("max"/"min") are read.
class Restriction
  # Upper and lower time bounds as local Time objects; nil when not set.
  attr_reader :time_max
  attr_reader :time_min

  # data is the raw "restriction" Hash, or nil when nothing is configured.
  def initialize(data)
    @data = data
    @time_max = time_value("time", "max")
    @time_min = time_value("time", "min")
  end

  private
  # Digs the value at keys out of the raw data and converts it to a Time in
  # the local time zone. Returns nil when there is no data or no value.
  #
  # Strings are parsed with Time.parse (which needs the "time" library).
  # Values that already are time-like (they respond to #to_time, e.g. a
  # Time produced by the configuration loader) are converted without
  # parsing and without mutating the given object.
  def time_value(*keys)
    return nil if @data.nil?
    value = @data.dig(*keys)
    return nil if value.nil?
    if !value.is_a?(String) and value.respond_to?(:to_time)
      value.to_time.getlocal
    else
      Time.parse(value).localtime
    end
  end
end
|
111
|
+
|
112
|
+
# A source database: its name and the SourceTable objects mapped from it.
class SourceDatabase
  attr_reader :name, :source_tables

  def initialize(name, source_tables)
    @name, @source_tables = name, source_tables
  end
end
|
120
|
+
|
121
|
+
# A source table: its name, the source columns that are read from it, an
# optional source filter and the Groonga table its rows are mapped to.
class SourceTable
  attr_reader :name, :source_column_names, :source_filter, :groonga_table

  def initialize(name, source_column_names, source_filter, groonga_table)
    @name = name
    @source_column_names = source_column_names
    @source_filter = source_filter
    @groonga_table = groonga_table
  end
end
|
133
|
+
|
134
|
+
# A destination Groonga table: its name and its GroongaColumn definitions.
class GroongaTable
  attr_reader :name, :groonga_columns

  def initialize(name, groonga_columns)
    @name = name
    @groonga_columns = groonga_columns
  end

  # Builds one Groonga record from a source record: a Hash that maps each
  # column name (as a Symbol) to the value generated by that column.
  def generate_record(source_record)
    @groonga_columns.each_with_object({}) do |groonga_column, record|
      value = groonga_column.generate_value(source_record)
      record[groonga_column.name.to_sym] = value
    end
  end

  # Builds an Arrow record batch from source records. The batch has one
  # field per Groonga column (name and Arrow type) and one record per
  # source record, generated lazily through #generate_record.
  def generate_record_batch(source_records)
    fields = @groonga_columns.map do |groonga_column|
      {name: groonga_column.name, data_type: groonga_column.arrow_type}
    end
    builder = Arrow::RecordBatchBuilder.new(fields)
    groonga_records = Enumerator.new do |yielder|
      source_records.each do |source_record|
        yielder << generate_record(source_record)
      end
    end
    builder.append_records(groonga_records)
    builder.flush
  end
end
|
168
|
+
|
169
|
+
# A destination Groonga column. Its value is produced either from a format
# template ("%{source_column}" placeholders) or from an expression
# evaluated against the source record, and is typed by a Groonga type name.
class GroongaColumn
  attr_reader :name, :template, :expression, :type, :restriction

  def initialize(name, template, expression, type, restriction)
    @name = name
    @template = template
    @expression = expression
    @type = type
    @restriction = restriction
  end

  # Generates the column value for a source record.
  #
  # With a template, the template is filled from the record and the
  # resulting text is cast to the column type. Without one, the expression
  # is evaluated and its result is used without casting. In both cases
  # "Time" values are then limited by the restriction's time bounds.
  def generate_value(source_record)
    value =
      if @template
        cast(@template % source_record)
      else
        ExpressionEvaluator.new(source_record).evaluate(@expression)
      end
    normalize_value(value)
  end

  # Returns the Arrow data type for the column type. Text types (and no
  # type) map to :string, "Bool" to :boolean, "Time" to a nanosecond
  # timestamp type; any other type name is returned unchanged.
  def arrow_type
    case @type
    when nil, "ShortText", "Text", "LongText"
      :string
    when "Bool"
      :boolean
    when "Time"
      Arrow::TimestampDataType.new(:nano)
    else
      @type
    end
  end

  private
  # Casts template output (a String) to the Ruby value for the column type.
  # Empty text becomes 0, 0.0 or false for integer, Float and Bool columns.
  # Raises ConfigError for an unknown type.
  def cast(value)
    case @type
    when nil, "ShortText", "Text", "LongText"
      value
    when /\AU?Int(?:8|16|32|64)\z/
      value.empty? ? 0 : Integer(value, 10)
    when "Float"
      value.empty? ? 0.0 : Float(value)
    when "Bool"
      # Everything except empty text and "0" is true.
      not (value.empty? or "0" == value)
    when "Time"
      cast_time(value)
    else
      raise ConfigError, "Unknown type: #{@type}: #{value.inspect}"
    end
  end

  # Converts "YYYY-MM-DD" (local midnight) and "YYYY-MM-DD HH:MM:SS +ZZZZ"
  # text to a local Time. Any other text is returned unchanged.
  def cast_time(value)
    case value
    when /\A(\d{4})-(\d{2})-(\d{2})\z/
      year, month, day = Regexp.last_match.captures.collect do |digits|
        Integer(digits, 10)
      end
      Time.new(year, month, day)
    # Kept on one line with literal spaces so that matching the separators
    # doesn't depend on escaped trailing whitespace in a multi-line pattern.
    when /\A(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) ([+-])(\d{2})(\d{2})\z/
      *numbers, sign, zone_hour, zone_minute = Regexp.last_match.captures
      year, month, day, hour, minute, second = numbers.collect do |digits|
        Integer(digits, 10)
      end
      timezone = "#{sign}#{zone_hour}:#{zone_minute}"
      time = Time.new(year, month, day, hour, minute, second, timezone)
      time.utc.localtime
    else
      value
    end
  end

  # For "Time" columns, converts time-like values to Time and clamps them
  # to the restriction's bounds (a value at or beyond a bound becomes the
  # bound itself). Values of other columns, and values that aren't times,
  # are returned unchanged.
  def normalize_value(value)
    return value unless "Time" == type
    time_max = @restriction.time_max
    time_min = @restriction.time_min
    if !value.is_a?(Time) and value.respond_to?(:to_time)
      value = value.to_time
    end
    return value if time_max.nil? and time_min.nil?
    return value unless value.is_a?(Time)
    return time_max if time_max and value >= time_max
    return time_min if time_min and value <= time_min
    value
  end
end
|
279
|
+
|
280
|
+
# Evaluates a mapping expression (Ruby source text) against one source
# record. Every key of the record is available in the expression as a
# method returning that key's value, next to the helpers of Context.
class ExpressionEvaluator
  # The object expressions are evaluated in. It derives from BasicObject,
  # so only these helpers and the record's columns are defined on it.
  class Context < BasicObject
    # Removes everything between "<" and the next ">" from text.
    def html_untag(text)
      text.gsub(/<.*?>/, "")
    end

    # Makes text usable as one term of a Groonga query: text containing a
    # query operator character or a space is wrapped in double quotes with
    # inner double quotes backslash-escaped, "OR" is quoted, and any other
    # text is returned unchanged.
    # NOTE(review): backslashes inside text are not escaped themselves --
    # confirm against the Groonga query syntax.
    def groonga_escape_query(text)
      if /[+\-><~*()"\\: ]/ === text
        "\"#{text.gsub("\"", "\\\"")}\""
      elsif "OR" == text
        "\"OR\""
      else
        text
      end
    end
  end

  # Creates the evaluation context and defines one singleton method per
  # record key on it.
  def initialize(source_record)
    @context = Context.new
    # BasicObject has no #singleton_class, so borrow Kernel's.
    singleton =
      Kernel.instance_method(:singleton_class).bind(@context).call
    source_record.each do |key, value|
      singleton.define_method(key) { value }
    end
  end

  # Evaluates expression in the context and returns its result.
  # NOTE(review): this runs arbitrary Ruby code via instance_eval, so
  # expressions must only come from a trusted configuration.
  def evaluate(expression)
    @context.instance_eval(expression, __FILE__, __LINE__)
  end
end
|
313
|
+
end
|
314
|
+
end
|