groonga-delta 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,134 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "groonga/command/parser"
17
+
18
+ require_relative "writer"
19
+
20
+ module GroongaDelta
21
# Imports numbered "*.grn" Groonga command files from a local directory
# and hands the parsed commands to a Writer.
class LocalSource
  # @param config configuration object; this class reads `logger`,
  #   `delta_dir` and `local` (which must respond to `dir` and
  #   `initial_max_number`) from it.
  # @param status status object; `status.local` must respond to
  #   `number` and `update`.
  def initialize(config, status)
    @logger = config.logger
    @writer = Writer.new(@logger, config.delta_dir)
    @config = config.local
    @status = status.local
  end

  # Imports every "<number>*.grn" file whose leading number is greater
  # than the last imported number, in ascending numeric order. The
  # imported number is recorded in the status after each file.
  #
  # When nothing has been imported yet, the import stops at the first
  # file whose number exceeds `initial_max_number`.
  def import
    latest_number = @status.number || -1
    targets = []
    Dir.glob("#{@config.dir}/*.grn") do |path|
      case File.basename(path)
      when /\A\d+/
        number = Integer(Regexp.last_match[0], 10)
        next if number <= latest_number
        targets << [number, path]
      else
        # Files that don't start with a number aren't import targets.
        next
      end
    end
    targets.sort_by! {|number, _path| number}
    parser = create_command_parser
    targets.each do |number, path|
      if latest_number == -1 and number > @config.initial_max_number
        @logger.info("Stopped initial import")
        break
      end
      @logger.info("Start importing: #{path}")
      File.open(path) do |input|
        last_line = nil
        input.each_line do |line|
          last_line = line
          parser << line
        end
        # Terminate the last line when the file lacks a trailing
        # newline. The block-local `line` is out of scope here, so feed
        # the missing newline itself.
        if last_line and not last_line.end_with?("\n")
          parser << "\n"
        end
      end
      @logger.info("Imported: #{path}")
      @status.update("number" => number)
    end
  end

  private
  # Builds a streaming command parser whose callbacks forward commands
  # to the writer. Streamed "load" values are written in chunks of
  # `split_load_chunk_size` records.
  def create_command_parser
    parser = Groonga::Command::Parser.new

    parser.on_command do |command|
      write_command(command)
    end

    parser.on_load_columns do |command, columns|
      command[:columns] ||= columns.join(",")
    end

    split_load_chunk_size = 10000
    load_values = []
    parser.on_load_value do |command, value|
      # Values are buffered only when the command has no "values"
      # argument of its own.
      unless command[:values]
        load_values << value
        if load_values.size == split_load_chunk_size
          write_load_command(command, load_values)
          load_values.clear
        end
      end
      command.original_source.clear
    end

    parser.on_load_complete do |command|
      if command[:values]
        write_load_command(command)
      else
        unless load_values.empty?
          write_load_command(command, load_values)
          load_values.clear
        end
      end
    end

    parser
  end

  # Writes a non-load command: "delete" by key becomes a delete delta,
  # every other command is written as schema.
  #
  # @raise [NotImplementedError] for "delete" without a key.
  def write_command(command)
    case command.command_name
    when "delete"
      if command[:key]
        @writer.write_deletes(command[:table], [command[:key]])
      else
        raise NotImplementedError,
              "delete by not _key isn't supported yet: #{command.to_s}"
      end
    else
      @writer.write_schema(command)
    end
  end

  # Writes the records of a load command as upserts.
  #
  # @param command load command; `table`, `columns` and `values` are read.
  # @param values records to write; defaults to `command.values`.
  def write_load_command(command, values=nil)
    columns = command.columns
    values ||= command.values
    if columns
      # Keep a reference to the real records: `values` is rebound to
      # the enumerator below, so iterating `values` inside the block
      # would make the enumerator recurse into itself.
      original_values = values
      values = Enumerator.new do |yielder|
        yielder << columns
        original_values.each do |value|
          yielder << value
        end
      end
    end
    @writer.write_upserts(command.table, values)
  end
end
134
+ end
@@ -0,0 +1,50 @@
1
+ # Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ module GroongaDelta
17
# Logger formatter that emits one LTSV (tab separated "label:value")
# line per message line.
class LTSVLogFormatter
  # Formats a log entry.
  #
  # @param severity severity label written to the "severity" field.
  # @param time [Time] written to the "timestamp" field with nanoseconds.
  # @param program_name unused.
  # @param message a String is written as is, an Exception is written
  #   as "Class: message" followed by one line per backtrace entry,
  #   anything else is written as its `inspect` form.
  # @return [String] the formatted lines, each terminated by "\n".
  def call(severity, time, program_name, message)
    prefix = "timestamp:%{timestamp}\tseverity:%{severity}\tpid:%{pid}" % {
      severity: severity,
      timestamp: time.strftime("%Y-%m-%dT%H:%M:%S.%N"),
      pid: Process.pid,
    }
    # Unary plus keeps the buffer appendable even when string literals
    # are frozen.
    formatted = +""
    backtrace = nil
    case message
    when String
    when Exception
      backtrace = message.backtrace
      message = "#{message.class}: #{message}"
    else
      message = message.inspect
    end
    message.each_line(chomp: true) do |line|
      formatted << "#{prefix}\tmessage:#{escape_value(line)}\n"
    end
    if backtrace
      backtrace.each do |trace|
        formatted << "#{prefix}\tmessage:#{escape_value(trace)}\n"
      end
    end
    formatted
  end

  private
  # Replaces characters that would break the LTSV line structure
  # (tab, CR, LF) with a space.
  def escape_value(value)
    value.gsub(/[\t\r\n]/, " ")
  end
end
50
+ end
@@ -0,0 +1,314 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
require "time"

require_relative "error"
17
+
18
+ module GroongaDelta
19
+ class Mapping
20
# Stores the raw mapping definition and builds the source database and
# source table lookup structures from it.
#
# @param data mapping definition; iterated as
#   (Groonga table name, details) pairs.
def initialize(data)
  @data = data
  build_source_databases
end
24
+
25
# @return [Array] every source database built from the mapping data.
def source_databases
  @source_databases
end
28
+
29
# Looks up a source database, or a source table when a table name is
# given.
#
# @param source_database_name name of the source database.
# @param source_table_name name of the source table, or nil to look up
#   the database itself.
# @return the matching entry, or nil when the index has none.
def [](source_database_name, source_table_name=nil)
  return @source_databases_index[source_database_name] if source_table_name.nil?

  @source_tables_index[[source_database_name, source_table_name]]
end
36
+
37
+ private
38
# Converts the raw mapping data into SourceDatabase/SourceTable objects
# and fills the indexes used by the lookup method.
#
# A column definition is either a template string or a Hash with
# "template", "expression", "type" and "source_column_names" entries.
# Every "%{name}" placeholder of a template is recorded as a source
# column name (as a Symbol).
def build_source_databases
  tables_by_database = {}
  @data.each do |groonga_table_name, details|
    restriction = Restriction.new(details["restriction"])
    sources = details["sources"] || []
    sources.each do |source|
      groonga_columns = []
      source_column_names = []
      source["columns"].each do |name, template|
        expression = nil
        type = nil
        if template.is_a?(Hash)
          options = template
          template = options["template"]
          expression = options["expression"]
          type = options["type"]
          source_column_names.concat(options["source_column_names"] || [])
        end
        groonga_columns <<
          GroongaColumn.new(name, template, expression, type, restriction)
        next unless template
        template.scan(/%{(.*?)}/).flatten.each do |source_column_name|
          source_column_names << source_column_name.to_sym
        end
      end
      source_column_names.uniq!
      groonga_table = GroongaTable.new(groonga_table_name, groonga_columns)
      source_table = SourceTable.new(source["table"],
                                     source_column_names,
                                     source["filter"],
                                     groonga_table)
      (tables_by_database[source["database"]] ||= []) << source_table
    end
  end

  @source_databases = []
  @source_databases_index = {}
  @source_tables_index = {}
  tables_by_database.each do |database_name, source_tables|
    source_database = SourceDatabase.new(database_name, source_tables)
    @source_databases << source_database
    @source_databases_index[source_database.name] = source_database
    source_tables.each do |source_table|
      table_key = [source_database.name, source_table.name]
      @source_tables_index[table_key] = source_table
    end
  end
end
93
+
94
# Value limits read from the "restriction" entry of a table mapping.
class Restriction
  # @return [Time, nil] upper time limit in local time, if configured.
  attr_reader :time_max
  # @return [Time, nil] lower time limit in local time, if configured.
  attr_reader :time_min

  # @param data [Hash, nil] restriction definition; "time" => "max" and
  #   "time" => "min" are read.
  def initialize(data)
    @data = data
    @time_max = time_value("time", "max")
    @time_min = time_value("time", "min")
  end

  private
  # Reads the value at +keys+ and returns it as a local Time.
  #
  # @return [Time, nil] nil when no restriction data or no value exists.
  def time_value(*keys)
    return nil if @data.nil?
    value = @data.dig(*keys)
    return nil if value.nil?
    # A value that is already a Time is accepted as is. getlocal
    # returns a new object, so the caller's Time isn't converted in
    # place.
    return value.getlocal if value.is_a?(Time)
    Time.parse(value).localtime
  end
end
111
+
112
# A source database together with the source tables mapped from it.
class SourceDatabase
  attr_reader :name, :source_tables

  # @param name name of the source database.
  # @param source_tables source tables that belong to this database.
  def initialize(name, source_tables)
    @source_tables = source_tables
    @name = name
  end
end
120
+
121
# A source table and the Groonga table its records are mapped to.
class SourceTable
  attr_reader :name, :source_column_names, :source_filter, :groonga_table

  # @param name name of the source table.
  # @param source_column_names names of the source columns that are read.
  # @param source_filter filter taken from the mapping definition.
  # @param groonga_table destination Groonga table.
  def initialize(name, source_column_names, source_filter, groonga_table)
    @name = name
    @groonga_table = groonga_table
    @source_filter = source_filter
    @source_column_names = source_column_names
  end
end
133
+
134
# A destination Groonga table and the columns generated for it.
class GroongaTable
  attr_reader :name, :groonga_columns

  # @param name name of the Groonga table.
  # @param groonga_columns columns; each must respond to `name`,
  #   `generate_value` and `arrow_type`.
  def initialize(name, groonga_columns)
    @name = name
    @groonga_columns = groonga_columns
  end

  # Generates one Groonga record from a source record.
  #
  # @return [Hash] column name (Symbol) => generated value.
  def generate_record(source_record)
    @groonga_columns.each_with_object({}) do |column, generated|
      column_value = column.generate_value(source_record)
      generated[column.name.to_sym] = column_value
    end
  end

  # Generates an Arrow record batch from source records.
  #
  # @param source_records enumerable of source records.
  # @return the result of flushing an Arrow::RecordBatchBuilder that
  #   received every generated record.
  def generate_record_batch(source_records)
    fields = @groonga_columns.collect do |column|
      {name: column.name, data_type: column.arrow_type}
    end
    # Records are generated while the builder consumes the enumerator.
    generated_records = Enumerator.new do |yielder|
      source_records.each do |source_record|
        yielder << generate_record(source_record)
      end
    end
    batch_builder = Arrow::RecordBatchBuilder.new(fields)
    batch_builder.append_records(generated_records)
    batch_builder.flush
  end
end
168
+
169
# A Groonga column whose value is generated from a source record by a
# format template or by an expression.
class GroongaColumn
  attr_reader :name
  attr_reader :template
  attr_reader :expression
  attr_reader :type
  attr_reader :restriction

  # @param name column name.
  # @param template [String, nil] format template such as "%{title}".
  # @param expression [String, nil] expression evaluated when no
  #   template is given.
  # @param type [String, nil] Groonga type name.
  # @param restriction object responding to `time_max` and `time_min`.
  def initialize(name, template, expression, type, restriction)
    @name = name
    @template = template
    @expression = expression
    @type = type
    @restriction = restriction
  end

  # Generates the column value for a source record.
  #
  # Template results are cast according to the column type; expression
  # results are used as evaluated. Time values are then limited by the
  # restriction.
  def generate_value(source_record)
    if @template
      value = cast(@template % source_record)
    else
      evaluator = ExpressionEvaluator.new(source_record)
      value = evaluator.evaluate(@expression)
    end
    normalize_value(value)
  end

  # @return the Arrow data type for this column; type names without a
  #   dedicated mapping are returned unchanged.
  def arrow_type
    case @type
    when nil, "ShortText", "Text", "LongText"
      :string
    when "Bool"
      :boolean
    when "Time"
      Arrow::TimestampDataType.new(:nano)
    else
      @type
    end
  end

  private
  # Casts a formatted template string to the column type.
  #
  # Empty strings become 0, 0.0 or false for numeric and Bool types.
  # Time strings that match neither supported format are returned as is.
  #
  # @raise [ConfigError] for an unknown type.
  def cast(value)
    case @type
    when nil, "ShortText", "Text", "LongText"
      value
    when /\AU?Int(?:8|16|32|64)\z/
      return 0 if value.empty?
      Integer(value, 10)
    when "Float"
      return 0.0 if value.empty?
      Float(value)
    when "Bool"
      return false if value.empty?
      case value
      when "0"
        false
      else
        true
      end
    when "Time"
      case value
      when /\A(\d{4})-(\d{2})-(\d{2})\z/
        match = Regexp.last_match
        year = Integer(match[1], 10)
        month = Integer(match[2], 10)
        day = Integer(match[3], 10)
        Time.new(year, month, day)
      # "YYYY-MM-DD HH:MM:SS +ZZZZ". The separating spaces are written
      # as "[ ]" so they survive extended mode and trailing whitespace
      # stripping.
      when /\A(\d{4})-(\d{2})-(\d{2})[ ]
              (\d{2}):(\d{2}):(\d{2})[ ]
              ([+-])(\d{2})(\d{2})\z/x
        match = Regexp.last_match
        year = Integer(match[1], 10)
        month = Integer(match[2], 10)
        day = Integer(match[3], 10)
        hour = Integer(match[4], 10)
        minute = Integer(match[5], 10)
        second = Integer(match[6], 10)
        timezone_sign = match[7]
        timezone_hour = match[8]
        timezone_minute = match[9]
        timezone = "#{timezone_sign}#{timezone_hour}:#{timezone_minute}"
        time = Time.new(year, month, day, hour, minute, second, timezone)
        time.utc.localtime
      else
        value
      end
    else
      raise ConfigError, "Unknown type: #{@type}: #{value.inspect}"
    end
  end

  # Limits Time values to the restriction's range; values of other
  # column types are returned unchanged.
  def normalize_value(value)
    case type
    when "Time"
      time_max = @restriction.time_max
      time_min = @restriction.time_min
      if !value.is_a?(Time) and value.respond_to?(:to_time)
        value = value.to_time
      end
      return value if time_max.nil? and time_min.nil?
      return value unless value.is_a?(Time)
      if time_max and value >= time_max
        time_max
      elsif time_min and value <= time_min
        time_min
      else
        value
      end
    else
      value
    end
  end
end
279
+
280
# Evaluates a mapping expression with the fields of one source record
# available as methods.
class ExpressionEvaluator
  # Evaluation context. It inherits from BasicObject, so expressions
  # see the helpers below and the record fields rather than the
  # Object/Kernel instance methods.
  class Context < BasicObject
    # Removes everything that looks like a tag ("<...>") from +text+.
    def html_untag(text)
      text.gsub(/<.*?>/, "")
    end

    # Escapes +text+ for use as a Groonga query.
    #
    # Text containing a special character is wrapped in double quotes,
    # with embedded double quotes and backslashes escaped by a
    # backslash. A backslash left unescaped would escape the following
    # character, including the closing quote. "OR" is quoted so it
    # isn't treated as an operator; other text is returned unchanged.
    def groonga_escape_query(text)
      case text
      when /[+\-><~*()"\\: ]/
        "\"#{text.gsub(/["\\]/) {|special| "\\#{special}"}}\""
      when "OR"
        "\"OR\""
      else
        text
      end
    end
  end

  # @param source_record enumerable of (field name, value) pairs; every
  #   field becomes a method of the evaluation context.
  def initialize(source_record)
    @context = Context.new
    # BasicObject has no #singleton_class, so borrow Kernel's.
    context_singleton_class =
      Kernel.instance_method(:singleton_class).bind(@context).call
    source_record.each do |key, value|
      context_singleton_class.define_method(key) do
        value
      end
    end
  end

  # Evaluates +expression+ (Ruby source) in the context and returns the
  # result.
  #
  # NOTE(review): this executes arbitrary Ruby code. It assumes
  # expressions only come from a trusted mapping definition - confirm.
  def evaluate(expression)
    @context.instance_eval(expression, __FILE__, __LINE__)
  end
end
313
+ end
314
+ end