groonga-delta 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "groonga/command/parser"
17
+
18
+ require_relative "writer"
19
+
20
module GroongaDelta
  # Imports Groonga command files ("<number>*.grn") found in the configured
  # local directory. Every parsed command is forwarded to a Writer as a
  # schema change, a delete or an upsert, and the number of each imported
  # file is recorded in the local status.
  class LocalSource
    # config: must respond to #logger, #delta_dir and #local.
    # status: must respond to #local.
    def initialize(config, status)
      @logger = config.logger
      @writer = Writer.new(@logger, config.delta_dir)
      @config = config.local
      @status = status.local
    end

    # Imports every "*.grn" file whose leading number is greater than the
    # number stored in the status, in ascending number order.
    #
    # When the status has no number yet, the import stops at the first file
    # whose number is greater than @config.initial_max_number.
    def import
      latest_number = @status.number || -1
      targets = collect_targets(latest_number)
      parser = create_command_parser
      targets.each do |number, path|
        if latest_number == -1 and number > @config.initial_max_number
          @logger.info("Stopped initial import")
          break
        end
        @logger.info("Start importing: #{path}")
        import_file(parser, path)
        @logger.info("Imported: #{path}")
        @status.update("number" => number)
      end
    end

    private
    # Returns [number, path] pairs for the not-yet-imported files, sorted by
    # number. Files whose base name doesn't start with digits are ignored.
    def collect_targets(latest_number)
      targets = []
      Dir.glob("#{@config.dir}/*.grn") do |path|
        match = /\A\d+/.match(File.basename(path))
        next if match.nil?
        number = Integer(match[0], 10)
        next if number <= latest_number
        targets << [number, path]
      end
      targets.sort_by {|number, _path| number}
    end

    # Streams one file into the parser line by line.
    def import_file(parser, path)
      File.open(path) do |input|
        last_line = nil
        input.each_line do |line|
          last_line = line
          parser << line
        end
        # The parser is shared by all files. Terminate a final line that has
        # no newline so that it isn't joined with the next file's first
        # line. (The block variable "line" isn't visible here, so the
        # newline is fed explicitly.)
        if last_line and not last_line.end_with?("\n")
          parser << "\n"
        end
      end
    end

    # Builds the streaming parser and wires its callbacks to the writer.
    def create_command_parser
      parser = Groonga::Command::Parser.new

      parser.on_command do |command|
        write_command(command)
      end

      parser.on_load_columns do |command, columns|
        command[:columns] ||= columns.join(",")
      end

      # Values of a streamed "load" are buffered and flushed in chunks of
      # this many records.
      split_load_chunk_size = 10000
      load_values = []
      parser.on_load_value do |command, value|
        # Buffer only when the values weren't given as the "values"
        # argument of the command itself.
        unless command[:values]
          load_values << value
          if load_values.size == split_load_chunk_size
            write_load_command(command, load_values)
            load_values.clear
          end
        end
        command.original_source.clear
      end

      parser.on_load_complete do |command|
        if command[:values]
          write_load_command(command)
        else
          unless load_values.empty?
            write_load_command(command, load_values)
            load_values.clear
          end
        end
      end

      parser
    end

    # Writes a non-load command: "delete" by key becomes a delete entry,
    # every other command is written as a schema change.
    #
    # Raises NotImplementedError for a "delete" without a key.
    def write_command(command)
      case command.command_name
      when "delete"
        if command[:key]
          @writer.write_deletes(command[:table], [command[:key]])
        else
          raise NotImplementedError,
                "delete by not _key isn't supported yet: #{command.to_s}"
        end
      else
        @writer.write_schema(command)
      end
    end

    # Writes the records of a "load" command as upserts. When the command
    # has explicit columns, they are emitted as the first row.
    #
    # values: records to write; defaults to the values of the command.
    def write_load_command(command, values=nil)
      columns = command.columns
      values ||= command.values
      if columns
        # Keep a reference to the records: "values" is rebound to the
        # enumerator below, so iterating "values" inside the block would
        # make the enumerator iterate itself endlessly.
        original_values = values
        values = Enumerator.new do |yielder|
          yielder << columns
          original_values.each do |value|
            yielder << value
          end
        end
      end
      @writer.write_upserts(command.table, values)
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # Copyright (C) 2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module GroongaDelta
  # Log formatter that renders every log entry as LTSV (labeled
  # tab-separated values) records with the labels "timestamp", "severity",
  # "pid" and "message".
  class LTSVLogFormatter
    # Formats one log entry.
    #
    # A String message is used as is, an Exception is rendered as
    # "Class: message" followed by its backtrace, and any other object is
    # rendered with #inspect. Each line of the message and each backtrace
    # entry becomes its own record that repeats the common prefix.
    #
    # program_name is accepted but not used.
    #
    # Returns the concatenated records; each record ends with "\n".
    def call(severity, time, program_name, message)
      labels = {
        severity: severity,
        timestamp: time.strftime("%Y-%m-%dT%H:%M:%S.%N"),
        pid: Process.pid,
      }
      prefix = format("timestamp:%{timestamp}\tseverity:%{severity}\tpid:%{pid}",
                      labels)
      traces = []
      text =
        case message
        when String
          message
        when Exception
          traces = message.backtrace || []
          "#{message.class}: #{message}"
        else
          message.inspect
        end
      entries = text.each_line(chomp: true).to_a + traces
      entries.collect {|entry| "#{prefix}\tmessage:#{escape_value(entry)}\n"}.join
    end

    private
    # Replaces the characters that would break an LTSV record (tab, CR and
    # LF) with a space.
    def escape_value(value)
      value.gsub(/[\t\r\n]/, " ")
    end
  end
end
@@ -0,0 +1,314 @@
1
+ # Copyright (C) 2021-2022 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require_relative "error"
17
+
18
module GroongaDelta
  # Interprets the mapping configuration (Groonga table name => details
  # with optional "restriction" and "sources") and exposes it indexed by
  # source database and source table.
  class Mapping
    # All SourceDatabase objects, in the order their names first appear in
    # the configuration.
    attr_reader :source_databases

    # data: Hash of Groonga table name => details.
    def initialize(data)
      @data = data
      build_source_databases
    end

    # Looks up a SourceDatabase by name or, when source_table_name is also
    # given, a SourceTable. Returns nil when nothing matches.
    def [](source_database_name, source_table_name=nil)
      if source_table_name.nil?
        @source_databases_index[source_database_name]
      else
        @source_tables_index[[source_database_name, source_table_name]]
      end
    end

    private
    # Builds @source_databases and both lookup indexes from @data.
    def build_source_databases
      raw_source_databases = {}
      @data.each do |groonga_table_name, details|
        restriction = Restriction.new(details["restriction"])
        (details["sources"] || []).each do |source|
          source_table = build_source_table(groonga_table_name,
                                            restriction,
                                            source)
          (raw_source_databases[source["database"]] ||= []) << source_table
        end
      end
      index_source_databases(raw_source_databases)
    end

    # Builds the SourceTable (and the GroongaTable it feeds) for one entry
    # of "sources".
    def build_source_table(groonga_table_name, restriction, source)
      groonga_columns = []
      source_column_names = []
      source["columns"].each do |name, template|
        expression = nil
        type = nil
        # A column is either a plain template string or a Hash of options.
        if template.is_a?(Hash)
          options = template
          template = options["template"]
          expression = options["expression"]
          type = options["type"]
          # NOTE(review): these names are appended as given while the
          # names extracted from templates below are Symbols -- confirm
          # that consumers accept both.
          source_column_names.concat(options["source_column_names"] || [])
        end
        groonga_columns << GroongaColumn.new(name,
                                             template,
                                             expression,
                                             type,
                                             restriction)
        next unless template
        # Every %{name} reference in the template is a source column.
        template.scan(/%{(.*?)}/).flatten.each do |source_column_name|
          source_column_names << source_column_name.to_sym
        end
      end
      source_column_names.uniq!
      groonga_table = GroongaTable.new(groonga_table_name, groonga_columns)
      SourceTable.new(source["table"],
                      source_column_names,
                      source["filter"],
                      groonga_table)
    end

    # Wraps the collected source tables in SourceDatabase objects and fills
    # the by-database and by-[database, table] indexes.
    def index_source_databases(raw_source_databases)
      @source_databases = []
      @source_databases_index = {}
      @source_tables_index = {}
      raw_source_databases.each do |source_database_name, source_tables|
        source_database = SourceDatabase.new(source_database_name,
                                             source_tables)
        @source_databases << source_database
        @source_databases_index[source_database.name] = source_database
        source_tables.each do |source_table|
          key = [source_database.name, source_table.name]
          @source_tables_index[key] = source_table
        end
      end
    end

    # Optional bounds ("time" => {"max" => ..., "min" => ...}) applied to
    # Time column values.
    class Restriction
      attr_reader :time_max
      attr_reader :time_min
      # data: the "restriction" Hash of a table, or nil.
      def initialize(data)
        @data = data
        @time_max = time_value("time", "max")
        @time_min = time_value("time", "min")
      end

      private
      # Returns the configured value at keys as a local Time, or nil when
      # it isn't configured.
      #
      # NOTE(review): Time.parse comes from the "time" standard library,
      # which this file doesn't require itself -- confirm it is loaded
      # before this runs.
      def time_value(*keys)
        return nil if @data.nil?
        value = @data.dig(*keys)
        return nil if value.nil?
        Time.parse(value).localtime
      end
    end

    # A source database and the source tables mapped from it.
    class SourceDatabase
      attr_reader :name
      attr_reader :source_tables
      def initialize(name, source_tables)
        @name = name
        @source_tables = source_tables
      end
    end

    # A source table, the source columns it needs, its optional filter and
    # the Groonga table it feeds.
    class SourceTable
      attr_reader :name
      attr_reader :source_column_names
      attr_reader :source_filter
      attr_reader :groonga_table
      def initialize(name, source_column_names, source_filter, groonga_table)
        @name = name
        @source_column_names = source_column_names
        @source_filter = source_filter
        @groonga_table = groonga_table
      end
    end

    # A Groonga table and the columns generated for it.
    class GroongaTable
      attr_reader :name
      attr_reader :groonga_columns
      def initialize(name, groonga_columns)
        @name = name
        @groonga_columns = groonga_columns
      end

      # Returns a Hash of column name (Symbol) => generated value for one
      # source record.
      def generate_record(source_record)
        @groonga_columns.each_with_object({}) do |groonga_column, record|
          value = groonga_column.generate_value(source_record)
          record[groonga_column.name.to_sym] = value
        end
      end

      # Generates the records for source_records, appends them to an
      # Arrow::RecordBatchBuilder and returns the result of flushing it.
      #
      # NOTE(review): Arrow isn't required by this file -- presumably it is
      # loaded by the caller; confirm.
      def generate_record_batch(source_records)
        fields = @groonga_columns.collect do |groonga_column|
          {
            name: groonga_column.name,
            data_type: groonga_column.arrow_type,
          }
        end
        builder = Arrow::RecordBatchBuilder.new(fields)
        # Records are generated while the builder consumes them.
        groonga_records = Enumerator.new do |yielder|
          source_records.each do |source_record|
            yielder << generate_record(source_record)
          end
        end
        builder.append_records(groonga_records)
        builder.flush
      end
    end

    # A Groonga column whose value is generated from a source record,
    # either by a format template or by a Ruby expression.
    class GroongaColumn
      attr_reader :name
      attr_reader :template
      attr_reader :expression
      attr_reader :type
      attr_reader :restriction
      def initialize(name, template, expression, type, restriction)
        @name = name
        @template = template
        @expression = expression
        @type = type
        @restriction = restriction
      end

      # Generates the column value for source_record (a Hash with Symbol
      # keys). The template takes precedence over the expression.
      def generate_value(source_record)
        if @template
          value = cast(@template % source_record)
        else
          evaluator = ExpressionEvaluator.new(source_record)
          value = evaluator.evaluate(@expression)
        end
        normalize_value(value)
      end

      # Returns the Arrow data type for this column; types without a
      # dedicated case are returned unchanged.
      def arrow_type
        case @type
        when nil, "ShortText", "Text", "LongText"
          :string
        when "Bool"
          :boolean
        when "Time"
          Arrow::TimestampDataType.new(:nano)
        else
          @type
        end
      end

      private
      # Converts the String produced by the template to the column type.
      # An empty String becomes the zero value of numeric and Bool types.
      #
      # Raises ConfigError for an unknown type.
      def cast(value)
        case @type
        when nil, "ShortText", "Text", "LongText"
          value
        when /\AU?Int(?:8|16|32|64)\z/
          return 0 if value.empty?
          Integer(value, 10)
        when "Float"
          return 0.0 if value.empty?
          Float(value)
        when "Bool"
          return false if value.empty?
          value != "0"
        when "Time"
          cast_time(value)
        else
          raise ConfigError, "Unknown type: #{@type}: #{value.inspect}"
        end
      end

      # Parses "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS +ZZZZ" (the Time#to_s
      # layout) into a Time; any other text is returned unchanged.
      def cast_time(value)
        case value
        when /\A(\d{4})-(\d{2})-(\d{2})\z/
          year, month, day = Regexp.last_match.captures.collect do |digits|
            Integer(digits, 10)
          end
          Time.new(year, month, day)
        # The separators are literal spaces in a single-line pattern: the
        # former multi-line /x pattern marked them with a backslash at the
        # end of the line, which matches no space once the trailing
        # whitespace after the backslash is missing.
        when /\A(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) ([+-])(\d{2})(\d{2})\z/
          match = Regexp.last_match
          components = match.captures[0, 6].collect do |digits|
            Integer(digits, 10)
          end
          timezone = "#{match[7]}#{match[8]}:#{match[9]}"
          time = Time.new(*components, timezone)
          time.utc.localtime
        else
          value
        end
      end

      # For Time columns, converts the value to a Time when it responds to
      # #to_time and clamps it to the restriction. Other values are
      # returned unchanged.
      def normalize_value(value)
        return value unless @type == "Time"
        if !value.is_a?(Time) and value.respond_to?(:to_time)
          value = value.to_time
        end
        time_max = @restriction.time_max
        time_min = @restriction.time_min
        return value if time_max.nil? and time_min.nil?
        return value unless value.is_a?(Time)
        if time_max and value >= time_max
          time_max
        elsif time_min and value <= time_min
          time_min
        else
          value
        end
      end
    end

    # Evaluates a column expression with the fields of one source record
    # available as methods, together with the helpers defined in Context.
    class ExpressionEvaluator
      # Receiver of the evaluated expression. It inherits from BasicObject
      # so that source record fields don't clash with Object methods.
      class Context < BasicObject
        # Removes everything that looks like an HTML tag from text.
        def html_untag(text)
          text.gsub(/<.*?>/, "")
        end

        # Escapes text for use as one term of a Groonga query: text that
        # contains a special character is double-quoted, and a bare "OR"
        # is quoted so that it isn't taken as the operator.
        def groonga_escape_query(text)
          case text
          when /[+\-><~*()"\\: ]/
            # Inside the quotes both the quote and the backslash have to
            # be escaped with a backslash.
            escaped = text.gsub(/["\\]/) {|character| "\\#{character}"}
            "\"#{escaped}\""
          when "OR"
            "\"OR\""
          else
            text
          end
        end
      end

      # source_record: Hash of field name => value; every key becomes a
      # method of the evaluation context.
      def initialize(source_record)
        @context = Context.new
        # BasicObject has no #singleton_class, so borrow Kernel's.
        context_singleton_class =
          Kernel.instance_method(:singleton_class).bind(@context).call
        source_record.each do |key, value|
          context_singleton_class.define_method(key) do
            value
          end
        end
      end

      # Evaluates expression (Ruby source text) in the context and returns
      # its result.
      #
      # NOTE(review): the expression comes from the mapping configuration
      # and runs as arbitrary Ruby code; the BasicObject context isn't a
      # sandbox, so the configuration must be trusted.
      def evaluate(expression)
        @context.instance_eval(expression, __FILE__, __LINE__)
      end
    end
  end
end