embulk 0.10.26-java → 0.10.27-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 37c5b862011b9f51b60707e073634c4d5f93a958
4
- data.tar.gz: 605d208a67523de7e3fc0e89a5150694365b7799
3
+ metadata.gz: f38ba95abc72fbadd053d8b57f7b1513ae15a55e
4
+ data.tar.gz: 0326ed992defdc94711bf9dbf29d2166da13b0bd
5
5
  SHA512:
6
- metadata.gz: 7fa4b558d76d6b58a299eb96ef838e611a66e665c71fb99e7a30bf918401c62f83dc30f222d74e3a4e903153e2d6c140f184117e0cda4d1de6a0b8895599251f
7
- data.tar.gz: 309678e2d2e996dd60f84e1ae49eaafde413e45a7ccab5ad34d00c57fc163f1d5f49cf7f559acd2f0f919035050fceea08a292508366051e97b6d956aa33de69
6
+ metadata.gz: 72a48650886ff80ff88e9b481cf52bdd557b5ff1d956814da6ba8659d8078dfb29668f71eb8a13d524ee22d1c19da3ac8d1c91c381bc579ddeb213c4f3fdafaa
7
+ data.tar.gz: d7dbb35669b4190eb072338649221a128314fd903c78a2f05be4a31e7f3da2811d51150a48ca9bdea4e943b3db0d6163e5b549cfaddb4eda98565f55a9f08e16
data/embulk.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |gem|
2
2
  gem.name = "embulk"
3
- gem.version = "0.10.26"
3
+ gem.version = "0.10.27"
4
4
  gem.license = "Apache-2.0"
5
5
 
6
6
  gem.summary = "Embulk's runtime library for Ruby."
@@ -33,7 +33,7 @@ Gem::Specification.new do |gem|
33
33
  "documentation_uri" => "https://www.embulk.org/",
34
34
  "homepage_uri" => gem.homepage,
35
35
  # "mailing_list_uri" => "",
36
- "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.26",
36
+ "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.27",
37
37
  # "wiki_uri" => "",
38
38
  }
39
39
  end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- GEM_VERSION_EMBEDDED = "0.10.26"
2
+ GEM_VERSION_EMBEDDED = "0.10.27"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.26
4
+ version: 0.10.27
5
5
  platform: java
6
6
  authors:
7
7
  - Sadayuki Furuhashi
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-01-28 00:00:00.000000000 Z
13
+ date: 2021-03-12 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  requirement: !ruby/object:Gem::Requirement
@@ -52,12 +52,7 @@ files:
52
52
  - lib/embulk/filter_plugin.rb
53
53
  - lib/embulk/formatter_plugin.rb
54
54
  - lib/embulk/gem_version.rb
55
- - lib/embulk/guess/bzip2.rb
56
55
  - lib/embulk/guess/charset.rb
57
- - lib/embulk/guess/csv.rb
58
- - lib/embulk/guess/csv_all_strings.rb
59
- - lib/embulk/guess/gzip.rb
60
- - lib/embulk/guess/json.rb
61
56
  - lib/embulk/guess/newline.rb
62
57
  - lib/embulk/guess/schema_guess.rb
63
58
  - lib/embulk/guess/time_format_guess.rb
@@ -83,7 +78,7 @@ metadata:
83
78
  changelog_uri: https://github.com/embulk/embulk/releases
84
79
  documentation_uri: https://www.embulk.org/
85
80
  homepage_uri: https://www.embulk.org/
86
- source_code_uri: https://github.com/embulk/embulk/tree/v0.10.26
81
+ source_code_uri: https://github.com/embulk/embulk/tree/v0.10.27
87
82
  post_install_message:
88
83
  rdoc_options: []
89
84
  require_paths:
@@ -1,23 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class Bzip2GuessPlugin < GuessPlugin
5
- Plugin.register_guess('bzip2', self)
6
-
7
- # magic: BZ
8
- # version: 'h' = bzip2
9
- # blocksize: 1 .. 9
10
- # block magic: 0x314159265359 (6 bytes)
11
- block_magic = [0x31, 0x41, 0x59, 0x26, 0x53, 0x59].pack('C*')
12
- BZIP2_HEADER_PATTERN = /BZh[1-9]#{Regexp.quote(block_magic)}/n
13
-
14
- def guess(config, sample_buffer)
15
- if sample_buffer[0,10] =~ BZIP2_HEADER_PATTERN
16
- return {"decoders" => [{"type" => "bzip2"}]}
17
- end
18
- return {}
19
- end
20
- end
21
-
22
- end
23
- end
@@ -1,374 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/schema_guess'
4
-
5
- class CsvGuessPlugin < LineGuessPlugin
6
- Plugin.register_guess('csv', self)
7
-
8
- DELIMITER_CANDIDATES = [
9
- ",", "\t", "|", ";"
10
- ]
11
-
12
- QUOTE_CANDIDATES = [
13
- "\"", "'"
14
- ]
15
-
16
- ESCAPE_CANDIDATES = [
17
- "\\", '"'
18
- ]
19
-
20
- NULL_STRING_CANDIDATES = [
21
- "null",
22
- "NULL",
23
- "#N/A",
24
- "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
- ]
26
-
27
- COMMENT_LINE_MARKER_CANDIDATES = [
28
- "#",
29
- "//",
30
- ]
31
-
32
- MAX_SKIP_LINES = 10
33
- NO_SKIP_DETECT_LINES = 10
34
-
35
- def guess_lines(config, sample_lines)
36
- return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"
37
-
38
- parser_config = config["parser"] || {}
39
- if parser_config["type"] == "csv" && parser_config["delimiter"]
40
- delim = parser_config["delimiter"]
41
- else
42
- delim = guess_delimiter(sample_lines)
43
- unless delim
44
- # assuming single column CSV
45
- delim = DELIMITER_CANDIDATES.first
46
- end
47
- end
48
-
49
- parser_guessed = DataSource.new.merge(parser_config).merge({"type" => "csv", "delimiter" => delim})
50
-
51
- unless parser_guessed.has_key?("quote")
52
- quote = guess_quote(sample_lines, delim)
53
- unless quote
54
- if !guess_force_no_quote(sample_lines, delim, '"')
55
- # assuming CSV follows RFC for quoting
56
- quote = '"'
57
- else
58
- # disable quoting (set null)
59
- end
60
- end
61
- parser_guessed["quote"] = quote
62
- end
63
- parser_guessed["quote"] = '"' if parser_guessed["quote"] == '' # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.
64
-
65
- unless parser_guessed.has_key?("escape")
66
- if quote = parser_guessed["quote"]
67
- escape = guess_escape(sample_lines, delim, quote)
68
- unless escape
69
- if quote == '"'
70
- # assuming this CSV follows RFC for escaping
71
- escape = '"'
72
- else
73
- # disable escaping (set null)
74
- end
75
- end
76
- parser_guessed["escape"] = escape
77
- else
78
- # escape does nothing if quote is disabled
79
- end
80
- end
81
-
82
- unless parser_guessed.has_key?("null_string")
83
- null_string = guess_null_string(sample_lines, delim)
84
- parser_guessed["null_string"] = null_string if null_string
85
- # don't even set null_string to avoid confusion of null and 'null' in YAML format
86
- end
87
-
88
- # guessing skip_header_lines should be before guessing guess_comment_line_marker
89
- # because lines supplied to CsvTokenizer already don't include skipped header lines.
90
- # skipping empty lines is also disabled here because skipping header lines is done by
91
- # CsvParser which doesn't skip empty lines automatically
92
- sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
93
- skip_header_lines = guess_skip_header_lines(sample_records)
94
- sample_lines = sample_lines[skip_header_lines..-1]
95
- sample_records = sample_records[skip_header_lines..-1]
96
-
97
- unless parser_guessed.has_key?("comment_line_marker")
98
- comment_line_marker, sample_lines =
99
- guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
100
- if comment_line_marker
101
- parser_guessed["comment_line_marker"] = comment_line_marker
102
- end
103
- end
104
-
105
- sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})
106
-
107
- # It should fail if CSV parser cannot parse sample_lines.
108
- if sample_records.nil? || sample_records.empty?
109
- return {}
110
- end
111
-
112
- if sample_lines.size == 1
113
- # The file contains only 1 line. Assume that there are no header line.
114
- header_line = false
115
-
116
- column_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
117
-
118
- unless parser_guessed.has_key?("trim_if_not_quoted")
119
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
120
- column_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed)
121
- if column_types != column_types_trimmed
122
- parser_guessed["trim_if_not_quoted"] = true
123
- column_types = column_types_trimmed
124
- else
125
- parser_guessed["trim_if_not_quoted"] = false
126
- end
127
- end
128
- else
129
- # The file contains more than 1 line. If guessed first line's column types are all strings or boolean, and the types are
130
- # different from the other lines, assume that the first line is column names.
131
- first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
132
- other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])
133
-
134
- unless parser_guessed.has_key?("trim_if_not_quoted")
135
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
136
- other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
137
- if other_types != other_types_trimmed
138
- parser_guessed["trim_if_not_quoted"] = true
139
- other_types = other_types_trimmed
140
- else
141
- parser_guessed["trim_if_not_quoted"] = false
142
- end
143
- end
144
-
145
- header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) }) || guess_string_header_line(sample_records)
146
- column_types = other_types
147
- end
148
-
149
- if column_types.empty?
150
- # TODO here is making the guessing failed if the file doesn't contain any columns. However,
151
- # this may not be convenient for users.
152
- return {}
153
- end
154
-
155
- if header_line
156
- parser_guessed["skip_header_lines"] = skip_header_lines + 1
157
- else
158
- parser_guessed["skip_header_lines"] = skip_header_lines
159
- end
160
-
161
- parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
162
- parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")
163
-
164
- if header_line
165
- column_names = sample_records.first.map(&:strip)
166
- else
167
- column_names = (0..column_types.size).to_a.map {|i| "c#{i}" }
168
- end
169
- schema = []
170
- column_names.zip(column_types).each do |name,type|
171
- if name && type
172
- schema << new_column(name, type)
173
- end
174
- end
175
- parser_guessed["columns"] = schema
176
-
177
- return {"parser" => parser_guessed}
178
- end
179
-
180
- def new_column(name, type)
181
- if type.is_a?(SchemaGuess::TimestampTypeMatch)
182
- {"name" => name, "type" => type, "format" => type.format}
183
- else
184
- {"name" => name, "type" => type}
185
- end
186
- end
187
-
188
- private
189
-
190
- def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
191
- null_string = parser_config["null_string"]
192
- config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
193
- parser_task = config.load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
194
- data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
195
- sample = Buffer.from_ruby_string(data)
196
- decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
197
- tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
198
- rows = []
199
- while tokenizer.nextFile
200
- while tokenizer.nextRecord(skip_empty_lines)
201
- begin
202
- columns = []
203
- while true
204
- begin
205
- column = tokenizer.nextColumn
206
- quoted = tokenizer.wasQuotedColumn
207
- if null_string && !quoted && column == null_string
208
- column = nil
209
- end
210
- columns << column
211
- rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
212
- rows << columns
213
- break
214
- end
215
- end
216
- rescue org.embulk.standards.CsvTokenizer::InvalidValueException
217
- # TODO warning
218
- tokenizer.skipCurrentLine
219
- end
220
- end
221
- end
222
- return rows
223
- rescue
224
- # TODO warning if fallback to this ad-hoc implementation
225
- sample_lines.map {|line| line.split(delim) }
226
- end
227
-
228
- def guess_delimiter(sample_lines)
229
- delim_weights = DELIMITER_CANDIDATES.map do |d|
230
- counts = sample_lines.map {|line| line.count(d) }
231
- total = array_sum(counts)
232
- if total > 0
233
- stddev = array_standard_deviation(counts)
234
- stddev = 0.000000001 if stddev == 0.0
235
- weight = total / stddev
236
- [d, weight]
237
- else
238
- [nil, 0]
239
- end
240
- end
241
-
242
- delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
243
- if delim != nil && weight > 1
244
- return delim
245
- else
246
- return nil
247
- end
248
- end
249
-
250
- def guess_quote(sample_lines, delim)
251
- delim_regexp = Regexp.escape(delim)
252
- quote_weights = QUOTE_CANDIDATES.map do |q|
253
- weights = sample_lines.map do |line|
254
- q_regexp = Regexp.escape(q)
255
- count = line.count(q)
256
- if count > 0
257
- weight = count
258
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
259
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
260
- weight
261
- else
262
- nil
263
- end
264
- end.compact
265
- weights.empty? ? 0 : array_avg(weights)
266
- end
267
- quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
268
- if weight >= 10.0
269
- return quote
270
- else
271
- return nil
272
- end
273
- end
274
-
275
- def guess_force_no_quote(sample_lines, delim, quote_candidate)
276
- delim_regexp = Regexp.escape(delim)
277
- q_regexp = Regexp.escape(quote_candidate)
278
- sample_lines.any? do |line|
279
- # quoting character appear at the middle of a non-quoted value
280
- line =~ /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
281
- end
282
- end
283
-
284
- def guess_escape(sample_lines, delim, quote)
285
- guessed = ESCAPE_CANDIDATES.map do |str|
286
- regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
287
- counts = sample_lines.map {|line| line.scan(regexp).count }
288
- count = counts.inject(0) {|r,c| r + c }
289
- [str, count]
290
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
291
- found = guessed.first
292
- return found ? found[0] : nil
293
- end
294
-
295
- def guess_null_string(sample_lines, delim)
296
- guessed = NULL_STRING_CANDIDATES.map do |str|
297
- regexp = /(?:^|#{Regexp.quote(delim)})#{Regexp.quote(str)}(?:$|#{Regexp.quote(delim)})/
298
- counts = sample_lines.map {|line| line.scan(regexp).count }
299
- count = counts.inject(0) {|r,c| r + c }
300
- [str, count]
301
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
302
- found_str, found_count = guessed.first
303
- return found_str ? found_str : nil
304
- end
305
-
306
- def guess_skip_header_lines(sample_records)
307
- counts = sample_records.map {|records| records.size }
308
- (1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
309
- check_row_count = counts[i-1]
310
- if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= check_row_count }
311
- return i - 1
312
- end
313
- end
314
- return 0
315
- end
316
-
317
- def guess_comment_line_marker(sample_lines, delim, quote, null_string)
318
- exclude = []
319
- exclude << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
320
- exclude << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string
321
-
322
- guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
323
- regexp = /^#{Regexp.quote(str)}/
324
- unmatch_lines = sample_lines.reject do |line|
325
- exclude.all? {|ex| line !~ ex } && line =~ regexp
326
- end
327
- match_count = sample_lines.size - unmatch_lines.size
328
- [str, match_count, unmatch_lines]
329
- end.select {|str,match_count,unmatch_lines| match_count > 0 }.sort_by {|str,match_count,unmatch_lines| -match_count }
330
-
331
- str, match_count, unmatch_lines = guessed.first
332
- if str
333
- return str, unmatch_lines
334
- else
335
- return nil, sample_lines
336
- end
337
- end
338
-
339
- def guess_string_header_line(sample_records)
340
- first = sample_records.first
341
- first.count.times do |column_index|
342
- lengths = sample_records.map {|row| row[column_index] }.compact.map {|v| v.to_s.size }
343
- if lengths.size > 1
344
- if array_variance(lengths[1..-1]) <= 0.2
345
- avg = array_avg(lengths[1..-1])
346
- if avg == 0.0 ? lengths[0] > 1 : (avg - lengths[0]).abs / avg > 0.7
347
- return true
348
- end
349
- end
350
- end
351
- end
352
- return false
353
- end
354
-
355
- def array_sum(array)
356
- array.inject(0) {|r,i| r += i }
357
- end
358
-
359
- def array_avg(array)
360
- array.inject(0.0) {|r,i| r += i } / array.size
361
- end
362
-
363
- def array_variance(array)
364
- avg = array_avg(array)
365
- array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
366
- end
367
-
368
- def array_standard_deviation(array)
369
- Math.sqrt(array_variance(array))
370
- end
371
- end
372
-
373
- end
374
- end
@@ -1,13 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/csv'
4
-
5
- class CsvAllStringsGuessPlugin < CsvGuessPlugin
6
- Plugin.register_guess("csv_all_strings", self)
7
-
8
- def new_column(name, type)
9
- {"name" => name, "type" => "string"}
10
- end
11
- end
12
- end
13
- end
@@ -1,18 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class GzipGuessPlugin < GuessPlugin
5
- Plugin.register_guess('gzip', self)
6
-
7
- GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
8
-
9
- def guess(config, sample_buffer)
10
- if sample_buffer[0,2] == GZIP_HEADER
11
- return {"decoders" => [{"type" => "gzip"}]}
12
- end
13
- return {}
14
- end
15
- end
16
-
17
- end
18
- end
@@ -1,50 +0,0 @@
1
- module Embulk
2
- module Guess
3
- class JsonGuessPlugin < GuessPlugin
4
- Plugin.register_guess('json', self)
5
-
6
- java_import 'com.google.common.collect.Lists'
7
- java_import 'java.io.ByteArrayInputStream'
8
- java_import 'org.embulk.spi.Exec'
9
- java_import 'org.embulk.spi.json.JsonParser'
10
- java_import 'org.embulk.spi.json.JsonParseException'
11
- java_import 'org.embulk.spi.util.FileInputInputStream'
12
- java_import 'org.embulk.spi.util.InputStreamFileInput'
13
-
14
- def guess(config, sample_buffer)
15
- return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"
16
-
17
- # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
18
- json_parser = new_json_parser(sample_buffer)
19
- one_json_parsed = false
20
- begin
21
- while (v = json_parser.next)
22
- # "v" needs to be JSON object type (isMapValue) because:
23
- # 1) Single-column CSV can be mis-guessed as JSON if JSON non-objects are accepted.
24
- # 2) JsonParserPlugin accepts only the JSON object type.
25
- raise JsonParseException.new("v must be JSON object type") unless v.isMapValue
26
- one_json_parsed = true
27
- end
28
- rescue JsonParseException
29
- # the exception is ignored
30
- end
31
-
32
- if one_json_parsed
33
- return {"parser" => {"type" => "json"}} # if JsonParser can parse even one JSON data
34
- else
35
- return {}
36
- end
37
- end
38
-
39
- private
40
-
41
- def new_json_parser(buffer)
42
- input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
43
- iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
44
- input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
45
- input.nextFile
46
- JsonParser.new.open(input)
47
- end
48
- end
49
- end
50
- end