embulk 0.10.26-java → 0.10.27-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 37c5b862011b9f51b60707e073634c4d5f93a958
4
- data.tar.gz: 605d208a67523de7e3fc0e89a5150694365b7799
3
+ metadata.gz: f38ba95abc72fbadd053d8b57f7b1513ae15a55e
4
+ data.tar.gz: 0326ed992defdc94711bf9dbf29d2166da13b0bd
5
5
  SHA512:
6
- metadata.gz: 7fa4b558d76d6b58a299eb96ef838e611a66e665c71fb99e7a30bf918401c62f83dc30f222d74e3a4e903153e2d6c140f184117e0cda4d1de6a0b8895599251f
7
- data.tar.gz: 309678e2d2e996dd60f84e1ae49eaafde413e45a7ccab5ad34d00c57fc163f1d5f49cf7f559acd2f0f919035050fceea08a292508366051e97b6d956aa33de69
6
+ metadata.gz: 72a48650886ff80ff88e9b481cf52bdd557b5ff1d956814da6ba8659d8078dfb29668f71eb8a13d524ee22d1c19da3ac8d1c91c381bc579ddeb213c4f3fdafaa
7
+ data.tar.gz: d7dbb35669b4190eb072338649221a128314fd903c78a2f05be4a31e7f3da2811d51150a48ca9bdea4e943b3db0d6163e5b549cfaddb4eda98565f55a9f08e16
data/embulk.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |gem|
2
2
  gem.name = "embulk"
3
- gem.version = "0.10.26"
3
+ gem.version = "0.10.27"
4
4
  gem.license = "Apache-2.0"
5
5
 
6
6
  gem.summary = "Embulk's runtime library for Ruby."
@@ -33,7 +33,7 @@ Gem::Specification.new do |gem|
33
33
  "documentation_uri" => "https://www.embulk.org/",
34
34
  "homepage_uri" => gem.homepage,
35
35
  # "mailing_list_uri" => "",
36
- "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.26",
36
+ "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.27",
37
37
  # "wiki_uri" => "",
38
38
  }
39
39
  end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- GEM_VERSION_EMBEDDED = "0.10.26"
2
+ GEM_VERSION_EMBEDDED = "0.10.27"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.26
4
+ version: 0.10.27
5
5
  platform: java
6
6
  authors:
7
7
  - Sadayuki Furuhashi
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-01-28 00:00:00.000000000 Z
13
+ date: 2021-03-12 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  requirement: !ruby/object:Gem::Requirement
@@ -52,12 +52,7 @@ files:
52
52
  - lib/embulk/filter_plugin.rb
53
53
  - lib/embulk/formatter_plugin.rb
54
54
  - lib/embulk/gem_version.rb
55
- - lib/embulk/guess/bzip2.rb
56
55
  - lib/embulk/guess/charset.rb
57
- - lib/embulk/guess/csv.rb
58
- - lib/embulk/guess/csv_all_strings.rb
59
- - lib/embulk/guess/gzip.rb
60
- - lib/embulk/guess/json.rb
61
56
  - lib/embulk/guess/newline.rb
62
57
  - lib/embulk/guess/schema_guess.rb
63
58
  - lib/embulk/guess/time_format_guess.rb
@@ -83,7 +78,7 @@ metadata:
83
78
  changelog_uri: https://github.com/embulk/embulk/releases
84
79
  documentation_uri: https://www.embulk.org/
85
80
  homepage_uri: https://www.embulk.org/
86
- source_code_uri: https://github.com/embulk/embulk/tree/v0.10.26
81
+ source_code_uri: https://github.com/embulk/embulk/tree/v0.10.27
87
82
  post_install_message:
88
83
  rdoc_options: []
89
84
  require_paths:
@@ -1,23 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class Bzip2GuessPlugin < GuessPlugin
5
- Plugin.register_guess('bzip2', self)
6
-
7
- # magic: BZ
8
- # version: 'h' = bzip2
9
- # blocksize: 1 .. 9
10
- # block magic: 0x314159265359 (6 bytes)
11
- block_magic = [0x31, 0x41, 0x59, 0x26, 0x53, 0x59].pack('C*')
12
- BZIP2_HEADER_PATTERN = /BZh[1-9]#{Regexp.quote(block_magic)}/n
13
-
14
- def guess(config, sample_buffer)
15
- if sample_buffer[0,10] =~ BZIP2_HEADER_PATTERN
16
- return {"decoders" => [{"type" => "bzip2"}]}
17
- end
18
- return {}
19
- end
20
- end
21
-
22
- end
23
- end
@@ -1,374 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/schema_guess'
4
-
5
- class CsvGuessPlugin < LineGuessPlugin
6
- Plugin.register_guess('csv', self)
7
-
8
- DELIMITER_CANDIDATES = [
9
- ",", "\t", "|", ";"
10
- ]
11
-
12
- QUOTE_CANDIDATES = [
13
- "\"", "'"
14
- ]
15
-
16
- ESCAPE_CANDIDATES = [
17
- "\\", '"'
18
- ]
19
-
20
- NULL_STRING_CANDIDATES = [
21
- "null",
22
- "NULL",
23
- "#N/A",
24
- "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
- ]
26
-
27
- COMMENT_LINE_MARKER_CANDIDATES = [
28
- "#",
29
- "//",
30
- ]
31
-
32
- MAX_SKIP_LINES = 10
33
- NO_SKIP_DETECT_LINES = 10
34
-
35
- def guess_lines(config, sample_lines)
36
- return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"
37
-
38
- parser_config = config["parser"] || {}
39
- if parser_config["type"] == "csv" && parser_config["delimiter"]
40
- delim = parser_config["delimiter"]
41
- else
42
- delim = guess_delimiter(sample_lines)
43
- unless delim
44
- # assuming single column CSV
45
- delim = DELIMITER_CANDIDATES.first
46
- end
47
- end
48
-
49
- parser_guessed = DataSource.new.merge(parser_config).merge({"type" => "csv", "delimiter" => delim})
50
-
51
- unless parser_guessed.has_key?("quote")
52
- quote = guess_quote(sample_lines, delim)
53
- unless quote
54
- if !guess_force_no_quote(sample_lines, delim, '"')
55
- # assuming CSV follows RFC for quoting
56
- quote = '"'
57
- else
58
- # disable quoting (set null)
59
- end
60
- end
61
- parser_guessed["quote"] = quote
62
- end
63
- parser_guessed["quote"] = '"' if parser_guessed["quote"] == '' # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.
64
-
65
- unless parser_guessed.has_key?("escape")
66
- if quote = parser_guessed["quote"]
67
- escape = guess_escape(sample_lines, delim, quote)
68
- unless escape
69
- if quote == '"'
70
- # assuming this CSV follows RFC for escaping
71
- escape = '"'
72
- else
73
- # disable escaping (set null)
74
- end
75
- end
76
- parser_guessed["escape"] = escape
77
- else
78
- # escape does nothing if quote is disabled
79
- end
80
- end
81
-
82
- unless parser_guessed.has_key?("null_string")
83
- null_string = guess_null_string(sample_lines, delim)
84
- parser_guessed["null_string"] = null_string if null_string
85
- # don't even set null_string to avoid confusion of null and 'null' in YAML format
86
- end
87
-
88
- # guessing skip_header_lines should be before guessing guess_comment_line_marker
89
- # because lines supplied to CsvTokenizer already don't include skipped header lines.
90
- # skipping empty lines is also disabled here because skipping header lines is done by
91
- # CsvParser which doesn't skip empty lines automatically
92
- sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
93
- skip_header_lines = guess_skip_header_lines(sample_records)
94
- sample_lines = sample_lines[skip_header_lines..-1]
95
- sample_records = sample_records[skip_header_lines..-1]
96
-
97
- unless parser_guessed.has_key?("comment_line_marker")
98
- comment_line_marker, sample_lines =
99
- guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
100
- if comment_line_marker
101
- parser_guessed["comment_line_marker"] = comment_line_marker
102
- end
103
- end
104
-
105
- sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})
106
-
107
- # It should fail if CSV parser cannot parse sample_lines.
108
- if sample_records.nil? || sample_records.empty?
109
- return {}
110
- end
111
-
112
- if sample_lines.size == 1
113
- # The file contains only 1 line. Assume that there are no header line.
114
- header_line = false
115
-
116
- column_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
117
-
118
- unless parser_guessed.has_key?("trim_if_not_quoted")
119
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
120
- column_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed)
121
- if column_types != column_types_trimmed
122
- parser_guessed["trim_if_not_quoted"] = true
123
- column_types = column_types_trimmed
124
- else
125
- parser_guessed["trim_if_not_quoted"] = false
126
- end
127
- end
128
- else
129
- # The file contains more than 1 line. If guessed first line's column types are all strings or boolean, and the types are
130
- # different from the other lines, assume that the first line is column names.
131
- first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
132
- other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])
133
-
134
- unless parser_guessed.has_key?("trim_if_not_quoted")
135
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
136
- other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
137
- if other_types != other_types_trimmed
138
- parser_guessed["trim_if_not_quoted"] = true
139
- other_types = other_types_trimmed
140
- else
141
- parser_guessed["trim_if_not_quoted"] = false
142
- end
143
- end
144
-
145
- header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) }) || guess_string_header_line(sample_records)
146
- column_types = other_types
147
- end
148
-
149
- if column_types.empty?
150
- # TODO here is making the guessing failed if the file doesn't contain any columns. However,
151
- # this may not be convenient for users.
152
- return {}
153
- end
154
-
155
- if header_line
156
- parser_guessed["skip_header_lines"] = skip_header_lines + 1
157
- else
158
- parser_guessed["skip_header_lines"] = skip_header_lines
159
- end
160
-
161
- parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
162
- parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")
163
-
164
- if header_line
165
- column_names = sample_records.first.map(&:strip)
166
- else
167
- column_names = (0..column_types.size).to_a.map {|i| "c#{i}" }
168
- end
169
- schema = []
170
- column_names.zip(column_types).each do |name,type|
171
- if name && type
172
- schema << new_column(name, type)
173
- end
174
- end
175
- parser_guessed["columns"] = schema
176
-
177
- return {"parser" => parser_guessed}
178
- end
179
-
180
- def new_column(name, type)
181
- if type.is_a?(SchemaGuess::TimestampTypeMatch)
182
- {"name" => name, "type" => type, "format" => type.format}
183
- else
184
- {"name" => name, "type" => type}
185
- end
186
- end
187
-
188
- private
189
-
190
- def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
191
- null_string = parser_config["null_string"]
192
- config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
193
- parser_task = config.load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
194
- data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
195
- sample = Buffer.from_ruby_string(data)
196
- decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
197
- tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
198
- rows = []
199
- while tokenizer.nextFile
200
- while tokenizer.nextRecord(skip_empty_lines)
201
- begin
202
- columns = []
203
- while true
204
- begin
205
- column = tokenizer.nextColumn
206
- quoted = tokenizer.wasQuotedColumn
207
- if null_string && !quoted && column == null_string
208
- column = nil
209
- end
210
- columns << column
211
- rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
212
- rows << columns
213
- break
214
- end
215
- end
216
- rescue org.embulk.standards.CsvTokenizer::InvalidValueException
217
- # TODO warning
218
- tokenizer.skipCurrentLine
219
- end
220
- end
221
- end
222
- return rows
223
- rescue
224
- # TODO warning if fallback to this ad-hoc implementation
225
- sample_lines.map {|line| line.split(delim) }
226
- end
227
-
228
- def guess_delimiter(sample_lines)
229
- delim_weights = DELIMITER_CANDIDATES.map do |d|
230
- counts = sample_lines.map {|line| line.count(d) }
231
- total = array_sum(counts)
232
- if total > 0
233
- stddev = array_standard_deviation(counts)
234
- stddev = 0.000000001 if stddev == 0.0
235
- weight = total / stddev
236
- [d, weight]
237
- else
238
- [nil, 0]
239
- end
240
- end
241
-
242
- delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
243
- if delim != nil && weight > 1
244
- return delim
245
- else
246
- return nil
247
- end
248
- end
249
-
250
- def guess_quote(sample_lines, delim)
251
- delim_regexp = Regexp.escape(delim)
252
- quote_weights = QUOTE_CANDIDATES.map do |q|
253
- weights = sample_lines.map do |line|
254
- q_regexp = Regexp.escape(q)
255
- count = line.count(q)
256
- if count > 0
257
- weight = count
258
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
259
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
260
- weight
261
- else
262
- nil
263
- end
264
- end.compact
265
- weights.empty? ? 0 : array_avg(weights)
266
- end
267
- quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
268
- if weight >= 10.0
269
- return quote
270
- else
271
- return nil
272
- end
273
- end
274
-
275
- def guess_force_no_quote(sample_lines, delim, quote_candidate)
276
- delim_regexp = Regexp.escape(delim)
277
- q_regexp = Regexp.escape(quote_candidate)
278
- sample_lines.any? do |line|
279
- # quoting character appear at the middle of a non-quoted value
280
- line =~ /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
281
- end
282
- end
283
-
284
- def guess_escape(sample_lines, delim, quote)
285
- guessed = ESCAPE_CANDIDATES.map do |str|
286
- regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
287
- counts = sample_lines.map {|line| line.scan(regexp).count }
288
- count = counts.inject(0) {|r,c| r + c }
289
- [str, count]
290
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
291
- found = guessed.first
292
- return found ? found[0] : nil
293
- end
294
-
295
- def guess_null_string(sample_lines, delim)
296
- guessed = NULL_STRING_CANDIDATES.map do |str|
297
- regexp = /(?:^|#{Regexp.quote(delim)})#{Regexp.quote(str)}(?:$|#{Regexp.quote(delim)})/
298
- counts = sample_lines.map {|line| line.scan(regexp).count }
299
- count = counts.inject(0) {|r,c| r + c }
300
- [str, count]
301
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
302
- found_str, found_count = guessed.first
303
- return found_str ? found_str : nil
304
- end
305
-
306
- def guess_skip_header_lines(sample_records)
307
- counts = sample_records.map {|records| records.size }
308
- (1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
309
- check_row_count = counts[i-1]
310
- if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= check_row_count }
311
- return i - 1
312
- end
313
- end
314
- return 0
315
- end
316
-
317
- def guess_comment_line_marker(sample_lines, delim, quote, null_string)
318
- exclude = []
319
- exclude << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
320
- exclude << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string
321
-
322
- guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
323
- regexp = /^#{Regexp.quote(str)}/
324
- unmatch_lines = sample_lines.reject do |line|
325
- exclude.all? {|ex| line !~ ex } && line =~ regexp
326
- end
327
- match_count = sample_lines.size - unmatch_lines.size
328
- [str, match_count, unmatch_lines]
329
- end.select {|str,match_count,unmatch_lines| match_count > 0 }.sort_by {|str,match_count,unmatch_lines| -match_count }
330
-
331
- str, match_count, unmatch_lines = guessed.first
332
- if str
333
- return str, unmatch_lines
334
- else
335
- return nil, sample_lines
336
- end
337
- end
338
-
339
- def guess_string_header_line(sample_records)
340
- first = sample_records.first
341
- first.count.times do |column_index|
342
- lengths = sample_records.map {|row| row[column_index] }.compact.map {|v| v.to_s.size }
343
- if lengths.size > 1
344
- if array_variance(lengths[1..-1]) <= 0.2
345
- avg = array_avg(lengths[1..-1])
346
- if avg == 0.0 ? lengths[0] > 1 : (avg - lengths[0]).abs / avg > 0.7
347
- return true
348
- end
349
- end
350
- end
351
- end
352
- return false
353
- end
354
-
355
- def array_sum(array)
356
- array.inject(0) {|r,i| r += i }
357
- end
358
-
359
- def array_avg(array)
360
- array.inject(0.0) {|r,i| r += i } / array.size
361
- end
362
-
363
- def array_variance(array)
364
- avg = array_avg(array)
365
- array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
366
- end
367
-
368
- def array_standard_deviation(array)
369
- Math.sqrt(array_variance(array))
370
- end
371
- end
372
-
373
- end
374
- end
@@ -1,13 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/csv'
4
-
5
- class CsvAllStringsGuessPlugin < CsvGuessPlugin
6
- Plugin.register_guess("csv_all_strings", self)
7
-
8
- def new_column(name, type)
9
- {"name" => name, "type" => "string"}
10
- end
11
- end
12
- end
13
- end
@@ -1,18 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class GzipGuessPlugin < GuessPlugin
5
- Plugin.register_guess('gzip', self)
6
-
7
- GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
8
-
9
- def guess(config, sample_buffer)
10
- if sample_buffer[0,2] == GZIP_HEADER
11
- return {"decoders" => [{"type" => "gzip"}]}
12
- end
13
- return {}
14
- end
15
- end
16
-
17
- end
18
- end
@@ -1,50 +0,0 @@
1
- module Embulk
2
- module Guess
3
- class JsonGuessPlugin < GuessPlugin
4
- Plugin.register_guess('json', self)
5
-
6
- java_import 'com.google.common.collect.Lists'
7
- java_import 'java.io.ByteArrayInputStream'
8
- java_import 'org.embulk.spi.Exec'
9
- java_import 'org.embulk.spi.json.JsonParser'
10
- java_import 'org.embulk.spi.json.JsonParseException'
11
- java_import 'org.embulk.spi.util.FileInputInputStream'
12
- java_import 'org.embulk.spi.util.InputStreamFileInput'
13
-
14
- def guess(config, sample_buffer)
15
- return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"
16
-
17
- # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
18
- json_parser = new_json_parser(sample_buffer)
19
- one_json_parsed = false
20
- begin
21
- while (v = json_parser.next)
22
- # "v" needs to be JSON object type (isMapValue) because:
23
- # 1) Single-column CSV can be mis-guessed as JSON if JSON non-objects are accepted.
24
- # 2) JsonParserPlugin accepts only the JSON object type.
25
- raise JsonParseException.new("v must be JSON object type") unless v.isMapValue
26
- one_json_parsed = true
27
- end
28
- rescue JsonParseException
29
- # the exception is ignored
30
- end
31
-
32
- if one_json_parsed
33
- return {"parser" => {"type" => "json"}} # if JsonParser can parse even one JSON data
34
- else
35
- return {}
36
- end
37
- end
38
-
39
- private
40
-
41
- def new_json_parser(buffer)
42
- input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
43
- iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
44
- input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
45
- input.nextFile
46
- JsonParser.new.open(input)
47
- end
48
- end
49
- end
50
- end