embulk 0.10.24-java → 0.10.29-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c7fcb1a8f1e94112700f6f4593f4038fc9861c4a
4
- data.tar.gz: 8eb07cb5694b38a1937ecbd62d3c193a124a1253
3
+ metadata.gz: 66abd322277fe12424ea6d35c145ef4090d1be8b
4
+ data.tar.gz: e848487bf35166ed5111c4a86a5c3d2520239008
5
5
  SHA512:
6
- metadata.gz: 9aac5551600bee7e5d0836b5f75d956cbcfc657793a062c9849129e6444505b24c38d3f9f9f2d1ce5a6c85a46e44bfa7059890c2a42b57f584ae03eb8712fc4c
7
- data.tar.gz: 584c660fdcd24878c985ca63e55bc606eaad660cc7ed30e208641c08ec6ebe83573200b2f40d8bb908168abe86ea04e6ea9b07f8c87f39252465063cb473e8d4
6
+ metadata.gz: a7256a622d6a3c52ef7bcd2bea647eb60094b3d31803bcc055610a1175c4daeb7f15fffed39f6d12b8d135ae2e34da4e4692e3f425be901350c7bd4c0acb9c1a
7
+ data.tar.gz: a10d0105aea8eafbb30f7134bbc6bb38744e45671b6db53f9229d95eccbdd267f46e1b5f1015cc822ca84a8559a89e1501c480bd0bc5d056185ea03ec731019e
data/embulk.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |gem|
2
2
  gem.name = "embulk"
3
- gem.version = "0.10.24"
3
+ gem.version = "0.10.29"
4
4
  gem.license = "Apache-2.0"
5
5
 
6
6
  gem.summary = "Embulk's runtime library for Ruby."
@@ -33,7 +33,7 @@ Gem::Specification.new do |gem|
33
33
  "documentation_uri" => "https://www.embulk.org/",
34
34
  "homepage_uri" => gem.homepage,
35
35
  # "mailing_list_uri" => "",
36
- "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.24",
36
+ "source_code_uri" => "https://github.com/embulk/embulk/tree/v0.10.29",
37
37
  # "wiki_uri" => "",
38
38
  }
39
39
  end
@@ -0,0 +1,3 @@
1
+ module Embulk
2
+ GEM_VERSION_EMBEDDED = "0.10.29"
3
+ end
data/lib/embulk/logger.rb CHANGED
@@ -1,5 +1,6 @@
1
-
2
1
  module Embulk
2
+ require 'embulk/version' # 'embulk/version' is loaded in the very beginning.
3
+
3
4
  # this file is required before loading embulk-core.jar
4
5
 
5
6
  require 'logger'
@@ -1,5 +1,48 @@
1
1
  module Embulk
2
+ CORE_VERSION = Java::org.embulk.EmbulkVersion::VERSION
3
+
2
4
  # Converts the original Java-style version string to Ruby-style.
3
5
  # E.g., "0.9.0-SNAPSHOT" (in Java) is converted to "0.9.0.snapshot" in Ruby.
4
- VERSION = ::String.new(Java::org.embulk.EmbulkVersion::VERSION).tr('-', '.').downcase
6
+ CORE_VERSION_IN_RUBY_GEM_STYLE = ::String.new(CORE_VERSION).tr('-', '.').downcase
7
+ private_constant :CORE_VERSION_IN_RUBY_GEM_STYLE
8
+
9
+ begin
10
+ require 'embulk/gem_version'
11
+ rescue LoadError => e
12
+ raise LoadError, "[Internal Error] This embulk.gem is not properly built with embulk/gem_version.rb to declare its own version."
13
+ end
14
+
15
+ begin
16
+ GEM_VERSION = GEM_VERSION_EMBEDDED
17
+ rescue NameError => e
18
+ raise LoadError, "[Internal Error] This embulk.gem does not contain its own version defined properly."
19
+ end
20
+
21
+ if GEM_VERSION != CORE_VERSION_IN_RUBY_GEM_STYLE
22
+ # "embulk/logger" cannot be used because embulk/version.rb is loaded even before embulk/logger.rb.
23
+ STDERR.puts "*******************************************************************************************"
24
+ STDERR.puts "Running Embulk version (#{CORE_VERSION}) does not match the installed embulk.gem version (#{GEM_VERSION})."
25
+ STDERR.puts ""
26
+ STDERR.puts "If you use Embulk v0.9.* without Bundler:"
27
+ STDERR.puts " Uninstall embulk.gem from your Gem path."
28
+ STDERR.puts " An embulk.gem-equivalent should be embedded in your Embulk's core JAR of v0.9.*."
29
+ STDERR.puts ""
30
+ STDERR.puts "If you use Embulk v0.9.* with Bundler:"
31
+ STDERR.puts " Try updating your Gemfile as below:"
32
+ STDERR.puts " gem 'embulk', '< 0.10'"
33
+ STDERR.puts " Bundler will find the embulk.gem-equivalent embedded in your Embulk's core JAR of v0.9.*."
34
+ STDERR.puts ""
35
+ STDERR.puts "If you use Embulk v0.10.*:"
36
+ STDERR.puts " Be aware that v0.10.* is an unstable development series. If you are aware of that,"
37
+ STDERR.puts " upgrade it to the latest v0.10.*, and use exactly the same version of embulk.gem."
38
+ STDERR.puts " In case you use Bundler, your Gemfile should have 'embulk' as below:"
39
+ STDERR.puts " gem 'embulk', '0.10.XX' # Exactly the same version of your Embulk's core JAR."
40
+ STDERR.puts ""
41
+ STDERR.puts "If you use Embulk v0.8.* or earlier:"
42
+ STDERR.puts " Update to the latest v0.9.*. v0.8 or earlier are deprecated."
43
+ STDERR.puts "*******************************************************************************************"
44
+ raise LoadError, "Running Embulk version (#{CORE_VERSION}) does not match the installed embulk.gem version (#{GEM_VERSION})."
45
+ end
46
+
47
+ VERSION = GEM_VERSION
5
48
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.24
4
+ version: 0.10.29
5
5
  platform: java
6
6
  authors:
7
7
  - Sadayuki Furuhashi
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-01-19 00:00:00.000000000 Z
13
+ date: 2021-04-07 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  requirement: !ruby/object:Gem::Requirement
@@ -51,12 +51,8 @@ files:
51
51
  - lib/embulk/file_output_plugin.rb
52
52
  - lib/embulk/filter_plugin.rb
53
53
  - lib/embulk/formatter_plugin.rb
54
- - lib/embulk/guess/bzip2.rb
54
+ - lib/embulk/gem_version.rb
55
55
  - lib/embulk/guess/charset.rb
56
- - lib/embulk/guess/csv.rb
57
- - lib/embulk/guess/csv_all_strings.rb
58
- - lib/embulk/guess/gzip.rb
59
- - lib/embulk/guess/json.rb
60
56
  - lib/embulk/guess/newline.rb
61
57
  - lib/embulk/guess/schema_guess.rb
62
58
  - lib/embulk/guess/time_format_guess.rb
@@ -82,7 +78,7 @@ metadata:
82
78
  changelog_uri: https://github.com/embulk/embulk/releases
83
79
  documentation_uri: https://www.embulk.org/
84
80
  homepage_uri: https://www.embulk.org/
85
- source_code_uri: https://github.com/embulk/embulk/tree/v0.10.24
81
+ source_code_uri: https://github.com/embulk/embulk/tree/v0.10.29
86
82
  post_install_message:
87
83
  rdoc_options: []
88
84
  require_paths:
@@ -1,23 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class Bzip2GuessPlugin < GuessPlugin
5
- Plugin.register_guess('bzip2', self)
6
-
7
- # magic: BZ
8
- # version: 'h' = bzip2
9
- # blocksize: 1 .. 9
10
- # block magic: 0x314159265359 (6 bytes)
11
- block_magic = [0x31, 0x41, 0x59, 0x26, 0x53, 0x59].pack('C*')
12
- BZIP2_HEADER_PATTERN = /BZh[1-9]#{Regexp.quote(block_magic)}/n
13
-
14
- def guess(config, sample_buffer)
15
- if sample_buffer[0,10] =~ BZIP2_HEADER_PATTERN
16
- return {"decoders" => [{"type" => "bzip2"}]}
17
- end
18
- return {}
19
- end
20
- end
21
-
22
- end
23
- end
@@ -1,374 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/schema_guess'
4
-
5
- class CsvGuessPlugin < LineGuessPlugin
6
- Plugin.register_guess('csv', self)
7
-
8
- DELIMITER_CANDIDATES = [
9
- ",", "\t", "|", ";"
10
- ]
11
-
12
- QUOTE_CANDIDATES = [
13
- "\"", "'"
14
- ]
15
-
16
- ESCAPE_CANDIDATES = [
17
- "\\", '"'
18
- ]
19
-
20
- NULL_STRING_CANDIDATES = [
21
- "null",
22
- "NULL",
23
- "#N/A",
24
- "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
- ]
26
-
27
- COMMENT_LINE_MARKER_CANDIDATES = [
28
- "#",
29
- "//",
30
- ]
31
-
32
- MAX_SKIP_LINES = 10
33
- NO_SKIP_DETECT_LINES = 10
34
-
35
- def guess_lines(config, sample_lines)
36
- return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"
37
-
38
- parser_config = config["parser"] || {}
39
- if parser_config["type"] == "csv" && parser_config["delimiter"]
40
- delim = parser_config["delimiter"]
41
- else
42
- delim = guess_delimiter(sample_lines)
43
- unless delim
44
- # assuming single column CSV
45
- delim = DELIMITER_CANDIDATES.first
46
- end
47
- end
48
-
49
- parser_guessed = DataSource.new.merge(parser_config).merge({"type" => "csv", "delimiter" => delim})
50
-
51
- unless parser_guessed.has_key?("quote")
52
- quote = guess_quote(sample_lines, delim)
53
- unless quote
54
- if !guess_force_no_quote(sample_lines, delim, '"')
55
- # assuming CSV follows RFC for quoting
56
- quote = '"'
57
- else
58
- # disable quoting (set null)
59
- end
60
- end
61
- parser_guessed["quote"] = quote
62
- end
63
- parser_guessed["quote"] = '"' if parser_guessed["quote"] == '' # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.
64
-
65
- unless parser_guessed.has_key?("escape")
66
- if quote = parser_guessed["quote"]
67
- escape = guess_escape(sample_lines, delim, quote)
68
- unless escape
69
- if quote == '"'
70
- # assuming this CSV follows RFC for escaping
71
- escape = '"'
72
- else
73
- # disable escaping (set null)
74
- end
75
- end
76
- parser_guessed["escape"] = escape
77
- else
78
- # escape does nothing if quote is disabled
79
- end
80
- end
81
-
82
- unless parser_guessed.has_key?("null_string")
83
- null_string = guess_null_string(sample_lines, delim)
84
- parser_guessed["null_string"] = null_string if null_string
85
- # don't even set null_string to avoid confusion of null and 'null' in YAML format
86
- end
87
-
88
- # guessing skip_header_lines should be before guessing guess_comment_line_marker
89
- # because lines supplied to CsvTokenizer already don't include skipped header lines.
90
- # skipping empty lines is also disabled here because skipping header lines is done by
91
- # CsvParser which doesn't skip empty lines automatically
92
- sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
93
- skip_header_lines = guess_skip_header_lines(sample_records)
94
- sample_lines = sample_lines[skip_header_lines..-1]
95
- sample_records = sample_records[skip_header_lines..-1]
96
-
97
- unless parser_guessed.has_key?("comment_line_marker")
98
- comment_line_marker, sample_lines =
99
- guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
100
- if comment_line_marker
101
- parser_guessed["comment_line_marker"] = comment_line_marker
102
- end
103
- end
104
-
105
- sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})
106
-
107
- # It should fail if CSV parser cannot parse sample_lines.
108
- if sample_records.nil? || sample_records.empty?
109
- return {}
110
- end
111
-
112
- if sample_lines.size == 1
113
- # The file contains only 1 line. Assume that there are no header line.
114
- header_line = false
115
-
116
- column_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
117
-
118
- unless parser_guessed.has_key?("trim_if_not_quoted")
119
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
120
- column_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed)
121
- if column_types != column_types_trimmed
122
- parser_guessed["trim_if_not_quoted"] = true
123
- column_types = column_types_trimmed
124
- else
125
- parser_guessed["trim_if_not_quoted"] = false
126
- end
127
- end
128
- else
129
- # The file contains more than 1 line. If guessed first line's column types are all strings or boolean, and the types are
130
- # different from the other lines, assume that the first line is column names.
131
- first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
132
- other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])
133
-
134
- unless parser_guessed.has_key?("trim_if_not_quoted")
135
- sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
136
- other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
137
- if other_types != other_types_trimmed
138
- parser_guessed["trim_if_not_quoted"] = true
139
- other_types = other_types_trimmed
140
- else
141
- parser_guessed["trim_if_not_quoted"] = false
142
- end
143
- end
144
-
145
- header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) }) || guess_string_header_line(sample_records)
146
- column_types = other_types
147
- end
148
-
149
- if column_types.empty?
150
- # TODO here is making the guessing failed if the file doesn't contain any columns. However,
151
- # this may not be convenient for users.
152
- return {}
153
- end
154
-
155
- if header_line
156
- parser_guessed["skip_header_lines"] = skip_header_lines + 1
157
- else
158
- parser_guessed["skip_header_lines"] = skip_header_lines
159
- end
160
-
161
- parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
162
- parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")
163
-
164
- if header_line
165
- column_names = sample_records.first.map(&:strip)
166
- else
167
- column_names = (0..column_types.size).to_a.map {|i| "c#{i}" }
168
- end
169
- schema = []
170
- column_names.zip(column_types).each do |name,type|
171
- if name && type
172
- schema << new_column(name, type)
173
- end
174
- end
175
- parser_guessed["columns"] = schema
176
-
177
- return {"parser" => parser_guessed}
178
- end
179
-
180
- def new_column(name, type)
181
- if type.is_a?(SchemaGuess::TimestampTypeMatch)
182
- {"name" => name, "type" => type, "format" => type.format}
183
- else
184
- {"name" => name, "type" => type}
185
- end
186
- end
187
-
188
- private
189
-
190
- def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
191
- null_string = parser_config["null_string"]
192
- config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
193
- parser_task = config.load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
194
- data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
195
- sample = Buffer.from_ruby_string(data)
196
- decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
197
- tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
198
- rows = []
199
- while tokenizer.nextFile
200
- while tokenizer.nextRecord(skip_empty_lines)
201
- begin
202
- columns = []
203
- while true
204
- begin
205
- column = tokenizer.nextColumn
206
- quoted = tokenizer.wasQuotedColumn
207
- if null_string && !quoted && column == null_string
208
- column = nil
209
- end
210
- columns << column
211
- rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
212
- rows << columns
213
- break
214
- end
215
- end
216
- rescue org.embulk.standards.CsvTokenizer::InvalidValueException
217
- # TODO warning
218
- tokenizer.skipCurrentLine
219
- end
220
- end
221
- end
222
- return rows
223
- rescue
224
- # TODO warning if fallback to this ad-hoc implementation
225
- sample_lines.map {|line| line.split(delim) }
226
- end
227
-
228
- def guess_delimiter(sample_lines)
229
- delim_weights = DELIMITER_CANDIDATES.map do |d|
230
- counts = sample_lines.map {|line| line.count(d) }
231
- total = array_sum(counts)
232
- if total > 0
233
- stddev = array_standard_deviation(counts)
234
- stddev = 0.000000001 if stddev == 0.0
235
- weight = total / stddev
236
- [d, weight]
237
- else
238
- [nil, 0]
239
- end
240
- end
241
-
242
- delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
243
- if delim != nil && weight > 1
244
- return delim
245
- else
246
- return nil
247
- end
248
- end
249
-
250
- def guess_quote(sample_lines, delim)
251
- delim_regexp = Regexp.escape(delim)
252
- quote_weights = QUOTE_CANDIDATES.map do |q|
253
- weights = sample_lines.map do |line|
254
- q_regexp = Regexp.escape(q)
255
- count = line.count(q)
256
- if count > 0
257
- weight = count
258
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
259
- weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
260
- weight
261
- else
262
- nil
263
- end
264
- end.compact
265
- weights.empty? ? 0 : array_avg(weights)
266
- end
267
- quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
268
- if weight >= 10.0
269
- return quote
270
- else
271
- return nil
272
- end
273
- end
274
-
275
- def guess_force_no_quote(sample_lines, delim, quote_candidate)
276
- delim_regexp = Regexp.escape(delim)
277
- q_regexp = Regexp.escape(quote_candidate)
278
- sample_lines.any? do |line|
279
- # quoting character appear at the middle of a non-quoted value
280
- line =~ /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
281
- end
282
- end
283
-
284
- def guess_escape(sample_lines, delim, quote)
285
- guessed = ESCAPE_CANDIDATES.map do |str|
286
- regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
287
- counts = sample_lines.map {|line| line.scan(regexp).count }
288
- count = counts.inject(0) {|r,c| r + c }
289
- [str, count]
290
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
291
- found = guessed.first
292
- return found ? found[0] : nil
293
- end
294
-
295
- def guess_null_string(sample_lines, delim)
296
- guessed = NULL_STRING_CANDIDATES.map do |str|
297
- regexp = /(?:^|#{Regexp.quote(delim)})#{Regexp.quote(str)}(?:$|#{Regexp.quote(delim)})/
298
- counts = sample_lines.map {|line| line.scan(regexp).count }
299
- count = counts.inject(0) {|r,c| r + c }
300
- [str, count]
301
- end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
302
- found_str, found_count = guessed.first
303
- return found_str ? found_str : nil
304
- end
305
-
306
- def guess_skip_header_lines(sample_records)
307
- counts = sample_records.map {|records| records.size }
308
- (1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
309
- check_row_count = counts[i-1]
310
- if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= check_row_count }
311
- return i - 1
312
- end
313
- end
314
- return 0
315
- end
316
-
317
- def guess_comment_line_marker(sample_lines, delim, quote, null_string)
318
- exclude = []
319
- exclude << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
320
- exclude << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string
321
-
322
- guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
323
- regexp = /^#{Regexp.quote(str)}/
324
- unmatch_lines = sample_lines.reject do |line|
325
- exclude.all? {|ex| line !~ ex } && line =~ regexp
326
- end
327
- match_count = sample_lines.size - unmatch_lines.size
328
- [str, match_count, unmatch_lines]
329
- end.select {|str,match_count,unmatch_lines| match_count > 0 }.sort_by {|str,match_count,unmatch_lines| -match_count }
330
-
331
- str, match_count, unmatch_lines = guessed.first
332
- if str
333
- return str, unmatch_lines
334
- else
335
- return nil, sample_lines
336
- end
337
- end
338
-
339
- def guess_string_header_line(sample_records)
340
- first = sample_records.first
341
- first.count.times do |column_index|
342
- lengths = sample_records.map {|row| row[column_index] }.compact.map {|v| v.to_s.size }
343
- if lengths.size > 1
344
- if array_variance(lengths[1..-1]) <= 0.2
345
- avg = array_avg(lengths[1..-1])
346
- if avg == 0.0 ? lengths[0] > 1 : (avg - lengths[0]).abs / avg > 0.7
347
- return true
348
- end
349
- end
350
- end
351
- end
352
- return false
353
- end
354
-
355
- def array_sum(array)
356
- array.inject(0) {|r,i| r += i }
357
- end
358
-
359
- def array_avg(array)
360
- array.inject(0.0) {|r,i| r += i } / array.size
361
- end
362
-
363
- def array_variance(array)
364
- avg = array_avg(array)
365
- array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
366
- end
367
-
368
- def array_standard_deviation(array)
369
- Math.sqrt(array_variance(array))
370
- end
371
- end
372
-
373
- end
374
- end
@@ -1,13 +0,0 @@
1
- module Embulk
2
- module Guess
3
- require 'embulk/guess/csv'
4
-
5
- class CsvAllStringsGuessPlugin < CsvGuessPlugin
6
- Plugin.register_guess("csv_all_strings", self)
7
-
8
- def new_column(name, type)
9
- {"name" => name, "type" => "string"}
10
- end
11
- end
12
- end
13
- end
@@ -1,18 +0,0 @@
1
- module Embulk
2
- module Guess
3
-
4
- class GzipGuessPlugin < GuessPlugin
5
- Plugin.register_guess('gzip', self)
6
-
7
- GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
8
-
9
- def guess(config, sample_buffer)
10
- if sample_buffer[0,2] == GZIP_HEADER
11
- return {"decoders" => [{"type" => "gzip"}]}
12
- end
13
- return {}
14
- end
15
- end
16
-
17
- end
18
- end
@@ -1,50 +0,0 @@
1
- module Embulk
2
- module Guess
3
- class JsonGuessPlugin < GuessPlugin
4
- Plugin.register_guess('json', self)
5
-
6
- java_import 'com.google.common.collect.Lists'
7
- java_import 'java.io.ByteArrayInputStream'
8
- java_import 'org.embulk.spi.Exec'
9
- java_import 'org.embulk.spi.json.JsonParser'
10
- java_import 'org.embulk.spi.json.JsonParseException'
11
- java_import 'org.embulk.spi.util.FileInputInputStream'
12
- java_import 'org.embulk.spi.util.InputStreamFileInput'
13
-
14
- def guess(config, sample_buffer)
15
- return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"
16
-
17
- # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
18
- json_parser = new_json_parser(sample_buffer)
19
- one_json_parsed = false
20
- begin
21
- while (v = json_parser.next)
22
- # "v" needs to be JSON object type (isMapValue) because:
23
- # 1) Single-column CSV can be mis-guessed as JSON if JSON non-objects are accepted.
24
- # 2) JsonParserPlugin accepts only the JSON object type.
25
- raise JsonParseException.new("v must be JSON object type") unless v.isMapValue
26
- one_json_parsed = true
27
- end
28
- rescue JsonParseException
29
- # the exception is ignored
30
- end
31
-
32
- if one_json_parsed
33
- return {"parser" => {"type" => "json"}} # if JsonParser can parse even one JSON data
34
- else
35
- return {}
36
- end
37
- end
38
-
39
- private
40
-
41
- def new_json_parser(buffer)
42
- input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
43
- iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
44
- input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
45
- input.nextFile
46
- JsonParser.new.open(input)
47
- end
48
- end
49
- end
50
- end