embulk 0.4.10 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/build.gradle +4 -3
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +22 -3
- data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
- data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +57 -31
- data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +6 -5
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +14 -10
- data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +2 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +2 -1
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +22 -0
- data/embulk-docs/plugins/index.html.erb +2 -2
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +3 -3
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.5.0.rst +81 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +13 -1
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +9 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +68 -11
- data/lib/embulk/column.rb +31 -8
- data/lib/embulk/command/embulk_new_plugin.rb +30 -22
- data/lib/embulk/command/embulk_run.rb +16 -3
- data/lib/embulk/data/new/README.md.erb +37 -2
- data/lib/embulk/data/new/java/input.java.erb +14 -0
- data/lib/embulk/data/new/java/output.java.erb +4 -0
- data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
- data/lib/embulk/data/new/ruby/input.rb.erb +11 -1
- data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
- data/lib/embulk/guess/csv.rb +7 -81
- data/lib/embulk/guess/schema_guess.rb +107 -0
- data/lib/embulk/guess/time_format_guess.rb +2 -1
- data/lib/embulk/guess_plugin.rb +20 -0
- data/lib/embulk/input_plugin.rb +10 -0
- data/lib/embulk/schema.rb +9 -2
- data/lib/embulk/version.rb +1 -1
- data/test/guess/test_schema_guess.rb +11 -0
- data/test/helper.rb +1 -2
- metadata +11 -4
@@ -375,7 +375,8 @@ module Embulk::Guess
|
|
375
375
|
]
|
376
376
|
|
377
377
|
def self.guess(texts)
|
378
|
-
texts = Array(texts).
|
378
|
+
texts = Array(texts).map {|text| text.to_s }
|
379
|
+
texts.reject! {|text| text == "" }
|
379
380
|
matches = texts.map do |text|
|
380
381
|
PATTERNS.map {|pattern| pattern.match(text) }.compact
|
381
382
|
end.flatten
|
data/lib/embulk/guess_plugin.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Embulk
|
2
2
|
|
3
|
+
require 'embulk/guess/schema_guess'
|
4
|
+
|
3
5
|
class GuessPlugin
|
4
6
|
def guess(config, sample_buffer)
|
5
7
|
raise NotImplementedError, "GuessPlugin#guess(config, sample_buffer) must be implemented"
|
@@ -45,6 +47,12 @@ module Embulk
|
|
45
47
|
|
46
48
|
class TextGuessPlugin < GuessPlugin
|
47
49
|
def guess(config, sample)
|
50
|
+
if config.fetch('parser', {}).fetch('charset', nil).nil?
|
51
|
+
require 'embulk/guess/charset'
|
52
|
+
charset_guess = Guess::CharsetGuessPlugin.new
|
53
|
+
return charset_guess.guess(config, sample)
|
54
|
+
end
|
55
|
+
|
48
56
|
# TODO pure-ruby LineDecoder implementation?
|
49
57
|
begin
|
50
58
|
parser_task = config.param("parser", :hash, default: {}).load_config(Java::LineDecoder::DecoderTask)
|
@@ -79,6 +87,18 @@ module Embulk
|
|
79
87
|
|
80
88
|
class LineGuessPlugin < GuessPlugin
|
81
89
|
def guess(config, sample)
|
90
|
+
if config.fetch('parser', {}).fetch('charset', nil).nil?
|
91
|
+
require 'embulk/guess/charset'
|
92
|
+
charset_guess = Guess::CharsetGuessPlugin.new
|
93
|
+
return charset_guess.guess(config, sample)
|
94
|
+
end
|
95
|
+
|
96
|
+
if config.fetch('parser', {}).fetch('newline', nil).nil?
|
97
|
+
require 'embulk/guess/newline'
|
98
|
+
newline_guess = Guess::NewlineGuessPlugin.new
|
99
|
+
return newline_guess.guess(config, sample)
|
100
|
+
end
|
101
|
+
|
82
102
|
# TODO pure-ruby LineDecoder implementation?
|
83
103
|
begin
|
84
104
|
parser_task = config.param("parser", :hash, default: {}).load_config(Java::LineDecoder::DecoderTask)
|
data/lib/embulk/input_plugin.rb
CHANGED
@@ -17,6 +17,10 @@ module Embulk
|
|
17
17
|
# do nothing by default
|
18
18
|
end
|
19
19
|
|
20
|
+
def self.guess(config)
|
21
|
+
raise NotImplementedError, "#{self}.guess(config) is not implemented. This input plugin does not support guess."
|
22
|
+
end
|
23
|
+
|
20
24
|
def initialize(task, schema, index, page_builder)
|
21
25
|
@task = task
|
22
26
|
@schema = schema
|
@@ -83,6 +87,12 @@ module Embulk
|
|
83
87
|
return nil
|
84
88
|
end
|
85
89
|
|
90
|
+
def guess(java_config)
|
91
|
+
config = DataSource.from_java(java_config)
|
92
|
+
config_diff_hash = @ruby_class.guess(config)
|
93
|
+
return DataSource.from_ruby_hash(config_diff_hash).to_java
|
94
|
+
end
|
95
|
+
|
86
96
|
def run(java_task_source, java_schema, processor_index, java_output)
|
87
97
|
task_source = DataSource.from_java(java_task_source)
|
88
98
|
schema = Schema.from_java(java_schema)
|
data/lib/embulk/schema.rb
CHANGED
@@ -3,8 +3,15 @@ module Embulk
|
|
3
3
|
require 'embulk/column'
|
4
4
|
|
5
5
|
class Schema < Array
|
6
|
-
def initialize(
|
7
|
-
|
6
|
+
def initialize(columns)
|
7
|
+
columns = columns.map.with_index {|c,index|
|
8
|
+
if c.index && c.index != index
|
9
|
+
# TODO ignore this error?
|
10
|
+
raise "Index of column '#{c.name}' is #{c.index} but it is at column #{index}."
|
11
|
+
end
|
12
|
+
Column.new(index, c.name, c.type, c.format)
|
13
|
+
}
|
14
|
+
super(columns)
|
8
15
|
|
9
16
|
record_reader_script =
|
10
17
|
"lambda do |reader|\n" <<
|
data/lib/embulk/version.rb
CHANGED
data/test/helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.10
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -129,6 +129,7 @@ files:
|
|
129
129
|
- embulk-core/src/main/java/org/embulk/exec/ExecutionInterruptedException.java
|
130
130
|
- embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java
|
131
131
|
- embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java
|
132
|
+
- embulk-core/src/main/java/org/embulk/exec/ForGuess.java
|
132
133
|
- embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java
|
133
134
|
- embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java
|
134
135
|
- embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java
|
@@ -270,6 +271,7 @@ files:
|
|
270
271
|
- embulk-docs/src/release/release-0.4.7.rst
|
271
272
|
- embulk-docs/src/release/release-0.4.8.rst
|
272
273
|
- embulk-docs/src/release/release-0.4.9.rst
|
274
|
+
- embulk-docs/src/release/release-0.5.0.rst
|
273
275
|
- embulk-standards/build.gradle
|
274
276
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
275
277
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -325,12 +327,14 @@ files:
|
|
325
327
|
- lib/embulk/data/new/java/test.java.erb
|
326
328
|
- lib/embulk/data/new/ruby/Gemfile
|
327
329
|
- lib/embulk/data/new/ruby/Rakefile
|
330
|
+
- lib/embulk/data/new/ruby/decoder_guess.rb.erb
|
328
331
|
- lib/embulk/data/new/ruby/filter.rb.erb
|
329
332
|
- lib/embulk/data/new/ruby/formatter.rb.erb
|
330
333
|
- lib/embulk/data/new/ruby/gemspec.erb
|
331
334
|
- lib/embulk/data/new/ruby/input.rb.erb
|
332
335
|
- lib/embulk/data/new/ruby/output.rb.erb
|
333
336
|
- lib/embulk/data/new/ruby/parser.rb.erb
|
337
|
+
- lib/embulk/data/new/ruby/parser_guess.rb.erb
|
334
338
|
- lib/embulk/data/package_data.rb
|
335
339
|
- lib/embulk/data_source.rb
|
336
340
|
- lib/embulk/decoder_plugin.rb
|
@@ -346,6 +350,7 @@ files:
|
|
346
350
|
- lib/embulk/guess/csv.rb
|
347
351
|
- lib/embulk/guess/gzip.rb
|
348
352
|
- lib/embulk/guess/newline.rb
|
353
|
+
- lib/embulk/guess/schema_guess.rb
|
349
354
|
- lib/embulk/guess/time_format_guess.rb
|
350
355
|
- lib/embulk/guess_plugin.rb
|
351
356
|
- lib/embulk/input_plugin.rb
|
@@ -362,6 +367,7 @@ files:
|
|
362
367
|
- lib/embulk/schema.rb
|
363
368
|
- lib/embulk/version.rb
|
364
369
|
- settings.gradle
|
370
|
+
- test/guess/test_schema_guess.rb
|
365
371
|
- test/guess/test_time_format_guess.rb
|
366
372
|
- test/helper.rb
|
367
373
|
- classpath/annotations-3.0.0.jar
|
@@ -370,8 +376,8 @@ files:
|
|
370
376
|
- classpath/bval-jsr303-0.5.jar
|
371
377
|
- classpath/commons-beanutils-core-1.8.3.jar
|
372
378
|
- classpath/commons-lang3-3.1.jar
|
373
|
-
- classpath/embulk-core-0.4.10.jar
|
374
|
-
- classpath/embulk-standards-0.4.10.jar
|
379
|
+
- classpath/embulk-core-0.5.0.jar
|
380
|
+
- classpath/embulk-standards-0.5.0.jar
|
375
381
|
- classpath/guava-18.0.jar
|
376
382
|
- classpath/guice-3.0.jar
|
377
383
|
- classpath/guice-multibindings-3.0.jar
|
@@ -418,5 +424,6 @@ signing_key:
|
|
418
424
|
specification_version: 4
|
419
425
|
summary: Embulk, a plugin-based parallel bulk data loader
|
420
426
|
test_files:
|
427
|
+
- test/guess/test_schema_guess.rb
|
421
428
|
- test/guess/test_time_format_guess.rb
|
422
429
|
- test/helper.rb
|