embulk 0.4.10 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/build.gradle +4 -3
  4. data/embulk-core/src/main/java/org/embulk/command/Runner.java +22 -3
  5. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  6. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +57 -31
  7. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +1 -1
  8. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +6 -5
  9. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +14 -10
  10. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +16 -0
  11. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +2 -0
  12. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +2 -1
  13. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +22 -0
  14. data/embulk-docs/plugins/index.html.erb +2 -2
  15. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +3 -3
  16. data/embulk-docs/src/release.rst +1 -0
  17. data/embulk-docs/src/release/release-0.5.0.rst +81 -0
  18. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +13 -1
  19. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +9 -0
  20. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +68 -11
  21. data/lib/embulk/column.rb +31 -8
  22. data/lib/embulk/command/embulk_new_plugin.rb +30 -22
  23. data/lib/embulk/command/embulk_run.rb +16 -3
  24. data/lib/embulk/data/new/README.md.erb +37 -2
  25. data/lib/embulk/data/new/java/input.java.erb +14 -0
  26. data/lib/embulk/data/new/java/output.java.erb +4 -0
  27. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  28. data/lib/embulk/data/new/ruby/input.rb.erb +11 -1
  29. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  30. data/lib/embulk/guess/csv.rb +7 -81
  31. data/lib/embulk/guess/schema_guess.rb +107 -0
  32. data/lib/embulk/guess/time_format_guess.rb +2 -1
  33. data/lib/embulk/guess_plugin.rb +20 -0
  34. data/lib/embulk/input_plugin.rb +10 -0
  35. data/lib/embulk/schema.rb +9 -2
  36. data/lib/embulk/version.rb +1 -1
  37. data/test/guess/test_schema_guess.rb +11 -0
  38. data/test/helper.rb +1 -2
  39. metadata +11 -4
@@ -375,7 +375,8 @@ module Embulk::Guess
375
375
  ]
376
376
 
377
377
  def self.guess(texts)
378
- texts = Array(texts).select {|text| text != "" }
378
+ texts = Array(texts).map {|text| text.to_s }
379
+ texts.reject! {|text| text == "" }
379
380
  matches = texts.map do |text|
380
381
  PATTERNS.map {|pattern| pattern.match(text) }.compact
381
382
  end.flatten
@@ -1,5 +1,7 @@
1
1
  module Embulk
2
2
 
3
+ require 'embulk/guess/schema_guess'
4
+
3
5
  class GuessPlugin
4
6
  def guess(config, sample_buffer)
5
7
  raise NotImplementedError, "GuessPlugin#guess(config, sample_buffer) must be implemented"
@@ -45,6 +47,12 @@ module Embulk
45
47
 
46
48
  class TextGuessPlugin < GuessPlugin
47
49
  def guess(config, sample)
50
+ if config.fetch('parser', {}).fetch('charset', nil).nil?
51
+ require 'embulk/guess/charset'
52
+ charset_guess = Guess::CharsetGuessPlugin.new
53
+ return charset_guess.guess(config, sample)
54
+ end
55
+
48
56
  # TODO pure-ruby LineDecoder implementation?
49
57
  begin
50
58
  parser_task = config.param("parser", :hash, default: {}).load_config(Java::LineDecoder::DecoderTask)
@@ -79,6 +87,18 @@ module Embulk
79
87
 
80
88
  class LineGuessPlugin < GuessPlugin
81
89
  def guess(config, sample)
90
+ if config.fetch('parser', {}).fetch('charset', nil).nil?
91
+ require 'embulk/guess/charset'
92
+ charset_guess = Guess::CharsetGuessPlugin.new
93
+ return charset_guess.guess(config, sample)
94
+ end
95
+
96
+ if config.fetch('parser', {}).fetch('newline', nil).nil?
97
+ require 'embulk/guess/newline'
98
+ newline_guess = Guess::NewlineGuessPlugin.new
99
+ return newline_guess.guess(config, sample)
100
+ end
101
+
82
102
  # TODO pure-ruby LineDecoder implementation?
83
103
  begin
84
104
  parser_task = config.param("parser", :hash, default: {}).load_config(Java::LineDecoder::DecoderTask)
@@ -17,6 +17,10 @@ module Embulk
17
17
  # do nothing by default
18
18
  end
19
19
 
20
+ def self.guess(config)
21
+ raise NotImplementedError, "#{self}.guess(config) is not implemented. This input plugin does not support guess."
22
+ end
23
+
20
24
  def initialize(task, schema, index, page_builder)
21
25
  @task = task
22
26
  @schema = schema
@@ -83,6 +87,12 @@ module Embulk
83
87
  return nil
84
88
  end
85
89
 
90
+ def guess(java_config)
91
+ config = DataSource.from_java(java_config)
92
+ config_diff_hash = @ruby_class.guess(config)
93
+ return DataSource.from_ruby_hash(config_diff_hash).to_java
94
+ end
95
+
86
96
  def run(java_task_source, java_schema, processor_index, java_output)
87
97
  task_source = DataSource.from_java(java_task_source)
88
98
  schema = Schema.from_java(java_schema)
@@ -3,8 +3,15 @@ module Embulk
3
3
  require 'embulk/column'
4
4
 
5
5
  class Schema < Array
6
- def initialize(src)
7
- super
6
+ def initialize(columns)
7
+ columns = columns.map.with_index {|c,index|
8
+ if c.index && c.index != index
9
+ # TODO ignore this error?
10
+ raise "Index of column '#{c.name}' is #{c.index} but it is at column #{index}."
11
+ end
12
+ Column.new(index, c.name, c.type, c.format)
13
+ }
14
+ super(columns)
8
15
 
9
16
  record_reader_script =
10
17
  "lambda do |reader|\n" <<
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.4.10'
2
+ VERSION = '0.5.0'
3
3
  end
@@ -0,0 +1,11 @@
1
+ require 'helper'
2
+ require 'time'
3
+ require 'embulk/guess/schema_guess'
4
+
5
+ class SchemaGuessTest < ::Test::Unit::TestCase
6
+ G = Embulk::Guess::SchemaGuess
7
+
8
+ def test_guess
9
+ G.from_hash_records([{"int" => "1", "str" => "a"}])
10
+ end
11
+ end
@@ -1,6 +1,5 @@
1
1
  require 'test/unit'
2
2
 
3
- module Embulk
4
- end
3
+ require 'embulk'
5
4
 
6
5
  # TODO simplecov
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.10
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-27 00:00:00.000000000 Z
11
+ date: 2015-03-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -129,6 +129,7 @@ files:
129
129
  - embulk-core/src/main/java/org/embulk/exec/ExecutionInterruptedException.java
130
130
  - embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java
131
131
  - embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java
132
+ - embulk-core/src/main/java/org/embulk/exec/ForGuess.java
132
133
  - embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java
133
134
  - embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java
134
135
  - embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java
@@ -270,6 +271,7 @@ files:
270
271
  - embulk-docs/src/release/release-0.4.7.rst
271
272
  - embulk-docs/src/release/release-0.4.8.rst
272
273
  - embulk-docs/src/release/release-0.4.9.rst
274
+ - embulk-docs/src/release/release-0.5.0.rst
273
275
  - embulk-standards/build.gradle
274
276
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
275
277
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -325,12 +327,14 @@ files:
325
327
  - lib/embulk/data/new/java/test.java.erb
326
328
  - lib/embulk/data/new/ruby/Gemfile
327
329
  - lib/embulk/data/new/ruby/Rakefile
330
+ - lib/embulk/data/new/ruby/decoder_guess.rb.erb
328
331
  - lib/embulk/data/new/ruby/filter.rb.erb
329
332
  - lib/embulk/data/new/ruby/formatter.rb.erb
330
333
  - lib/embulk/data/new/ruby/gemspec.erb
331
334
  - lib/embulk/data/new/ruby/input.rb.erb
332
335
  - lib/embulk/data/new/ruby/output.rb.erb
333
336
  - lib/embulk/data/new/ruby/parser.rb.erb
337
+ - lib/embulk/data/new/ruby/parser_guess.rb.erb
334
338
  - lib/embulk/data/package_data.rb
335
339
  - lib/embulk/data_source.rb
336
340
  - lib/embulk/decoder_plugin.rb
@@ -346,6 +350,7 @@ files:
346
350
  - lib/embulk/guess/csv.rb
347
351
  - lib/embulk/guess/gzip.rb
348
352
  - lib/embulk/guess/newline.rb
353
+ - lib/embulk/guess/schema_guess.rb
349
354
  - lib/embulk/guess/time_format_guess.rb
350
355
  - lib/embulk/guess_plugin.rb
351
356
  - lib/embulk/input_plugin.rb
@@ -362,6 +367,7 @@ files:
362
367
  - lib/embulk/schema.rb
363
368
  - lib/embulk/version.rb
364
369
  - settings.gradle
370
+ - test/guess/test_schema_guess.rb
365
371
  - test/guess/test_time_format_guess.rb
366
372
  - test/helper.rb
367
373
  - classpath/annotations-3.0.0.jar
@@ -370,8 +376,8 @@ files:
370
376
  - classpath/bval-jsr303-0.5.jar
371
377
  - classpath/commons-beanutils-core-1.8.3.jar
372
378
  - classpath/commons-lang3-3.1.jar
373
- - classpath/embulk-core-0.4.10.jar
374
- - classpath/embulk-standards-0.4.10.jar
379
+ - classpath/embulk-core-0.5.0.jar
380
+ - classpath/embulk-standards-0.5.0.jar
375
381
  - classpath/guava-18.0.jar
376
382
  - classpath/guice-3.0.jar
377
383
  - classpath/guice-multibindings-3.0.jar
@@ -418,5 +424,6 @@ signing_key:
418
424
  specification_version: 4
419
425
  summary: Embulk, a plugin-based parallel bulk data loader
420
426
  test_files:
427
+ - test/guess/test_schema_guess.rb
421
428
  - test/guess/test_time_format_guess.rb
422
429
  - test/helper.rb