embulk 0.4.10 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/build.gradle +4 -3
  4. data/embulk-core/src/main/java/org/embulk/command/Runner.java +22 -3
  5. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  6. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +57 -31
  7. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +1 -1
  8. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +6 -5
  9. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +14 -10
  10. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +16 -0
  11. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +2 -0
  12. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +2 -1
  13. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +22 -0
  14. data/embulk-docs/plugins/index.html.erb +2 -2
  15. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +3 -3
  16. data/embulk-docs/src/release.rst +1 -0
  17. data/embulk-docs/src/release/release-0.5.0.rst +81 -0
  18. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +13 -1
  19. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +9 -0
  20. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +68 -11
  21. data/lib/embulk/column.rb +31 -8
  22. data/lib/embulk/command/embulk_new_plugin.rb +30 -22
  23. data/lib/embulk/command/embulk_run.rb +16 -3
  24. data/lib/embulk/data/new/README.md.erb +37 -2
  25. data/lib/embulk/data/new/java/input.java.erb +14 -0
  26. data/lib/embulk/data/new/java/output.java.erb +4 -0
  27. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  28. data/lib/embulk/data/new/ruby/input.rb.erb +11 -1
  29. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  30. data/lib/embulk/guess/csv.rb +7 -81
  31. data/lib/embulk/guess/schema_guess.rb +107 -0
  32. data/lib/embulk/guess/time_format_guess.rb +2 -1
  33. data/lib/embulk/guess_plugin.rb +20 -0
  34. data/lib/embulk/input_plugin.rb +10 -0
  35. data/lib/embulk/schema.rb +9 -2
  36. data/lib/embulk/version.rb +1 -1
  37. data/test/guess/test_schema_guess.rb +11 -0
  38. data/test/helper.rb +1 -2
  39. metadata +11 -4
@@ -1,12 +1,37 @@
1
1
  # <%= display_name %> <%= display_category %> plugin for Embulk
2
2
 
3
- TODO: Write short description here
3
+ %case language
4
+ %when :ruby
5
+ TODO: Write a short description here and in the <%= project_name %>.gemspec file.
6
+ %when :java
7
+ TODO: Write a short description here and in the build.gradle file.
8
+ %else
9
+ TODO: Write short description here.
10
+ %end
4
11
 
5
12
  ## Overview
6
13
 
14
+ %case category
15
+ %when :output, :file_output
7
16
  * **Plugin type**: <%= display_category %>
8
- * **Load all or nothing**: yes
17
+ * **Load all or nothing**: no
9
18
  * **Resume supported**: no
19
+ * **Cleanup supported**: yes
20
+ %when :file_input
21
+ * **Plugin type**: <%= display_category %>
22
+ * **Resume supported**: yes
23
+ * **Cleanup supported**: yes
24
+ %when :input
25
+ * **Plugin type**: <%= display_category %>
26
+ * **Resume supported**: yes
27
+ * **Cleanup supported**: yes
28
+ * **Guess supported**: no
29
+ %when :parser, :decoder
30
+ * **Plugin type**: <%= display_category %>
31
+ * **Guess supported**: no
32
+ %else
33
+ * **Plugin type**: <%= display_category %>
34
+ %end
10
35
 
11
36
  ## Configuration
12
37
 
@@ -63,6 +88,16 @@ out:
63
88
  %end
64
89
  ```
65
90
 
91
+ %case category
92
+ %when :parser, :decoder
93
+ (If guess is supported) you don't have to write the `<%= category %>:` section in the configuration file. After writing the `in:` section, you can let embulk guess the `<%= category %>:` section using this command:
94
+
95
+ ```
96
+ $ embulk install <%= project_name %>
97
+ $ embulk guess -g <%= name %> config.yml -o guessed.yml
98
+ ```
99
+ %end
100
+
66
101
  ## Build
67
102
 
68
103
  ```
@@ -32,6 +32,7 @@ public class <%= java_class_name %>
32
32
  public SchemaConfig getColumns();
33
33
  }
34
34
 
35
+ @Override
35
36
  public ConfigDiff transaction(ConfigSource config,
36
37
  InputPlugin.Control control)
37
38
  {
@@ -43,6 +44,7 @@ public class <%= java_class_name %>
43
44
  return resume(task.dump(), schema, taskCount, control);
44
45
  }
45
46
 
47
+ @Override
46
48
  public ConfigDiff resume(TaskSource taskSource,
47
49
  Schema schema, int taskCount,
48
50
  InputPlugin.Control control)
@@ -51,12 +53,14 @@ public class <%= java_class_name %>
51
53
  return Exec.newConfigDiff();
52
54
  }
53
55
 
56
+ @Override
54
57
  public void cleanup(TaskSource taskSource,
55
58
  Schema schema, int taskCount,
56
59
  List<CommitReport> successCommitReports)
57
60
  {
58
61
  }
59
62
 
63
+ @Override
60
64
  public CommitReport run(TaskSource taskSource,
61
65
  Schema schema, int taskIndex,
62
66
  PageOutput output)
@@ -66,4 +70,14 @@ public class <%= java_class_name %>
66
70
  // TODO
67
71
  throw new UnsupportedOperationException("The 'run' method needs to be implemented");
68
72
  }
73
+
74
+ @Override
75
+ public ConfigDiff guess(ConfigSource config)
76
+ {
77
+ // TODO
78
+ throw new UnsupportedOperationException("'<%= name %>' input plugin does not support guessing.");
79
+ //ConfigDiff diff = Exec.newConfigDiff();
80
+ //diff.set("property1", "value");
81
+ //return diff;
82
+ }
69
83
  }
@@ -28,6 +28,7 @@ public class <%= java_class_name %>
28
28
  public int getProperty2();
29
29
  }
30
30
 
31
+ @Override
31
32
  public ConfigDiff transaction(ConfigSource config,
32
33
  Schema schema, int taskCount,
33
34
  OutputPlugin.Control control)
@@ -42,6 +43,7 @@ public class <%= java_class_name %>
42
43
  return Exec.newConfigDiff();
43
44
  }
44
45
 
46
+ @Override
45
47
  public ConfigDiff resume(TaskSource taskSource,
46
48
  Schema schema, int taskCount,
47
49
  OutputPlugin.Control control)
@@ -49,12 +51,14 @@ public class <%= java_class_name %>
49
51
  throw new UnsupportedOperationException("<%= name %> output plugin does not support resuming");
50
52
  }
51
53
 
54
+ @Override
52
55
  public void cleanup(TaskSource taskSource,
53
56
  Schema schema, int taskCount,
54
57
  List<CommitReport> successCommitReports)
55
58
  {
56
59
  }
57
60
 
61
+ @Override
58
62
  public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
59
63
  {
60
64
  PluginTask task = taskSource.loadTask(PluginTask.class);
@@ -0,0 +1,25 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g <%= name.dump %> partial-config.yml
6
+ %if language == :ruby
7
+
8
+ #require <%= "embulk/#{embulk_category}/#{name}.rb".dump %>
9
+ %end
10
+
11
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < GuessPlugin
12
+ # Plugin.register_guess(<%= name.dump %>, self)
13
+ #
14
+ # FOO_BAR_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
15
+ #
16
+ # def guess(config, sample_buffer)
17
+ # if sample_buffer[0,2] == FOO_BAR_HEADER
18
+ # return {"decoders" => [{"type" => <%= name.dump %>}]}
19
+ # end
20
+ # return {}
21
+ # end
22
+ #end
23
+
24
+ end
25
+ end
@@ -14,7 +14,7 @@ module Embulk
14
14
  columns = [
15
15
  Column.new(0, "example", :string),
16
16
  Column.new(1, "column", :long),
17
- Column.new(2, "name", :double),
17
+ Column.new(2, "value", :double),
18
18
  ]
19
19
 
20
20
  resume(task, columns, 1, &control)
@@ -27,6 +27,16 @@ module Embulk
27
27
  return next_config_diff
28
28
  end
29
29
 
30
+ # TODO
31
+ #def self.guess(config)
32
+ # sample_records = [
33
+ # {"example"=>"a", "column"=>1, "value"=>0.1},
34
+ # {"example"=>"a", "column"=>2, "value"=>0.2},
35
+ # ]
36
+ # columns = Guess::SchemaGuess.from_hash_records(sample_records)
37
+ # return {"columns" => columns}
38
+ #end
39
+
30
40
  def init
31
41
  # initialization code:
32
42
  @property1 = task["property1"]
@@ -0,0 +1,65 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g <%= name.dump %> partial-config.yml
6
+ #
7
+ # Depending on the file format the plugin uses, you can choose
8
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
+ # or line guess (LineGuessPlugin).
10
+ %if language == :ruby
11
+
12
+ #require <%= "embulk/#{embulk_category}/#{name}.rb".dump %>
13
+ %end
14
+
15
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < GuessPlugin
16
+ # Plugin.register_guess(<%= name.dump %>, self)
17
+ #
18
+ # def guess(config, sample_buffer)
19
+ # if sample_buffer[0,2] == GZIP_HEADER
20
+ # guessed = {}
21
+ # guessed["type"] = <%= name.dump %>
22
+ # guessed["property1"] = "guessed-value"
23
+ # return {"parser" => guessed}
24
+ # else
25
+ # return {}
26
+ # end
27
+ # end
28
+ #end
29
+
30
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < TextGuessPlugin
31
+ # Plugin.register_guess(<%= name.dump %>, self)
32
+ #
33
+ # def guess_text(config, sample_text)
34
+ # js = JSON.parse(sample_text) rescue nil
35
+ # if js && js["mykeyword"] == "keyword"
36
+ # guessed = {}
37
+ # guessed["type"] = <%= name.dump %>
38
+ # guessed["property1"] = "guessed-value"
39
+ # return {"parser" => guessed}
40
+ # else
41
+ # return {}
42
+ # end
43
+ # end
44
+ #end
45
+
46
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < LineGuessPlugin
47
+ # Plugin.register_guess(<%= name.dump %>, self)
48
+ #
49
+ # def guess_lines(config, sample_lines)
50
+ # all_line_matched = sample_lines.all? do |line|
51
+ # line =~ /mypattern/
52
+ # end
53
+ # if all_line_matched
54
+ # guessed = {}
55
+ # guessed["type"] = <%= name.dump %>
56
+ # guessed["property1"] = "guessed-value"
57
+ # return {"parser" => guessed}
58
+ # else
59
+ # return {}
60
+ # end
61
+ # end
62
+ #end
63
+
64
+ end
65
+ end
@@ -1,6 +1,6 @@
1
1
  module Embulk
2
2
  module Guess
3
- require_relative 'time_format_guess'
3
+ require_relative 'schema_guess'
4
4
 
5
5
  class CsvGuessPlugin < LineGuessPlugin
6
6
  Plugin.register_guess('csv', self)
@@ -24,15 +24,6 @@ module Embulk
24
24
  "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
25
  ]
26
26
 
27
- # CsvParserPlugin.TRUE_STRINGS
28
- TRUE_STRINGS = Hash[*%w[
29
- true True TRUE
30
- yes Yes YES
31
- y Y
32
- on On ON
33
- 1
34
- ].map {|k| [k, true] }]
35
-
36
27
  def guess_lines(config, sample_lines)
37
28
  delim = guess_delimiter(sample_lines)
38
29
  unless delim
@@ -54,8 +45,8 @@ module Embulk
54
45
  # don't even set null_string to avoid confusion of null and 'null' in YAML format
55
46
 
56
47
  sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
57
- first_types = guess_field_types(sample_records[0, 1])
58
- other_types = guess_field_types(sample_records[1..-1])
48
+ first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
49
+ other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
59
50
 
60
51
  if first_types.size <= 1 || other_types.size <= 1
61
52
  # guess failed
@@ -63,7 +54,7 @@ module Embulk
63
54
  end
64
55
 
65
56
  unless parser_config.has_key?("header_line")
66
- parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
57
+ parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != "string" })
67
58
  end
68
59
 
69
60
  unless parser_config.has_key?("columns")
@@ -73,10 +64,10 @@ module Embulk
73
64
  column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
74
65
  end
75
66
  schema = []
76
- column_names.zip(other_types).each do |name,(type,format)|
67
+ column_names.zip(other_types).each do |name,type|
77
68
  if name && type
78
- if format
79
- schema << {"name" => name, "type" => type, "format" => format}
69
+ if type.is_a?(SchemaGuess::TimestampTypeMatch)
70
+ schema << {"name" => name, "type" => type, "format" => type.format}
80
71
  else
81
72
  schema << {"name" => name, "type" => type}
82
73
  end
@@ -163,71 +154,6 @@ module Embulk
163
154
  return found ? found[0] : nil
164
155
  end
165
156
 
166
- def guess_field_types(field_lines)
167
- column_lines = []
168
- field_lines.each do |fields|
169
- fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
170
- end
171
- columns = column_lines.map do |types|
172
- t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
173
- if t.is_a?(TimestampMatch)
174
- format = TimeFormatGuess.guess(types.map {|type| type.text })
175
- ["timestamp", format]
176
- else
177
- [t]
178
- end
179
- end
180
- return columns
181
- end
182
-
183
- TYPE_COALESCE = Hash[{
184
- long: :double,
185
- boolean: :long,
186
- }.map {|k,v|
187
- [[k.to_s, v.to_s].sort, v.to_s]
188
- }]
189
-
190
- def merge_type(type1, type2)
191
- if type1 == type2
192
- type1
193
- elsif type1.nil? || type2.nil?
194
- type1 || type2
195
- else
196
- TYPE_COALESCE[[type1, type2].sort] || "string"
197
- end
198
- end
199
-
200
- class TimestampMatch < String
201
- def initialize(text)
202
- super("timestamp")
203
- @text = text
204
- end
205
- attr_reader :text
206
- end
207
-
208
- def guess_type(str)
209
- if TRUE_STRINGS[str]
210
- return "boolean"
211
- end
212
-
213
- if TimeFormatGuess.guess(str)
214
- return TimestampMatch.new(str)
215
- end
216
-
217
- if str.to_i.to_s == str
218
- return "long"
219
- end
220
-
221
- if str.include?('.')
222
- a, b = str.split(".", 2)
223
- if a.to_i.to_s == a && b.to_i.to_s == b
224
- return "double"
225
- end
226
- end
227
-
228
- return "string"
229
- end
230
-
231
157
  def array_sum(array)
232
158
  array.inject(0) {|r,i| r += i }
233
159
  end
@@ -0,0 +1,107 @@
1
+ module Embulk::Guess
2
+ require 'embulk/column'
3
+ require 'embulk/guess/time_format_guess'
4
+
5
+ module SchemaGuess
6
+ class TimestampTypeMatch < String
7
+ def initialize(format)
8
+ super("timestamp")
9
+ @format = format
10
+ end
11
+
12
+ attr_reader :format
13
+ end
14
+
15
+ class << self
16
+ def from_hash_records(array_of_hash)
17
+ array_of_hash = Array(array_of_hash)
18
+ if array_of_hash.empty?
19
+ raise "SchemaGuess Can't guess schema from no records"
20
+ end
21
+ column_names = array_of_hash.first.keys
22
+ samples = array_of_hash.to_a.map {|hash| column_names.map {|name| hash[name] } }
23
+ from_array_records(column_names, samples)
24
+ end
25
+
26
+ def from_array_records(column_names, samples)
27
+ column_types = types_from_array_records(samples)
28
+ columns = column_types.zip(column_names).map do |(type,name)|
29
+ hash = {name: name, type: type.to_sym}
30
+ hash[:format] = type.format if type.is_a?(TimestampTypeMatch)
31
+ Embulk::Column.new(hash)
32
+ end
33
+ return Embulk::Schema.new(columns)
34
+ end
35
+
36
+ # TODO this method will be private once guess/csv is refactored
37
+ def types_from_array_records(samples)
38
+ columnar_types = []
39
+ samples.each do |record|
40
+ record.each_with_index {|value,i| (columnar_types[i] ||= []) << guess_type(value.to_s) }
41
+ end
42
+ columnar_types.map {|types| merge_types(types) }
43
+ end
44
+
45
+ private
46
+
47
+ def guess_type(str)
48
+ if TRUE_STRINGS[str]
49
+ return "boolean"
50
+ end
51
+
52
+ if TimeFormatGuess.guess(str)
53
+ return TimestampTypeMatch.new(str)
54
+ end
55
+
56
+ if str.to_i.to_s == str
57
+ return "long"
58
+ end
59
+
60
+ if str.include?('.')
61
+ a, b = str.split(".", 2)
62
+ if a.to_i.to_s == a && b.to_i.to_s == b
63
+ return "double"
64
+ end
65
+ end
66
+
67
+ return "string"
68
+ end
69
+
70
+ def merge_types(types)
71
+ t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
72
+ if t.is_a?(TimestampTypeMatch)
73
+ format = TimeFormatGuess.guess(types.map {|type| type.format })
74
+ return TimestampTypeMatch.new(format)
75
+ else
76
+ return t
77
+ end
78
+ end
79
+
80
+ # taken from CsvParserPlugin.TRUE_STRINGS
81
+ TRUE_STRINGS = Hash[*%w[
82
+ true True TRUE
83
+ yes Yes YES
84
+ y Y
85
+ on On ON
86
+ 1
87
+ ].map {|k| [k, true] }]
88
+
89
+ TYPE_COALESCE = Hash[{
90
+ long: :double,
91
+ boolean: :long,
92
+ }.map {|k,v|
93
+ [[k.to_s, v.to_s].sort, v.to_s]
94
+ }]
95
+
96
+ def merge_type(type1, type2)
97
+ if type1 == type2
98
+ type1
99
+ elsif type1.nil? || type2.nil?
100
+ type1 || type2
101
+ else
102
+ TYPE_COALESCE[[type1, type2].sort] || "string"
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end