embulk 0.4.10 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/build.gradle +4 -3
  4. data/embulk-core/src/main/java/org/embulk/command/Runner.java +22 -3
  5. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  6. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +57 -31
  7. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +1 -1
  8. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +6 -5
  9. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +14 -10
  10. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +16 -0
  11. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +2 -0
  12. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +2 -1
  13. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +22 -0
  14. data/embulk-docs/plugins/index.html.erb +2 -2
  15. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +3 -3
  16. data/embulk-docs/src/release.rst +1 -0
  17. data/embulk-docs/src/release/release-0.5.0.rst +81 -0
  18. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +13 -1
  19. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +9 -0
  20. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +68 -11
  21. data/lib/embulk/column.rb +31 -8
  22. data/lib/embulk/command/embulk_new_plugin.rb +30 -22
  23. data/lib/embulk/command/embulk_run.rb +16 -3
  24. data/lib/embulk/data/new/README.md.erb +37 -2
  25. data/lib/embulk/data/new/java/input.java.erb +14 -0
  26. data/lib/embulk/data/new/java/output.java.erb +4 -0
  27. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  28. data/lib/embulk/data/new/ruby/input.rb.erb +11 -1
  29. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  30. data/lib/embulk/guess/csv.rb +7 -81
  31. data/lib/embulk/guess/schema_guess.rb +107 -0
  32. data/lib/embulk/guess/time_format_guess.rb +2 -1
  33. data/lib/embulk/guess_plugin.rb +20 -0
  34. data/lib/embulk/input_plugin.rb +10 -0
  35. data/lib/embulk/schema.rb +9 -2
  36. data/lib/embulk/version.rb +1 -1
  37. data/test/guess/test_schema_guess.rb +11 -0
  38. data/test/helper.rb +1 -2
  39. metadata +11 -4
@@ -1,12 +1,37 @@
1
1
  # <%= display_name %> <%= display_category %> plugin for Embulk
2
2
 
3
- TODO: Write short description here
3
+ %case language
4
+ %when :ruby
5
+ TODO: Write short description here and in <%= project_name %>.gemspec file.
6
+ %when :java
7
+ TODO: Write short description here and in build.gradle file.
8
+ %else
9
+ TODO: Write short description here.
10
+ %end
4
11
 
5
12
  ## Overview
6
13
 
14
+ %case category
15
+ %when :output, :file_output
7
16
  * **Plugin type**: <%= display_category %>
8
- * **Load all or nothing**: yes
17
+ * **Load all or nothing**: no
9
18
  * **Resume supported**: no
19
+ * **Cleanup supported**: yes
20
+ %when :file_input
21
+ * **Plugin type**: <%= display_category %>
22
+ * **Resume supported**: yes
23
+ * **Cleanup supported**: yes
24
+ %when :input
25
+ * **Plugin type**: <%= display_category %>
26
+ * **Resume supported**: yes
27
+ * **Cleanup supported**: yes
28
+ * **Guess supported**: no
29
+ %when :parser, :decoder
30
+ * **Plugin type**: <%= display_category %>
31
+ * **Guess supported**: no
32
+ %else
33
+ * **Plugin type**: <%= display_category %>
34
+ %end
10
35
 
11
36
  ## Configuration
12
37
 
@@ -63,6 +88,16 @@ out:
63
88
  %end
64
89
  ```
65
90
 
91
+ %case category
92
+ %when :parser, :decoder
93
+ (If guess is supported) you don't have to write the `<%= category %>:` section in the configuration file. After writing the `in:` section, you can let embulk guess the `<%= category %>:` section using this command:
94
+
95
+ ```
96
+ $ embulk install <%= project_name %>
97
+ $ embulk guess -g <%= name %> config.yml -o guessed.yml
98
+ ```
99
+ %end
100
+
66
101
  ## Build
67
102
 
68
103
  ```
@@ -32,6 +32,7 @@ public class <%= java_class_name %>
32
32
  public SchemaConfig getColumns();
33
33
  }
34
34
 
35
+ @Override
35
36
  public ConfigDiff transaction(ConfigSource config,
36
37
  InputPlugin.Control control)
37
38
  {
@@ -43,6 +44,7 @@ public class <%= java_class_name %>
43
44
  return resume(task.dump(), schema, taskCount, control);
44
45
  }
45
46
 
47
+ @Override
46
48
  public ConfigDiff resume(TaskSource taskSource,
47
49
  Schema schema, int taskCount,
48
50
  InputPlugin.Control control)
@@ -51,12 +53,14 @@ public class <%= java_class_name %>
51
53
  return Exec.newConfigDiff();
52
54
  }
53
55
 
56
+ @Override
54
57
  public void cleanup(TaskSource taskSource,
55
58
  Schema schema, int taskCount,
56
59
  List<CommitReport> successCommitReports)
57
60
  {
58
61
  }
59
62
 
63
+ @Override
60
64
  public CommitReport run(TaskSource taskSource,
61
65
  Schema schema, int taskIndex,
62
66
  PageOutput output)
@@ -66,4 +70,14 @@ public class <%= java_class_name %>
66
70
  // TODO
67
71
  throw new UnsupportedOperationException("The 'run' method needs to be implemented");
68
72
  }
73
+
74
+ @Override
75
+ public ConfigDiff guess(ConfigSource config)
76
+ {
77
+ // TODO
78
+ throw new UnsupportedOperationException("'<%= name %>' input plugin does not support guessing.");
79
+ //ConfigDiff diff = Exec.newConfigDiff();
80
+ //diff.set("property1", "value");
81
+ //return diff;
82
+ }
69
83
  }
@@ -28,6 +28,7 @@ public class <%= java_class_name %>
28
28
  public int getProperty2();
29
29
  }
30
30
 
31
+ @Override
31
32
  public ConfigDiff transaction(ConfigSource config,
32
33
  Schema schema, int taskCount,
33
34
  OutputPlugin.Control control)
@@ -42,6 +43,7 @@ public class <%= java_class_name %>
42
43
  return Exec.newConfigDiff();
43
44
  }
44
45
 
46
+ @Override
45
47
  public ConfigDiff resume(TaskSource taskSource,
46
48
  Schema schema, int taskCount,
47
49
  OutputPlugin.Control control)
@@ -49,12 +51,14 @@ public class <%= java_class_name %>
49
51
  throw new UnsupportedOperationException("<%= name %> output plugin does not support resuming");
50
52
  }
51
53
 
54
+ @Override
52
55
  public void cleanup(TaskSource taskSource,
53
56
  Schema schema, int taskCount,
54
57
  List<CommitReport> successCommitReports)
55
58
  {
56
59
  }
57
60
 
61
+ @Override
58
62
  public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
59
63
  {
60
64
  PluginTask task = taskSource.loadTask(PluginTask.class);
@@ -0,0 +1,25 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g <%= name.dump %> partial-config.yml
6
+ %if language == :ruby
7
+
8
+ #require <%= "embulk/#{embulk_category}/#{name}.rb".dump %>
9
+ %end
10
+
11
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < GuessPlugin
12
+ # Plugin.register_guess(<%= name.dump %>, self)
13
+ #
14
+ # FOO_BAR_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
15
+ #
16
+ # def guess(config, sample_buffer)
17
+ # if sample_buffer[0,2] == FOO_BAR_HEADER
18
+ # return {"decoders" => [{"type" => <%= name.dump %>}]}
19
+ # end
20
+ # return {}
21
+ # end
22
+ #end
23
+
24
+ end
25
+ end
@@ -14,7 +14,7 @@ module Embulk
14
14
  columns = [
15
15
  Column.new(0, "example", :string),
16
16
  Column.new(1, "column", :long),
17
- Column.new(2, "name", :double),
17
+ Column.new(2, "value", :double),
18
18
  ]
19
19
 
20
20
  resume(task, columns, 1, &control)
@@ -27,6 +27,16 @@ module Embulk
27
27
  return next_config_diff
28
28
  end
29
29
 
30
+ # TODO
31
+ #def self.guess(config)
32
+ # sample_records = [
33
+ # {"example"=>"a", "column"=>1, "value"=>0.1},
34
+ # {"example"=>"a", "column"=>2, "value"=>0.2},
35
+ # ]
36
+ # columns = Guess::SchemaGuess.from_hash_records(sample_records)
37
+ # return {"columns" => columns}
38
+ #end
39
+
30
40
  def init
31
41
  # initialization code:
32
42
  @property1 = task["property1"]
@@ -0,0 +1,65 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g <%= name.dump %> partial-config.yml
6
+ #
7
+ # Depending on the file format the plugin uses, you can choose
8
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
+ # or line guess (LineGuessPlugin).
10
+ %if language == :ruby
11
+
12
+ #require <%= "embulk/#{embulk_category}/#{name}.rb".dump %>
13
+ %end
14
+
15
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < GuessPlugin
16
+ # Plugin.register_guess(<%= name.dump %>, self)
17
+ #
18
+ # def guess(config, sample_buffer)
19
+ # if sample_buffer[0,2] == GZIP_HEADER
20
+ # guessed = {}
21
+ # guessed["type"] = <%= name.dump %>
22
+ # guessed["property1"] = "guessed-value"
23
+ # return {"parser" => guessed}
24
+ # else
25
+ # return {}
26
+ # end
27
+ # end
28
+ #end
29
+
30
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < TextGuessPlugin
31
+ # Plugin.register_guess(<%= name.dump %>, self)
32
+ #
33
+ # def guess_text(config, sample_text)
34
+ # js = JSON.parse(sample_text) rescue nil
35
+ # if js && js["mykeyword"] == "keyword"
36
+ # guessed = {}
37
+ # guessed["type"] = <%= name.dump %>
38
+ # guessed["property1"] = "guessed-value"
39
+ # return {"parser" => guessed}
40
+ # else
41
+ # return {}
42
+ # end
43
+ # end
44
+ #end
45
+
46
+ #class <%= ruby_class_name.sub('Plugin', 'GuessPlugin') %> < LineGuessPlugin
47
+ # Plugin.register_guess(<%= name.dump %>, self)
48
+ #
49
+ # def guess_lines(config, sample_lines)
50
+ # all_line_matched = sample_lines.all? do |line|
51
+ # line =~ /mypattern/
52
+ # end
53
+ # if all_line_matched
54
+ # guessed = {}
55
+ # guessed["type"] = <%= name.dump %>
56
+ # guessed["property1"] = "guessed-value"
57
+ # return {"parser" => guessed}
58
+ # else
59
+ # return {}
60
+ # end
61
+ # end
62
+ #end
63
+
64
+ end
65
+ end
@@ -1,6 +1,6 @@
1
1
  module Embulk
2
2
  module Guess
3
- require_relative 'time_format_guess'
3
+ require_relative 'schema_guess'
4
4
 
5
5
  class CsvGuessPlugin < LineGuessPlugin
6
6
  Plugin.register_guess('csv', self)
@@ -24,15 +24,6 @@ module Embulk
24
24
  "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
25
  ]
26
26
 
27
- # CsvParserPlugin.TRUE_STRINGS
28
- TRUE_STRINGS = Hash[*%w[
29
- true True TRUE
30
- yes Yes YES
31
- y Y
32
- on On ON
33
- 1
34
- ].map {|k| [k, true] }]
35
-
36
27
  def guess_lines(config, sample_lines)
37
28
  delim = guess_delimiter(sample_lines)
38
29
  unless delim
@@ -54,8 +45,8 @@ module Embulk
54
45
  # don't even set null_string to avoid confusion of null and 'null' in YAML format
55
46
 
56
47
  sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
57
- first_types = guess_field_types(sample_records[0, 1])
58
- other_types = guess_field_types(sample_records[1..-1])
48
+ first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
49
+ other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
59
50
 
60
51
  if first_types.size <= 1 || other_types.size <= 1
61
52
  # guess failed
@@ -63,7 +54,7 @@ module Embulk
63
54
  end
64
55
 
65
56
  unless parser_config.has_key?("header_line")
66
- parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
57
+ parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != "string" })
67
58
  end
68
59
 
69
60
  unless parser_config.has_key?("columns")
@@ -73,10 +64,10 @@ module Embulk
73
64
  column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
74
65
  end
75
66
  schema = []
76
- column_names.zip(other_types).each do |name,(type,format)|
67
+ column_names.zip(other_types).each do |name,type|
77
68
  if name && type
78
- if format
79
- schema << {"name" => name, "type" => type, "format" => format}
69
+ if type.is_a?(SchemaGuess::TimestampTypeMatch)
70
+ schema << {"name" => name, "type" => type, "format" => type.format}
80
71
  else
81
72
  schema << {"name" => name, "type" => type}
82
73
  end
@@ -163,71 +154,6 @@ module Embulk
163
154
  return found ? found[0] : nil
164
155
  end
165
156
 
166
- def guess_field_types(field_lines)
167
- column_lines = []
168
- field_lines.each do |fields|
169
- fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
170
- end
171
- columns = column_lines.map do |types|
172
- t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
173
- if t.is_a?(TimestampMatch)
174
- format = TimeFormatGuess.guess(types.map {|type| type.text })
175
- ["timestamp", format]
176
- else
177
- [t]
178
- end
179
- end
180
- return columns
181
- end
182
-
183
- TYPE_COALESCE = Hash[{
184
- long: :double,
185
- boolean: :long,
186
- }.map {|k,v|
187
- [[k.to_s, v.to_s].sort, v.to_s]
188
- }]
189
-
190
- def merge_type(type1, type2)
191
- if type1 == type2
192
- type1
193
- elsif type1.nil? || type2.nil?
194
- type1 || type2
195
- else
196
- TYPE_COALESCE[[type1, type2].sort] || "string"
197
- end
198
- end
199
-
200
- class TimestampMatch < String
201
- def initialize(text)
202
- super("timestamp")
203
- @text = text
204
- end
205
- attr_reader :text
206
- end
207
-
208
- def guess_type(str)
209
- if TRUE_STRINGS[str]
210
- return "boolean"
211
- end
212
-
213
- if TimeFormatGuess.guess(str)
214
- return TimestampMatch.new(str)
215
- end
216
-
217
- if str.to_i.to_s == str
218
- return "long"
219
- end
220
-
221
- if str.include?('.')
222
- a, b = str.split(".", 2)
223
- if a.to_i.to_s == a && b.to_i.to_s == b
224
- return "double"
225
- end
226
- end
227
-
228
- return "string"
229
- end
230
-
231
157
  def array_sum(array)
232
158
  array.inject(0) {|r,i| r += i }
233
159
  end
@@ -0,0 +1,107 @@
1
+ module Embulk::Guess
2
+ require 'embulk/column'
3
+ require 'embulk/guess/time_format_guess'
4
+
5
+ module SchemaGuess
6
+ class TimestampTypeMatch < String
7
+ def initialize(format)
8
+ super("timestamp")
9
+ @format = format
10
+ end
11
+
12
+ attr_reader :format
13
+ end
14
+
15
+ class << self
16
+ def from_hash_records(array_of_hash)
17
+ array_of_hash = Array(array_of_hash)
18
+ if array_of_hash.empty?
19
+ raise "SchemaGuess Can't guess schema from no records"
20
+ end
21
+ column_names = array_of_hash.first.keys
22
+ samples = array_of_hash.to_a.map {|hash| column_names.map {|name| hash[name] } }
23
+ from_array_records(column_names, samples)
24
+ end
25
+
26
+ def from_array_records(column_names, samples)
27
+ column_types = types_from_array_records(samples)
28
+ columns = column_types.zip(column_names).map do |(type,name)|
29
+ hash = {name: name, type: type.to_sym}
30
+ hash[:format] = type.format if type.is_a?(TimestampTypeMatch)
31
+ Embulk::Column.new(hash)
32
+ end
33
+ return Embulk::Schema.new(columns)
34
+ end
35
+
36
+ # TODO this method will be private once guess/csv is refactored
37
+ def types_from_array_records(samples)
38
+ columnar_types = []
39
+ samples.each do |record|
40
+ record.each_with_index {|value,i| (columnar_types[i] ||= []) << guess_type(value.to_s) }
41
+ end
42
+ columnar_types.map {|types| merge_types(types) }
43
+ end
44
+
45
+ private
46
+
47
+ def guess_type(str)
48
+ if TRUE_STRINGS[str]
49
+ return "boolean"
50
+ end
51
+
52
+ if TimeFormatGuess.guess(str)
53
+ return TimestampTypeMatch.new(str)
54
+ end
55
+
56
+ if str.to_i.to_s == str
57
+ return "long"
58
+ end
59
+
60
+ if str.include?('.')
61
+ a, b = str.split(".", 2)
62
+ if a.to_i.to_s == a && b.to_i.to_s == b
63
+ return "double"
64
+ end
65
+ end
66
+
67
+ return "string"
68
+ end
69
+
70
+ def merge_types(types)
71
+ t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
72
+ if t.is_a?(TimestampTypeMatch)
73
+ format = TimeFormatGuess.guess(types.map {|type| type.format })
74
+ return TimestampTypeMatch.new(format)
75
+ else
76
+ return t
77
+ end
78
+ end
79
+
80
+ # taken from CsvParserPlugin.TRUE_STRINGS
81
+ TRUE_STRINGS = Hash[*%w[
82
+ true True TRUE
83
+ yes Yes YES
84
+ y Y
85
+ on On ON
86
+ 1
87
+ ].map {|k| [k, true] }]
88
+
89
+ TYPE_COALESCE = Hash[{
90
+ long: :double,
91
+ boolean: :long,
92
+ }.map {|k,v|
93
+ [[k.to_s, v.to_s].sort, v.to_s]
94
+ }]
95
+
96
+ def merge_type(type1, type2)
97
+ if type1 == type2
98
+ type1
99
+ elsif type1.nil? || type2.nil?
100
+ type1 || type2
101
+ else
102
+ TYPE_COALESCE[[type1, type2].sort] || "string"
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end