embulk 0.4.10 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/build.gradle +4 -3
  4. data/embulk-core/src/main/java/org/embulk/command/Runner.java +22 -3
  5. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  6. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +57 -31
  7. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +1 -1
  8. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +6 -5
  9. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +14 -10
  10. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +16 -0
  11. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +2 -0
  12. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +2 -1
  13. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +22 -0
  14. data/embulk-docs/plugins/index.html.erb +2 -2
  15. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +3 -3
  16. data/embulk-docs/src/release.rst +1 -0
  17. data/embulk-docs/src/release/release-0.5.0.rst +81 -0
  18. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +13 -1
  19. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +9 -0
  20. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +68 -11
  21. data/lib/embulk/column.rb +31 -8
  22. data/lib/embulk/command/embulk_new_plugin.rb +30 -22
  23. data/lib/embulk/command/embulk_run.rb +16 -3
  24. data/lib/embulk/data/new/README.md.erb +37 -2
  25. data/lib/embulk/data/new/java/input.java.erb +14 -0
  26. data/lib/embulk/data/new/java/output.java.erb +4 -0
  27. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  28. data/lib/embulk/data/new/ruby/input.rb.erb +11 -1
  29. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  30. data/lib/embulk/guess/csv.rb +7 -81
  31. data/lib/embulk/guess/schema_guess.rb +107 -0
  32. data/lib/embulk/guess/time_format_guess.rb +2 -1
  33. data/lib/embulk/guess_plugin.rb +20 -0
  34. data/lib/embulk/input_plugin.rb +10 -0
  35. data/lib/embulk/schema.rb +9 -2
  36. data/lib/embulk/version.rb +1 -1
  37. data/test/guess/test_schema_guess.rb +11 -0
  38. data/test/helper.rb +1 -2
  39. metadata +11 -4
@@ -29,7 +29,7 @@
29
29
  <th>Name</th>
30
30
  <th>Author</th>
31
31
  <th>About</th>
32
- <th>Version</th>
32
+ <th>Download</th>
33
33
  </tr>
34
34
  </thead>
35
35
  <tbody>
@@ -54,7 +54,7 @@
54
54
  </td>
55
55
  <% end %>
56
56
  <td><%=h gem[:info] %></td>
57
- <td style="width:3em"><%=h gem[:version] %></td>
57
+ <td style="width:3em"><%=h gem[:downloads] %></td>
58
58
  </tr>
59
59
  <% end %>
60
60
  </tbody>
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
56
56
 
57
57
  .. code-block:: console
58
58
 
59
- $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.10.jar -O /usr/local/bin/embulk
59
+ $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.0.jar -O /usr/local/bin/embulk
60
60
  $ sudo chmod +x /usr/local/bin/embulk
61
61
 
62
62
  Step 2. Install Elasticsearch plugin
@@ -84,7 +84,7 @@ Create this configuration file and save as ``config.yml``:
84
84
  path_prefix: ./mydata/csv/
85
85
  out:
86
86
  type: elasticsearch
87
- index_name: embulk
87
+ index: embulk
88
88
  index_type: embulk
89
89
  nodes:
90
90
  - host: localhost
@@ -121,7 +121,7 @@ The generated config-complete.yml file should include complete information as fo
121
121
  - {name: comment, type: string}
122
122
  out:
123
123
  type: elasticsearch
124
- index_name: embulk
124
+ index: embulk
125
125
  index_type: embulk
126
126
  nodes:
127
127
  - {host: localhost}
@@ -21,4 +21,5 @@ Release Notes
21
21
  release/release-0.4.8
22
22
  release/release-0.4.9
23
23
  release/release-0.4.10
24
+ release/release-0.5.0
24
25
 
@@ -0,0 +1,81 @@
1
+ Release 0.5.0
2
+ ==================================
3
+
4
+ New Guess Plugin Architecture
5
+ -----------------------------
6
+
7
+ Embulk v0.5.0 supports dynamically loadable guess plugins.
8
+
9
+ * For parser and decoder plugins:
10
+
11
+ * CLI subcommand ``guess`` accepts new ``-g, --guess NAMES`` argument to load parser and decoder guess plugins.
12
+
13
+ * Plugin template generator creates stub code of guess plugin for parser and decoder plugins.
14
+
15
+ * For input plugins:
16
+
17
+ * Added ``Embulk::InputPlugin.guess(config)`` and ``spi.InputPlugin#guess(config)`` methods for Ruby and Java plugins.
18
+
19
+ * ``guess`` subcommand executes the new guess method and takes the return value of the method.
20
+
21
+ For example, if you write a parser plugin named ``myparser``, you can use this configuration file first:
22
+
23
+ .. code-block:: yaml
24
+
25
+ in:
26
+ type: file
27
+ path_prefix: path/to/myfiles
28
+ out:
29
+ type: stdout
30
+
31
+ The ``embulk guess`` command with ``-g myparser`` argument calls the guess plugin bundled in the plugin:
32
+
33
+ .. code-block:: console
34
+
35
+ $ embulk gem install embulk-parser-myparser
36
+ $ embulk guess config.yml -o guessed.yml -g myparser
37
+
38
+ On the other hand, if the plugin type is input, you don't need additional command-line arguments. For example, if the input plugin name is ``myinput``, you can use this configuration file:
39
+
40
+ .. code-block:: yaml
41
+
42
+ in:
43
+ type: myinput
44
+ out:
45
+ type: stdout
46
+
47
+ The ``embulk guess`` command finds the ``InputPlugin.guess`` of the input plugin and calls it:
48
+
49
+ .. code-block:: console
50
+
51
+ $ embulk gem install embulk-input-myinput
52
+ $ embulk guess config.yml -o guessed.yml
53
+
54
+ Plugin SPI
55
+ ------------------
56
+
57
+ * Added ``Embulk::InputPlugin.guess(config)`` method for Ruby input plugins.
58
+
59
+ * Backward compatibility: existing plugins don't have to implement the method. The default behavior is to raise ``NotImplementedError``.
60
+
61
+ * Added ``spi.InputPlugin#guess(ConfigSource config)`` method for Java input plugins.
62
+
63
+ * Backward compatibility: existing plugins don't have to implement the method. Method linkage errors are handled in embulk-core. The default behavior is to raise ``UnsupportedOperationException``.
64
+
65
+ Built-in plugins
66
+ ------------------
67
+
68
+ * ``csv`` parser plugin implements ``max_quoted_size_limit`` option. Default value is 131072 (128KB).
69
+
70
+ * This option is useful when a column value includes a quote character accidentally. If the plugin detects those values, it skips the line and continues parsing from the next line.
71
+
72
+ General Changes
73
+ ------------------
74
+
75
+ * ``spi.util.FileInputInputStream#skip`` method never returns -1 to follow the Java API standard (@hata++).
76
+ * Plugin template generator creates appropriate "Overview" section in README.md file depending on the plugin type.
77
+
78
+
79
+ Release Date
80
+ ------------------
81
+ 2015-03-02
@@ -27,7 +27,7 @@ public class CsvTokenizer
27
27
  private final char escape;
28
28
  private final String newline;
29
29
  private final boolean trimIfNotQuoted;
30
- private final long maxQuotedSizeLimit; // TODO not used yet
30
+ private final long maxQuotedSizeLimit;
31
31
  private final LineDecoder input;
32
32
 
33
33
  private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
@@ -274,6 +274,9 @@ public class CsvTokenizer
274
274
  }
275
275
 
276
276
  } else {
277
+ if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
278
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
279
+ }
277
280
  // keep QUOTED_VALUE state
278
281
  }
279
282
  break;
@@ -351,4 +354,13 @@ public class CsvTokenizer
351
354
  {
352
355
  return c == escape;
353
356
  }
357
+
358
+ static class QuotedSizeLimitExceededException
359
+ extends RuntimeException
360
+ {
361
+ QuotedSizeLimitExceededException(String message)
362
+ {
363
+ super(message);
364
+ }
365
+ }
354
366
  }
@@ -4,13 +4,17 @@ import com.google.common.base.Preconditions;
4
4
  import com.google.inject.Binder;
5
5
  import com.google.inject.Module;
6
6
  import com.google.inject.name.Names;
7
+ import com.google.inject.multibindings.Multibinder;
7
8
  import org.embulk.spi.FormatterPlugin;
8
9
  import org.embulk.spi.InputPlugin;
9
10
  import org.embulk.spi.OutputPlugin;
10
11
  import org.embulk.spi.ParserPlugin;
11
12
  import org.embulk.spi.DecoderPlugin;
12
13
  import org.embulk.spi.EncoderPlugin;
14
+ import org.embulk.exec.GuessExecutor;
15
+ import org.embulk.plugin.PluginType;
13
16
  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
17
+ import static org.embulk.exec.GuessExecutor.registerDefaultGuessPluginTo;
14
18
 
15
19
  public class StandardPluginModule
16
20
  implements Module
@@ -39,5 +43,10 @@ public class StandardPluginModule
39
43
 
40
44
  // file encoder plugins
41
45
  registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
46
+
47
+ // default guess plugins
48
+ registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
49
+ registerDefaultGuessPluginTo(binder, new PluginType("csv"));
50
+ // charset and newline guess plugins are loaded and invoked by CsvGuessPlugin
42
51
  }
43
52
  }
@@ -6,6 +6,8 @@ import java.nio.charset.UnsupportedCharsetException;
6
6
  import java.util.ArrayList;
7
7
  import java.util.Arrays;
8
8
  import java.util.List;
9
+ import java.util.Random;
10
+
9
11
  import com.fasterxml.jackson.databind.node.JsonNodeFactory;
10
12
  import com.google.common.collect.ImmutableList;
11
13
  import com.google.common.collect.ImmutableMap;
@@ -13,6 +15,9 @@ import org.junit.Before;
13
15
  import org.junit.Rule;
14
16
  import org.junit.Test;
15
17
  import static org.junit.Assert.assertEquals;
18
+ import static org.junit.Assert.assertTrue;
19
+ import static org.junit.Assert.fail;
20
+
16
21
  import org.embulk.EmbulkTestRuntime;
17
22
  import org.embulk.config.ConfigSource;
18
23
  import org.embulk.spi.Buffer;
@@ -255,21 +260,73 @@ public class TestCsvTokenizer
255
260
  "\"trailing\n3\" ,\"trailing\n4\" "));
256
261
  }
257
262
 
258
- /*
259
- @Test(expected = CsvTokenizer.CsvValueValidateException.class)
260
- public void parseTooLargeSizedValues() throws Exception
263
+ @Test
264
+ public void throwQuotedSizeLimitExceededException() throws Exception
261
265
  {
262
- config.set("max_quoted_column_size", 8L);
266
+ config.set("max_quoted_size_limit", 8);
263
267
  reloadPluginTask();
264
- List<List<String>> parsed = doParse(task, bufferList("utf-8",
265
- "aaa,bbb", "\n", "\"cccccccc\",ddd", "\n"));
266
268
 
267
- assertEquals(Arrays.asList(
268
- Arrays.asList("aaa", "bbb"),
269
- Arrays.asList("ccc", "ddd")),
270
- parsed);
269
+ try {
270
+ parse(task,
271
+ "v1,v2",
272
+ "v3,\"0123456789\"");
273
+ fail();
274
+ } catch (Exception e) {
275
+ assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
276
+ }
277
+
278
+ // multi-line
279
+ try {
280
+ parse(task,
281
+ "v1,v2",
282
+ "\"012345\n6789\",v3");
283
+ fail();
284
+ } catch (Exception e) {
285
+ assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
286
+ }
287
+ }
288
+
289
+ @Test
290
+ public void recoverFromQuotedSizeLimitExceededException() throws Exception
291
+ {
292
+ config.set("max_quoted_size_limit", 12);
293
+ reloadPluginTask();
294
+
295
+ String[] lines = new String[] {
296
+ "v1,v2",
297
+ "v3,\"0123", // this is a broken line and should be skipped
298
+ "v4,v5", // this line should not be skipped
299
+ "v6,v7", // this line should not be skipped
300
+ };
301
+ FileInput input = newFileInputFromLines(task, lines);
302
+ LineDecoder decoder = new LineDecoder(input, task);
303
+ CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
304
+ Schema schema = task.getSchemaConfig().toSchema();
305
+
306
+ tokenizer.nextFile();
307
+
308
+ assertTrue(tokenizer.nextRecord());
309
+ assertEquals("v1", tokenizer.nextColumn());
310
+ assertEquals("v2", tokenizer.nextColumn());
311
+
312
+ assertTrue(tokenizer.nextRecord());
313
+ assertEquals("v3", tokenizer.nextColumn());
314
+ try {
315
+ tokenizer.nextColumn();
316
+ fail();
317
+ } catch (Exception e) {
318
+ assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
319
+ }
320
+ assertEquals("v3,\"0123", tokenizer.skipCurrentLine());
321
+
322
+ assertTrue(tokenizer.nextRecord());
323
+ assertEquals("v4", tokenizer.nextColumn());
324
+ assertEquals("v5", tokenizer.nextColumn());
325
+
326
+ assertTrue(tokenizer.nextRecord());
327
+ assertEquals("v6", tokenizer.nextColumn());
328
+ assertEquals("v7", tokenizer.nextColumn());
271
329
  }
272
- */
273
330
 
274
331
  /*
275
332
  @Test
@@ -1,20 +1,43 @@
1
1
  module Embulk
2
2
 
3
- class Column < Struct.new(:index, :name, :type)
4
- def initialize(index, name, type)
5
- super(index, name, type)
3
+ class Column < Struct.new(:index, :name, :type, :format)
4
+ def initialize(*args)
5
+ if args.length == 1 && args[0].is_a?(Hash)
6
+ # initialize(hash)
7
+ hash = args.first
8
+ super(hash[:index], hash[:name], hash[:type], hash[:format])
9
+ else
10
+ # initialize(index, name, type, format)
11
+ super(*args)
12
+ end
13
+ end
14
+
15
+ def to_json(*args)
16
+ if type == :timestamp && format
17
+ {"index"=>index, "name"=>name, "type"=>type, "format"=>format}.to_json(*args)
18
+ else
19
+ {"index"=>index, "name"=>name, "type"=>type}.to_json(*args)
20
+ end
6
21
  end
7
22
 
8
23
  if Embulk.java?
9
24
  def self.from_java(java_column)
10
- Column.new(
11
- java_column.getIndex,
12
- java_column.getName,
13
- Type.from_java(java_column.getType))
25
+ type = Type.from_java(java_column.getType)
26
+ if type == :timestamp
27
+ format = java_column.getType.getFormat
28
+ else
29
+ format = nil
30
+ end
31
+
32
+ Column.new(java_column.getIndex, java_column.getName, type, format)
14
33
  end
15
34
 
16
35
  def to_java
17
- Java::Column.new(index, name, Type.new_java_type(type))
36
+ if type == :timestamp && format
37
+ Java::Column.new(index, name, Type.new_java_type(type).withFormat(format))
38
+ else
39
+ Java::Column.new(index, name, Type.new_java_type(type))
40
+ end
18
41
  end
19
42
  end
20
43
  end
@@ -9,7 +9,8 @@ module Embulk
9
9
  embulk_category = :output if category == :file_output
10
10
 
11
11
  project_name = "embulk-#{embulk_category}-#{name}"
12
- plugin_path = "lib/embulk/#{embulk_category}/#{name}.rb"
12
+ plugin_dir = "lib/embulk"
13
+ plugin_path = "#{plugin_dir}/#{embulk_category}/#{name}.rb"
13
14
 
14
15
  if File.exist?(project_name)
15
16
  raise "./#{project_name} already exists. Please delete it first."
@@ -31,27 +32,30 @@ module Embulk
31
32
  display_name = name.split('-').map {|a| a.capitalize }.join(' ')
32
33
  display_category = category.to_s.gsub('_', ' ')
33
34
 
34
- description =
35
- case category
36
- when :input
37
- %[Loads records from #{display_name}.]
38
- when :file_input
39
- %[Reads files stored on #{display_name}.]
40
- when :parser
41
- %[Parses #{display_name} files read by other file input plugins.]
42
- when :decoder
43
- %[Decodes #{display_name}-encoded files read by other file input plugins.]
44
- when :output
45
- %[Dumps records to #{display_name}.]
46
- when :file_output
47
- %[Stores files on #{display_name}.]
48
- when :formatter
49
- %[Formats #{display_name} files for other file output plugins.]
50
- when :encoder
51
- %[Encodes files using #{display_name} for other file output plugins.]
52
- when :filter
53
- %[#{display_name}]
54
- end
35
+ extra_guess_erb = {}
36
+
37
+ case category
38
+ when :input
39
+ description = %[Loads records from #{display_name}.]
40
+ when :file_input
41
+ description = %[Reads files stored on #{display_name}.]
42
+ when :parser
43
+ description = %[Parses #{display_name} files read by other file input plugins.]
44
+ extra_guess_erb["ruby/parser_guess.rb.erb"] = "#{plugin_dir}/guess/#{name}.rb"
45
+ when :decoder
46
+ description = %[Decodes #{display_name}-encoded files read by other file input plugins.]
47
+ extra_guess_erb["ruby/decoder_guess.rb.erb"] = "#{plugin_dir}/guess/#{name}.rb"
48
+ when :output
49
+ description = %[Dumps records to #{display_name}.]
50
+ when :file_output
51
+ description = %[Stores files on #{display_name}.]
52
+ when :formatter
53
+ description = %[Formats #{display_name} files for other file output plugins.]
54
+ when :encoder
55
+ description = %[Encodes files using #{display_name} for other file output plugins.]
56
+ when :filter
57
+ description = %[#{display_name}]
58
+ end
55
59
 
56
60
  pkg = Embulk::PackageData.new("new", project_name, binding())
57
61
 
@@ -78,6 +82,10 @@ module Embulk
78
82
  pkg.cp_erb("java/test.java.erb", "src/test/java/org/embulk/#{embulk_category}/Test#{java_class_name}.java")
79
83
  end
80
84
 
85
+ extra_guess_erb.each_pair do |erb,dest|
86
+ pkg.cp_erb(erb, dest)
87
+ end
88
+
81
89
  success = true
82
90
  puts ""
83
91
  ensure
@@ -34,7 +34,7 @@ module Embulk
34
34
  op = OptionParser.new
35
35
  op.version = Embulk::VERSION
36
36
 
37
- puts "#{Time.now.strftime("%Y-%m-%d %H:%M:%S,%3N %z")}: Embulk v#{Embulk::VERSION}"
37
+ puts "#{Time.now.strftime("%Y-%m-%d %H:%M:%S.%3N %z")}: Embulk v#{Embulk::VERSION}"
38
38
 
39
39
  load_paths = []
40
40
  classpaths = []
@@ -121,6 +121,9 @@ module Embulk
121
121
  op.on('-C', '--classpath PATH', "Add java classpath separated by #{classpath_separator} (CLASSPATH)") do |classpath|
122
122
  classpaths.concat classpath.split(classpath_separator)
123
123
  end
124
+ op.on('-g', '--guess NAMES', "Comma-separated list of guess plugin names") do |names|
125
+ (options[:guessPlugins] ||= []).concat names.split(",")
126
+ end
124
127
  args = 1..1
125
128
 
126
129
  when :new
@@ -161,7 +164,7 @@ examples:
161
164
  args = 0..1
162
165
 
163
166
  when :exec
164
- exec *argv
167
+ exec(*argv)
165
168
  exit 127
166
169
 
167
170
  else
@@ -292,7 +295,17 @@ examples:
292
295
  setup_load_paths(load_paths)
293
296
  setup_classpaths(classpaths)
294
297
 
295
- org.embulk.command.Runner.new(options.to_json).main(subcmd, argv.to_java(:string))
298
+ begin
299
+ org.embulk.command.Runner.new(options.to_json).main(subcmd, argv.to_java(:string))
300
+ rescue => ex
301
+ puts ex.to_s
302
+ ex.backtrace.each do |bt|
303
+ puts " #{bt}"
304
+ end
305
+ puts ""
306
+ puts "Error: #{ex}"
307
+ raise SystemExit.new(1, ex.to_s)
308
+ end
296
309
  end
297
310
  end
298
311