embulk 0.5.5 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/build.gradle +1 -1
  4. data/embulk-core/src/main/java/org/embulk/command/Runner.java +7 -7
  5. data/embulk-core/src/main/java/org/embulk/exec/BulkLoader.java +664 -0
  6. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +5 -0
  7. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutorPlugin.java +130 -0
  8. data/embulk-core/src/main/java/org/embulk/exec/LocalThreadExecutor.java +34 -0
  9. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +3 -3
  10. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +1 -1
  11. data/embulk-core/src/main/java/org/embulk/exec/ResumeState.java +7 -6
  12. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +3 -0
  13. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +35 -3
  14. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +4 -1
  15. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +1 -1
  16. data/embulk-core/src/main/java/org/embulk/spi/ExecutorPlugin.java +19 -0
  17. data/embulk-core/src/main/java/org/embulk/spi/Page.java +6 -0
  18. data/embulk-core/src/main/java/org/embulk/spi/PluginClassLoader.java +73 -1
  19. data/embulk-core/src/main/java/org/embulk/spi/ProcessState.java +10 -0
  20. data/embulk-core/src/main/java/org/embulk/spi/ProcessTask.java +118 -0
  21. data/embulk-core/src/main/java/org/embulk/spi/TaskState.java +70 -0
  22. data/embulk-core/src/main/java/org/embulk/spi/util/Executors.java +92 -0
  23. data/embulk-core/src/main/java/org/embulk/spi/util/Filters.java +17 -3
  24. data/embulk-core/src/test/java/org/embulk/spi/TestBuffer.java +24 -0
  25. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +1 -1
  26. data/embulk-docs/src/release.rst +1 -0
  27. data/embulk-docs/src/release/release-0.6.0.rst +34 -0
  28. data/lib/embulk/executor_plugin.rb +23 -0
  29. data/lib/embulk/java_plugin.rb +5 -0
  30. data/lib/embulk/plugin.rb +13 -2
  31. data/lib/embulk/version.rb +1 -1
  32. metadata +15 -5
  33. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +0 -660
@@ -0,0 +1,10 @@
1
+ package org.embulk.spi;
2
+
3
+ public interface ProcessState
4
+ {
5
+ public void initialize(int inputTaskCount, int outputTaskCount);
6
+
7
+ public TaskState getInputTaskState(int inputTaskIndex);
8
+
9
+ public TaskState getOutputTaskState(int outputTaskIndex);
10
+ }
@@ -0,0 +1,118 @@
1
+ package org.embulk.spi;
2
+
3
+ import java.util.List;
4
+ import com.fasterxml.jackson.annotation.JsonCreator;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.fasterxml.jackson.annotation.JsonIgnore;
7
+ import org.embulk.plugin.PluginType;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.Schema;
10
+ import org.embulk.spi.util.Executors;
11
+
12
+ public class ProcessTask
13
+ {
14
+ private final PluginType inputPluginType;
15
+ private final PluginType outputPluginType;
16
+ private final List<PluginType> filterPluginTypes;
17
+ private final TaskSource inputTaskSource;
18
+ private final TaskSource outputTaskSource;
19
+ private final List<TaskSource> filterTaskSources;
20
+ private final List<Schema> schemas;
21
+ private final Schema executorSchema;
22
+ private TaskSource executorTaskSource;
23
+
24
+ @JsonCreator
25
+ public ProcessTask(
26
+ @JsonProperty("inputType") PluginType inputPluginType,
27
+ @JsonProperty("outputType") PluginType outputPluginType,
28
+ @JsonProperty("filterTypes") List<PluginType> filterPluginTypes,
29
+ @JsonProperty("inputTask") TaskSource inputTaskSource,
30
+ @JsonProperty("outputTask") TaskSource outputTaskSource,
31
+ @JsonProperty("filterTasks") List<TaskSource> filterTaskSources,
32
+ @JsonProperty("schemas") List<Schema> schemas,
33
+ @JsonProperty("executorSchema") Schema executorSchema,
34
+ @JsonProperty("executorTask") TaskSource executorTaskSource)
35
+ {
36
+ this.inputPluginType = inputPluginType;
37
+ this.outputPluginType = outputPluginType;
38
+ this.filterPluginTypes = filterPluginTypes;
39
+ this.inputTaskSource = inputTaskSource;
40
+ this.outputTaskSource = outputTaskSource;
41
+ this.filterTaskSources = filterTaskSources;
42
+ this.schemas = schemas;
43
+ this.executorSchema = executorSchema;
44
+ this.executorTaskSource = executorTaskSource;
45
+ }
46
+
47
+ @JsonProperty("inputType")
48
+ public PluginType getInputPluginType()
49
+ {
50
+ return inputPluginType;
51
+ }
52
+
53
+ @JsonProperty("outputType")
54
+ public PluginType getOutputPluginType()
55
+ {
56
+ return outputPluginType;
57
+ }
58
+
59
+ @JsonProperty("filterTypes")
60
+ public List<PluginType> getFilterPluginTypes()
61
+ {
62
+ return filterPluginTypes;
63
+ }
64
+
65
+ @JsonProperty("inputTask")
66
+ public TaskSource getInputTaskSource()
67
+ {
68
+ return inputTaskSource;
69
+ }
70
+
71
+ @JsonProperty("outputTask")
72
+ public TaskSource getOutputTaskSource()
73
+ {
74
+ return outputTaskSource;
75
+ }
76
+
77
+ @JsonProperty("filterTasks")
78
+ public List<TaskSource> getFilterTaskSources()
79
+ {
80
+ return filterTaskSources;
81
+ }
82
+
83
+ @JsonProperty("schemas")
84
+ public List<Schema> getFilterSchemas()
85
+ {
86
+ return schemas;
87
+ }
88
+
89
+ @JsonProperty("executorSchema")
90
+ public Schema getExecutorSchema()
91
+ {
92
+ return executorSchema;
93
+ }
94
+
95
+ @JsonIgnore
96
+ public Schema getInputSchema()
97
+ {
98
+ return Executors.getInputSchema(schemas);
99
+ }
100
+
101
+ @JsonIgnore
102
+ public Schema getOutputSchema()
103
+ {
104
+ return Executors.getOutputSchema(schemas);
105
+ }
106
+
107
+ @JsonIgnore
108
+ public void setExecutorTaskSource(TaskSource executorTaskSource)
109
+ {
110
+ this.executorTaskSource = executorTaskSource;
111
+ }
112
+
113
+ @JsonProperty("executorTask")
114
+ public TaskSource getExecutorTaskSource()
115
+ {
116
+ return executorTaskSource;
117
+ }
118
+ }
@@ -0,0 +1,70 @@
1
+ package org.embulk.spi;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.config.CommitReport;
5
+
6
+ public class TaskState
7
+ {
8
+ private volatile boolean started = false;
9
+ private volatile boolean finished = false;
10
+ private volatile Optional<CommitReport> commitReport = Optional.absent();
11
+ private volatile Optional<Throwable> exception = Optional.absent();
12
+
13
+ public void start()
14
+ {
15
+ this.started = true;
16
+ }
17
+
18
+ public void finish()
19
+ {
20
+ this.started = true;
21
+ this.finished = true;
22
+ }
23
+
24
+ public void setCommitReport(CommitReport commitReport)
25
+ {
26
+ this.started = true;
27
+ this.commitReport = Optional.of(commitReport);
28
+ }
29
+
30
+ public void setException(Throwable exception)
31
+ {
32
+ this.started = true;
33
+ if (exception == null) {
34
+ this.exception = Optional.absent();
35
+ } else {
36
+ this.exception = Optional.of(exception);
37
+ }
38
+ }
39
+
40
+ public void resetException()
41
+ {
42
+ this.started = true;
43
+ this.exception = Optional.absent();
44
+ }
45
+
46
+ public boolean isStarted()
47
+ {
48
+ return started;
49
+ }
50
+
51
+ public boolean isFinished()
52
+ {
53
+ return finished;
54
+ }
55
+
56
+ public boolean isCommitted()
57
+ {
58
+ return commitReport.isPresent();
59
+ }
60
+
61
+ public Optional<CommitReport> getCommitReport()
62
+ {
63
+ return commitReport;
64
+ }
65
+
66
+ public Optional<Throwable> getException()
67
+ {
68
+ return exception;
69
+ }
70
+ }
@@ -0,0 +1,92 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.util.List;
4
+ import org.embulk.config.TaskSource;
5
+ import org.embulk.config.CommitReport;
6
+ import org.embulk.spi.ExecSession;
7
+ import org.embulk.spi.ProcessState;
8
+ import org.embulk.spi.Schema;
9
+ import org.embulk.spi.TransactionalPageOutput;
10
+ import org.embulk.spi.PageOutput;
11
+ import org.embulk.spi.InputPlugin;
12
+ import org.embulk.spi.FilterPlugin;
13
+ import org.embulk.spi.OutputPlugin;
14
+ import org.embulk.spi.ProcessTask;
15
+
16
+ public abstract class Executors
17
+ {
18
+ private Executors() { }
19
+
20
+ public interface ProcessStateCallback
21
+ {
22
+ public void started();
23
+
24
+ public void inputCommitted(CommitReport report);
25
+
26
+ public void outputCommitted(CommitReport report);
27
+ }
28
+
29
+ public static void process(ExecSession exec,
30
+ ProcessTask task, int taskIndex,
31
+ ProcessStateCallback callback)
32
+ {
33
+ InputPlugin inputPlugin = exec.newPlugin(InputPlugin.class, task.getInputPluginType());
34
+ List<FilterPlugin> filterPlugins = Filters.newFilterPlugins(exec, task.getFilterPluginTypes());
35
+ OutputPlugin outputPlugin = exec.newPlugin(OutputPlugin.class, task.getOutputPluginType());
36
+
37
+ // TODO assert task.getExecutorSchema().equals task.getOutputSchema()
38
+
39
+ process(exec, taskIndex,
40
+ inputPlugin, task.getInputSchema(), task.getInputTaskSource(),
41
+ filterPlugins, task.getFilterSchemas(), task.getFilterTaskSources(),
42
+ outputPlugin, task.getOutputSchema(), task.getOutputTaskSource(),
43
+ callback);
44
+ }
45
+
46
+ public static void process(ExecSession exec, int taskIndex,
47
+ InputPlugin inputPlugin, Schema inputSchema, TaskSource inputTaskSource,
48
+ List<FilterPlugin> filterPlugins, List<Schema> filterSchemas, List<TaskSource> filterTaskSources,
49
+ OutputPlugin outputPlugin, Schema outputSchema, TaskSource outputTaskSource,
50
+ ProcessStateCallback callback)
51
+ {
52
+ TransactionalPageOutput tran = outputPlugin.open(outputTaskSource, outputSchema, taskIndex);
53
+
54
+ PageOutput closeThis = tran;
55
+ callback.started();
56
+ try {
57
+ PageOutput filtered = closeThis = Filters.open(filterPlugins, filterTaskSources, filterSchemas, tran);
58
+
59
+ CommitReport inputCommitReport = inputPlugin.run(inputTaskSource, inputSchema, taskIndex, filtered);
60
+ if (inputCommitReport == null) {
61
+ inputCommitReport = exec.newCommitReport();
62
+ }
63
+ callback.inputCommitted(inputCommitReport);
64
+
65
+ CommitReport outputCommitReport = tran.commit();
66
+ tran = null;
67
+ if (outputCommitReport == null) {
68
+ outputCommitReport = exec.newCommitReport();
69
+ }
70
+ callback.outputCommitted(outputCommitReport); // TODO check output.finish() is called. wrap or abstract
71
+
72
+ } finally {
73
+ try {
74
+ if (tran != null) {
75
+ tran.abort();
76
+ }
77
+ } finally {
78
+ closeThis.close();
79
+ }
80
+ }
81
+ }
82
+
83
+ public static Schema getInputSchema(List<Schema> schemas)
84
+ {
85
+ return schemas.get(0);
86
+ }
87
+
88
+ public static Schema getOutputSchema(List<Schema> schemas)
89
+ {
90
+ return schemas.get(schemas.size() - 1);
91
+ }
92
+ }
@@ -16,11 +16,25 @@ public abstract class Filters
16
16
  {
17
17
  private Filters() { }
18
18
 
19
- public static List<FilterPlugin> newFilterPlugins(ExecSession exec, List<ConfigSource> configs)
19
+ public static List<PluginType> getPluginTypes(List<ConfigSource> configs)
20
20
  {
21
- ImmutableList.Builder<FilterPlugin> builder = ImmutableList.builder();
21
+ ImmutableList.Builder<PluginType> builder = ImmutableList.builder();
22
22
  for (ConfigSource config : configs) {
23
- builder.add(exec.newPlugin(FilterPlugin.class, config.get(PluginType.class, "type")));
23
+ builder.add(config.get(PluginType.class, "type"));
24
+ }
25
+ return builder.build();
26
+ }
27
+
28
+ public static List<FilterPlugin> newFilterPluginsFromConfigSources(ExecSession exec, List<ConfigSource> configs)
29
+ {
30
+ return newFilterPlugins(exec, getPluginTypes(configs));
31
+ }
32
+
33
+ public static List<FilterPlugin> newFilterPlugins(ExecSession exec, List<PluginType> pluginTypes)
34
+ {
35
+ ImmutableList.Builder<FilterPlugin> builder = ImmutableList.builder();
36
+ for (PluginType pluginType : pluginTypes) {
37
+ builder.add(exec.newPlugin(FilterPlugin.class, pluginType));
24
38
  }
25
39
  return builder.build();
26
40
  }
@@ -0,0 +1,24 @@
1
+ package org.embulk.spi;
2
+
3
+ import static org.junit.Assert.assertEquals;
4
+ import static org.junit.Assert.assertTrue;
5
+ import static org.junit.Assert.assertFalse;
6
+ import org.junit.Test;
7
+
8
+ public class TestBuffer
9
+ {
10
+ @Test
11
+ public void testEquals() throws Exception
12
+ {
13
+ byte[] bytes = new byte[] { 1, 2, 3, 2, 3 };
14
+ Buffer b1 = Buffer.wrap(bytes, 0, 2); // [1, 2]
15
+ Buffer b2 = Buffer.wrap(bytes, 1, 2); // [2, 3]
16
+ Buffer b3 = Buffer.wrap(bytes, 3, 2); // [2, 3]
17
+
18
+ assertFalse(b1.equals(b2));
19
+ assertTrue(b2.equals(b3));
20
+
21
+ assertFalse(b1.hashCode() == b2.hashCode());
22
+ assertTrue(b2.hashCode() == b3.hashCode());
23
+ }
24
+ }
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
56
56
 
57
57
  .. code-block:: console
58
58
 
59
- $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.5.jar -O /usr/local/bin/embulk
59
+ $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.6.0.jar -O /usr/local/bin/embulk
60
60
  $ sudo chmod +x /usr/local/bin/embulk
61
61
 
62
62
  Step 2. Install Elasticsearch plugin
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.6.0
7
8
  release/release-0.5.5
8
9
  release/release-0.5.4
9
10
  release/release-0.5.3
@@ -0,0 +1,34 @@
1
+ Release 0.6.0
2
+ ==================================
3
+
4
+ Executor Plugin Mechanism
5
+ -------------------------
6
+
7
+ The executor of Embulk is now fully extensible using plugins. Executor plugins get input, filter and output plugins from the Embulk framework and run them using multiple threads, processes, or servers. While input, filter and output plugins are responsible for data processing, executor plugins are responsible for scheduling the processing tasks and managing parallelism for performance.
8
+
9
+ The built-in executor plugin is ``LocalExecutorPlugin``, which runs tasks using multiple threads. It has a shared thread pool and schedules at most ``(number of available CPU cores) * 2`` tasks in parallel. The number of threads is configurable using the ``max_threads`` system parameter.
10
+
11
+ Another available executor is the `embulk-executor-mapreduce <https://github.com/embulk/embulk-executor-mapreduce>`_ plugin. This executor plugin runs tasks on Hadoop, a distributed computing environment. It is suitable for processing terabytes of data. A unique feature is that it supports partitioning data by a certain column before passing it to output plugins. For example, the MapReduce executor can partition data by time so that files on the destination storage are partitioned by day.
12
+
13
+ Plugin API
14
+ ------------------
15
+
16
+ * The ``exec.LocalExecutor`` class is split into the ``exec.BulkLoader`` class for the interface definition and ``exec.LocalExecutorPlugin`` for the implementation. If your application uses Embulk through the ``LocalExecutor`` class, you need to replace it with ``BulkLoader``.
17
+ * ``spi.ExecAction#run`` can throw ``Exception`` and ``Exec.doWith(ExecSession, ExecAction<T>)`` throws ``ExecutionException``.
18
+ * ``spi.Buffer`` implements the ``#equals`` and ``#hashCode`` methods.
19
+
20
+ Plugin SPI
21
+ ------------------
22
+
23
+ * Added ``spi.ExecutorPlugin`` interface.
24
+ * Added ``Embulk::ExecutorPlugin`` class.
25
+
26
+ General Changes
27
+ ------------------
28
+
29
+ * If there are no input tasks, the transaction is now committed successfully rather than failing.
30
+
31
+
32
+ Release Date
33
+ ------------------
34
+ 2015-04-07
@@ -0,0 +1,23 @@
1
+ module Embulk
2
+
3
+ require 'embulk/data_source'
4
+
5
+ class ExecutorPlugin
6
+ # TODO
7
+
8
+ if Embulk.java?
9
+ # TODO new_java
10
+
11
+ def self.from_java(java_class)
12
+ JavaPlugin.ruby_adapter_class(java_class, ExecutorPlugin, RubyAdapter)
13
+ end
14
+
15
+ module RubyAdapter
16
+ module ClassMethods
17
+ end
18
+ # TODO
19
+ end
20
+ end
21
+ end
22
+
23
+ end
@@ -49,6 +49,11 @@ module Embulk
49
49
  Plugin.register_java_guess(name, java_class)
50
50
  end
51
51
 
52
+ def self.register_executor(name, class_fqdn, jar_dir)
53
+ java_class = classloader(jar_dir).loadClass(class_fqdn)
54
+ Plugin.register_java_executor(name, java_class)
55
+ end
56
+
52
57
  def self.ruby_adapter_class(java_class, ruby_base_class, ruby_module)
53
58
  Class.new(ruby_base_class) do
54
59
  const_set(:JAVA_CLASS, java_class)
data/lib/embulk/plugin.rb CHANGED
@@ -13,12 +13,13 @@ module Embulk
13
13
  require 'embulk/decoder_plugin'
14
14
  require 'embulk/encoder_plugin'
15
15
  require 'embulk/guess_plugin'
16
+ require 'embulk/executor_plugin'
16
17
  require 'embulk/java_plugin' if Embulk.java?
17
18
 
18
19
  class PluginManager
19
20
  def initialize
20
21
  @registries = {}
21
- %w[input output parser formatter decoder encoder line_filter filter guess].each do |category|
22
+ %w[input output parser formatter decoder encoder line_filter filter guess executor].each do |category|
22
23
  @registries[category.to_sym] = PluginRegistry.new(category, "embulk/#{category}/")
23
24
  end
24
25
  end
@@ -144,6 +145,11 @@ module Embulk
144
145
  "org.embulk.spi.GuessPlugin" => GuessPlugin)
145
146
  end
146
147
 
148
+ def register_java_executor(type, klass)
149
+ register_java_plugin(:executor, type, klass,
150
+ "org.embulk.spi.ExecutorPlugin" => ExecutorPlugin)
151
+ end
152
+
147
153
  def new_java_input(type)
148
154
  lookup(:input, type).new_java
149
155
  end
@@ -176,6 +182,10 @@ module Embulk
176
182
  lookup(:guess, type).new_java
177
183
  end
178
184
 
185
+ def new_java_executor(type)
186
+ lookup(:executor, type).new_java
187
+ end
188
+
179
189
  private
180
190
 
181
191
  # TODO lookup should fallback to Java PluginSource
@@ -222,7 +232,8 @@ module Embulk
222
232
  :register_formatter, :get_formatter, :register_java_formatter, :new_java_formatter,
223
233
  :register_decoder, :get_decoder, :register_java_decoder, :new_java_decoder,
224
234
  :register_encoder, :get_encoder, :register_java_encoder, :new_java_encoder,
225
- :register_guess, :get_guess, :register_java_guess, :new_java_guess
235
+ :register_guess, :get_guess, :register_java_guess, :new_java_guess,
236
+ :register_executor, :get_executor, :register_java_executor, :new_java_executor
226
237
  end
227
238
  end
228
239
  end