embulk 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +8 -8
  2. data/ChangeLog +12 -0
  3. data/README.md +38 -13
  4. data/build.gradle +6 -1
  5. data/embulk-cli/pom.xml +1 -1
  6. data/embulk-core/pom.xml +1 -1
  7. data/embulk-core/src/main/java/org/embulk/command/Runner.java +87 -8
  8. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +1 -1
  9. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +16 -3
  10. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +1 -1
  11. data/embulk-core/src/main/java/org/embulk/exec/ExecutionInterruptedException.java +10 -0
  12. data/embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java +26 -0
  13. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +37 -1
  14. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +461 -110
  15. data/embulk-core/src/main/java/org/embulk/exec/PartialExecutionException.java +18 -0
  16. data/embulk-core/src/main/java/org/embulk/exec/ResumeState.java +82 -0
  17. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +3 -3
  18. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +35 -4
  19. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +14 -3
  20. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +55 -24
  21. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +8 -0
  22. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +57 -24
  23. data/embulk-core/src/main/java/org/embulk/spi/FilterPlugin.java +21 -0
  24. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +14 -3
  25. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +8 -0
  26. data/embulk-core/src/main/java/org/embulk/spi/util/Filters.java +87 -0
  27. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +4 -2
  28. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +16 -0
  29. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +15 -0
  30. data/embulk-standards/pom.xml +1 -1
  31. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +16 -2
  32. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +14 -1
  33. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +14 -1
  34. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +15 -3
  35. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +15 -1
  36. data/lib/embulk/command/embulk_run.rb +16 -1
  37. data/lib/embulk/data/bundle/embulk/filter_example.rb +42 -0
  38. data/lib/embulk/data/bundle/embulk/input_example.rb +43 -33
  39. data/lib/embulk/data/bundle/embulk/output_example.rb +43 -36
  40. data/lib/embulk/filter_plugin.rb +86 -0
  41. data/lib/embulk/input_plugin.rb +37 -2
  42. data/lib/embulk/java/imports.rb +1 -0
  43. data/lib/embulk/output_plugin.rb +30 -0
  44. data/lib/embulk/plugin.rb +32 -19
  45. data/lib/embulk/schema.rb +16 -9
  46. data/lib/embulk/version.rb +1 -1
  47. data/pom.xml +1 -1
  48. metadata +13 -7
  49. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +0 -10
  50. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +0 -19
@@ -17,5 +17,13 @@ public interface OutputPlugin
17
17
  Schema schema, int processorCount,
18
18
  OutputPlugin.Control control);
19
19
 
20
+ public NextConfig resume(TaskSource taskSource,
21
+ Schema schema, int processorCount,
22
+ OutputPlugin.Control control);
23
+
24
+ public void cleanup(TaskSource taskSource,
25
+ Schema schema, int processorCount,
26
+ List<CommitReport> successCommitReports);
27
+
20
28
  public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int processorIndex);
21
29
  }
@@ -0,0 +1,87 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.util.List;
4
+ import com.google.common.collect.ImmutableList;
5
+ import org.embulk.config.TaskSource;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.CommitReport;
8
+ import org.embulk.config.NextConfig;
9
+ import org.embulk.plugin.PluginType;
10
+ import org.embulk.spi.ExecSession;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.PageOutput;
13
+ import org.embulk.spi.FilterPlugin;
14
+
15
+ public abstract class Filters
16
+ {
17
+ private Filters() { }
18
+
19
+ public static List<FilterPlugin> newFilterPlugins(ExecSession exec, List<ConfigSource> configs)
20
+ {
21
+ ImmutableList.Builder<FilterPlugin> builder = ImmutableList.builder();
22
+ for (ConfigSource config : configs) {
23
+ builder.add(exec.newPlugin(FilterPlugin.class, config.get(PluginType.class, "type")));
24
+ }
25
+ return builder.build();
26
+ }
27
+
28
+ public interface Control
29
+ {
30
+ public void run(List<TaskSource> taskSources, List<Schema> filterSchemas);
31
+ }
32
+
33
+ public static void transaction(List<FilterPlugin> plugins, List<ConfigSource> configs,
34
+ Schema inputSchema, Filters.Control control)
35
+ {
36
+ new RecursiveControl(plugins, configs, control).transaction(inputSchema);
37
+ }
38
+
39
+ public static PageOutput open(List<FilterPlugin> plugins, List<TaskSource> taskSources,
40
+ List<Schema> filterSchemas, PageOutput output)
41
+ {
42
+ PageOutput out = output;
43
+ int pos = 0;
44
+ while (pos < plugins.size()) {
45
+ out = plugins.get(pos).open(taskSources.get(pos), filterSchemas.get(pos), filterSchemas.get(pos + 1), out);
46
+ pos++;
47
+ }
48
+ return out;
49
+ }
50
+
51
+ private static class RecursiveControl
52
+ {
53
+ private final List<FilterPlugin> plugins;
54
+ private final List<ConfigSource> configs;
55
+ private final Filters.Control finalControl;
56
+ private final ImmutableList.Builder<TaskSource> taskSources;
57
+ private final ImmutableList.Builder<Schema> filterSchemas;
58
+ private int pos;
59
+
60
+ RecursiveControl(List<FilterPlugin> plugins, List<ConfigSource> configs,
61
+ Filters.Control finalControl)
62
+ {
63
+ this.plugins = plugins;
64
+ this.configs = configs;
65
+ this.finalControl = finalControl;
66
+ this.taskSources = ImmutableList.builder();
67
+ this.filterSchemas = ImmutableList.builder();
68
+ }
69
+
70
+ public void transaction(Schema inputSchema)
71
+ {
72
+ filterSchemas.add(inputSchema);
73
+ if (pos < plugins.size()) {
74
+ plugins.get(pos).transaction(configs.get(pos), inputSchema, new FilterPlugin.Control() {
75
+ public void run(TaskSource taskSource, Schema outputSchema)
76
+ {
77
+ taskSources.add(taskSource);
78
+ pos++;
79
+ transaction(outputSchema);
80
+ }
81
+ });
82
+ } else {
83
+ finalControl.run(taskSources.build(), filterSchemas.build());
84
+ }
85
+ }
86
+ }
87
+ }
@@ -3,6 +3,7 @@ package org.embulk;
3
3
  import java.util.Random;
4
4
  import org.junit.runner.Description;
5
5
  import org.junit.runners.model.Statement;
6
+ import com.google.inject.Injector;
6
7
  import com.google.inject.Binder;
7
8
  import com.google.inject.Module;
8
9
  import org.embulk.config.ConfigSource;
@@ -49,8 +50,9 @@ public class EmbulkTestRuntime
49
50
  public EmbulkTestRuntime()
50
51
  {
51
52
  super(new TestRuntimeModule());
52
- ConfigSource execConfig = new DataSourceImpl(null);
53
- this.exec = new ExecSession(getInjector(), execConfig);
53
+ Injector injector = getInjector();
54
+ ConfigSource execConfig = new DataSourceImpl(injector.getInstance(ModelManager.class));
55
+ this.exec = new ExecSession(injector, execConfig);
54
56
  }
55
57
 
56
58
  public ExecSession getExec()
@@ -43,6 +43,7 @@ public class TestFileInputRunner
43
43
  this.buffers = buffers;
44
44
  }
45
45
 
46
+ @Override
46
47
  public NextConfig transaction(ConfigSource config,
47
48
  FileInputPlugin.Control control)
48
49
  {
@@ -50,6 +51,21 @@ public class TestFileInputRunner
50
51
  return null;
51
52
  }
52
53
 
54
+ @Override
55
+ public NextConfig resume(TaskSource taskSource,
56
+ int processorCount,
57
+ FileInputPlugin.Control control)
58
+ {
59
+ throw new UnsupportedOperationException();
60
+ }
61
+
62
+ @Override
63
+ public void cleanup(TaskSource taskSource,
64
+ int processorCount,
65
+ List<CommitReport> successCommitReports)
66
+ {
67
+ }
68
+
53
69
  public TransactionalFileInput open(TaskSource taskSource,
54
70
  int processorIndex)
55
71
  {
@@ -41,6 +41,21 @@ public class TestFileOutputRunner
41
41
  return Exec.newNextConfig();
42
42
  }
43
43
 
44
+ @Override
45
+ public NextConfig resume(TaskSource taskSource,
46
+ int processorCount,
47
+ FileOutputPlugin.Control control)
48
+ {
49
+ throw new UnsupportedOperationException();
50
+ }
51
+
52
+ @Override
53
+ public void cleanup(TaskSource taskSource,
54
+ int processorCount,
55
+ List<CommitReport> successCommitReports)
56
+ {
57
+ }
58
+
44
59
  @Override
45
60
  public TransactionalFileOutput open(TaskSource taskSource,
46
61
  final int processorIndex)
@@ -5,7 +5,7 @@
5
5
  <parent>
6
6
  <groupId>org.embulk</groupId>
7
7
  <artifactId>embulk-parent</artifactId>
8
- <version>0.2.1-SNAPSHOT</version>
8
+ <version>0.3.0-SNAPSHOT</version>
9
9
  </parent>
10
10
 
11
11
  <artifactId>embulk-standards</artifactId>
@@ -56,12 +56,26 @@ public class LocalFileInputPlugin
56
56
  // list files recursively
57
57
  task.setFiles(listFiles(task));
58
58
 
59
- // run with threads. number of processors is same with number of files
60
- control.run(task.dump(), task.getFiles().size());
59
+ // number of processors is same with number of files
60
+ int processorCount = task.getFiles().size();
61
+ return resume(task.dump(), processorCount, control);
62
+ }
61
63
 
64
+ @Override
65
+ public NextConfig resume(TaskSource taskSource,
66
+ int processorCount,
67
+ FileInputPlugin.Control control)
68
+ {
69
+ control.run(taskSource, processorCount);
62
70
  return Exec.newNextConfig();
63
71
  }
64
72
 
73
+ @Override
74
+ public void cleanup(TaskSource taskSource,
75
+ int processorCount,
76
+ List<CommitReport> successCommitReports)
77
+ { }
78
+
65
79
  public List<String> listFiles(PluginTask task)
66
80
  {
67
81
  final ImmutableList.Builder<String> builder = ImmutableList.builder();
@@ -47,11 +47,24 @@ public class LocalFileOutputPlugin
47
47
  {
48
48
  PluginTask task = config.loadConfig(PluginTask.class);
49
49
 
50
- control.run(task.dump());
50
+ return resume(task.dump(), processorCount, control);
51
+ }
51
52
 
53
+ @Override
54
+ public NextConfig resume(TaskSource taskSource,
55
+ int processorCount,
56
+ FileOutputPlugin.Control control)
57
+ {
58
+ control.run(taskSource);
52
59
  return Exec.newNextConfig();
53
60
  }
54
61
 
62
+ @Override
63
+ public void cleanup(TaskSource taskSource,
64
+ int processorCount,
65
+ List<CommitReport> successCommitReports)
66
+ { }
67
+
55
68
  @Override
56
69
  public TransactionalFileOutput open(TaskSource taskSource, final int processorIndex)
57
70
  {
@@ -1,5 +1,6 @@
1
1
  package org.embulk.standards;
2
2
 
3
+ import java.util.List;
3
4
  import org.embulk.config.ConfigSource;
4
5
  import org.embulk.config.TaskSource;
5
6
  import org.embulk.config.NextConfig;
@@ -18,10 +19,22 @@ public class NullOutputPlugin
18
19
  Schema schema, int processorCount,
19
20
  OutputPlugin.Control control)
20
21
  {
21
- control.run(Exec.newTaskSource());
22
+ return resume(Exec.newTaskSource(), schema, processorCount, control);
23
+ }
24
+
25
+ public NextConfig resume(TaskSource taskSource,
26
+ Schema schema, int processorCount,
27
+ OutputPlugin.Control control)
28
+ {
29
+ control.run(taskSource);
22
30
  return Exec.newNextConfig();
23
31
  }
24
32
 
33
+ public void cleanup(TaskSource taskSource,
34
+ Schema schema, int processorCount,
35
+ List<CommitReport> successCommitReports)
36
+ { }
37
+
25
38
  @Override
26
39
  public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int processorIndex)
27
40
  {
@@ -77,13 +77,25 @@ public class S3FileInputPlugin
77
77
  task.setFiles(listFiles(task));
78
78
 
79
79
  // number of processors is same with number of files
80
+ int processorCount = task.getFiles().size();
81
+ return resume(task.dump(), processorCount, control);
82
+ }
80
83
 
81
- // run
82
- control.run(task.dump(), task.getFiles().size());
83
-
84
+ @Override
85
+ public NextConfig resume(TaskSource taskSource,
86
+ int processorCount,
87
+ FileInputPlugin.Control control)
88
+ {
89
+ control.run(taskSource, processorCount);
84
90
  return Exec.newNextConfig();
85
91
  }
86
92
 
93
+ @Override
94
+ public void cleanup(TaskSource taskSource,
95
+ int processorCount,
96
+ List<CommitReport> successCommitReports)
97
+ { }
98
+
87
99
  public static AWSCredentialsProvider getCredentialsProvider(PluginTask task)
88
100
  {
89
101
  final AWSCredentials cred = new BasicAWSCredentials(
@@ -1,5 +1,6 @@
1
1
  package org.embulk.standards;
2
2
 
3
+ import java.util.List;
3
4
  import org.embulk.config.ConfigSource;
4
5
  import org.embulk.config.TaskSource;
5
6
  import org.embulk.config.NextConfig;
@@ -30,10 +31,23 @@ public class StdoutOutputPlugin
30
31
  OutputPlugin.Control control)
31
32
  {
32
33
  final PluginTask task = config.loadConfig(PluginTask.class);
33
- control.run(task.dump());
34
+ return resume(task.dump(), schema, processorCount, control);
35
+ }
36
+
37
+ @Override
38
+ public NextConfig resume(TaskSource taskSource,
39
+ Schema schema, int processorCount,
40
+ OutputPlugin.Control control)
41
+ {
42
+ control.run(taskSource);
34
43
  return Exec.newNextConfig();
35
44
  }
36
45
 
46
+ public void cleanup(TaskSource taskSource,
47
+ Schema schema, int processorCount,
48
+ List<CommitReport> successCommitReports)
49
+ { }
50
+
37
51
  @Override
38
52
  public TransactionalPageOutput open(TaskSource taskSource, final Schema schema,
39
53
  int processorIndex)
@@ -51,6 +51,21 @@ module Embulk
51
51
  op.on('-o', '--output PATH', 'Path to a file to write the next configuration') do |path|
52
52
  options[:nextConfigOutputPath] = path
53
53
  end
54
+ op.on('-r', '--resume-state PATH', 'Path to a file to write or read resume state') do |path|
55
+ options[:resumeStatePath] = path
56
+ end
57
+ args = 1..1
58
+
59
+ when :cleanup
60
+ op.banner = "Usage: run <config.yml>"
61
+ op.on('-b', '--bundle BUNDLE_DIR', 'Path to a Gemfile directory') do |path|
62
+ end
63
+ op.on('-I', '--load-path PATH', 'Add ruby script directory path or jar file path') do |load_path|
64
+ load_paths << load_path
65
+ end
66
+ op.on('-r', '--resume-state PATH', 'Path to a file to write or read resume state') do |path|
67
+ options[:resumeStatePath] = path
68
+ end
54
69
  args = 1..1
55
70
 
56
71
  when :preview
@@ -118,7 +133,7 @@ module Embulk
118
133
  if __FILE__ =~ /^classpath:/ || __FILE__.include?('!/')
119
134
  # data is in embulk-core jar
120
135
  resource_class = org.embulk.command.Runner.java_class
121
- %w[.bundle/config embulk/input_example.rb embulk/output_example.rb Gemfile Gemfile.lock].each do |file| # TODO get file list from the jar
136
+ %w[.bundle/config embulk/input_example.rb embulk/filter_example.rb embulk/output_example.rb Gemfile Gemfile.lock].each do |file| # TODO get file list from the jar
122
137
  url = resource_class.resource("/embulk/data/bundle/#{file}").to_s
123
138
  dst = File.join(path, file)
124
139
  FileUtils.mkdir_p File.dirname(dst)
@@ -0,0 +1,42 @@
1
module Embulk
  module Plugin

    # Example filter plugin: appends one string column to every record.
    # The column is named by the 'key' option and filled with the 'value' option.
    class FilterExample < FilterPlugin
      # filter plugin file name must be: embulk/filter_<name>.rb
      Plugin.register_filter('example', self)

      # Builds the task hash and the output columns (input columns plus one
      # appended string column), then yields both to the given block.
      def self.transaction(config, in_schema, &control)
        key = config.param('key', :string, default: "filter_key")
        value = config.param('value', :string, default: "filter_value")
        task = { 'key' => key, 'value' => value }

        # the new column takes the next free index after the input columns
        appended = Column.new(in_schema.size, key, :string)
        out_columns = in_schema + [appended]

        puts "Example filter started."
        yield(task, out_columns)
        puts "Example filter finished."
      end

      # Keeps the configured value that #add appends to each record.
      def initialize(task, in_schema, out_schema, page_builder)
        super
        @value = task['value']
      end

      # Nothing to release.
      def close
      end

      # Forwards each record of the page with the configured value appended.
      def add(page)
        page.each { |row| @page_builder.add(row + [@value]) }
      end

      # Finishes the downstream page builder.
      def finish
        @page_builder.finish
      end
    end

  end
end
@@ -1,44 +1,54 @@
1
1
  module Embulk
2
+ module Plugin
3
+
4
+ class InputExample < InputPlugin
5
+ # input plugin file name must be: embulk/input_<name>.rb
6
+ Plugin.register_input('example', self)
7
+
8
+ def self.transaction(config, &control)
9
+ files = ['file1', 'file2']
10
+ task = {
11
+ 'files' => files,
12
+ 'hostname' => config.param('hostname', :string, default: nil)
13
+ }
14
+
15
+ columns = [
16
+ Column.new(0, 'file', :string),
17
+ Column.new(1, 'hostname', :string),
18
+ Column.new(2, 'col0', :long),
19
+ Column.new(3, 'col1', :double),
20
+ ]
21
+
22
+ resume(task, columns, files.length, &control)
23
+ end
2
24
 
3
- class InputExample < InputPlugin
4
- # input plugin file name must be: embulk/input_<name>.rb
5
- Plugin.register_input('example', self)
6
-
7
- def self.transaction(config, &control)
8
- task = {
9
- 'message' => config.param('message', :string, default: nil)
10
- }
11
- threads = config.param('threads', :integer, default: 2)
12
-
13
- columns = [
14
- Column.new(0, 'col0', :long),
15
- Column.new(1, 'col1', :double),
16
- Column.new(2, 'col2', :string),
17
- ]
25
+ def self.resume(task, columns, count, &control)
26
+ puts "Example input started."
27
+ commit_reports = yield(task, columns, count)
28
+ puts "Example input finished. Commit reports = #{commit_reports.to_json}"
18
29
 
19
- puts "Example input started."
20
- commit_reports = yield(task, columns, threads)
21
- puts "Example input finished. Commit reports = #{commit_reports.to_json}"
30
+ next_config_diff = {}
31
+ return next_config_diff
32
+ end
22
33
 
23
- return {}
24
- end
34
+ def initialize(task, schema, index, page_builder)
35
+ super
36
+ @file = task['files'][index]
37
+ @hostname = task['hostname']
38
+ end
25
39
 
26
- def initialize(task, schema, index, page_builder)
27
- super
28
- end
40
+ def run
41
+ puts "Example input thread #{@index}..."
29
42
 
30
- def run
31
- puts "Example input thread #{@index}..."
43
+ 10.times do |i|
44
+ @page_builder.add([@file, @hostname, i, 10.0])
45
+ end
46
+ @page_builder.finish # don't forget to call finish :-)
32
47
 
33
- 10.times do |i|
34
- @page_builder.add([i, 10.0, "example"])
48
+ commit_report = {}
49
+ return commit_report
35
50
  end
36
- @page_builder.finish # don't forget to call finish :-)
37
-
38
- commit_report = {
39
- }
40
- return commit_report
41
51
  end
42
- end
43
52
 
53
+ end
44
54
  end