embulk 0.8.18-java → 0.8.19-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +4 -4
  2. data/README.md +10 -0
  3. data/build.gradle +10 -3
  4. data/embulk-cli/build.gradle +2 -0
  5. data/embulk-cli/src/main/bat/selfrun.bat +98 -0
  6. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkExample.java +82 -0
  7. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkMigrate.java +458 -0
  8. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkNew.java +419 -0
  9. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkSelfUpdate.java +248 -0
  10. data/embulk-cli/src/main/sh/selfrun.sh +0 -103
  11. data/embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java +158 -143
  12. data/embulk-core/build.gradle +2 -2
  13. data/embulk-core/src/main/java/org/embulk/EmbulkVersion.java +109 -0
  14. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +11 -0
  15. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +29 -3
  16. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +47 -13
  17. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +6 -3
  18. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +385 -64
  19. data/embulk-core/src/main/java/org/embulk/spi/TempFileSpace.java +2 -1
  20. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +62 -0
  21. data/embulk-docs/src/built-in.rst +59 -21
  22. data/embulk-docs/src/customization.rst +8 -8
  23. data/embulk-docs/src/developers/index.rst +45 -0
  24. data/embulk-docs/src/index.rst +11 -7
  25. data/embulk-docs/src/recipe.rst +1 -1
  26. data/embulk-docs/src/recipe/{scheduled-csv-load-to-elasticsearch-kibana4.rst → scheduled-csv-load-to-elasticsearch-kibana5.rst} +26 -24
  27. data/embulk-docs/src/release.rst +1 -0
  28. data/embulk-docs/src/release/release-0.4.0.rst +1 -1
  29. data/embulk-docs/src/release/release-0.5.0.rst +1 -1
  30. data/embulk-docs/src/release/release-0.6.0.rst +1 -1
  31. data/embulk-docs/src/release/release-0.6.20.rst +1 -1
  32. data/embulk-docs/src/release/release-0.8.19.rst +43 -0
  33. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +2 -2
  34. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +30 -1
  35. data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +10 -0
  36. data/embulk-standards/src/test/java/org/embulk/standards/preview/TestFilePreview.java +73 -0
  37. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_skip_suggest_if_empty_sample_records.csv +5 -0
  38. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_skip_suggest_if_empty_sample_records_guessed.yml +2 -0
  39. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_skip_suggest_if_empty_sample_records_seed.yml +1 -0
  40. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_sample_buffer_bytes.csv +5 -0
  41. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_sample_buffer_bytes_exec.yml +1 -0
  42. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_sample_buffer_bytes_load.yml +19 -0
  43. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_sample_buffer_bytes_previewed.csv +1 -0
  44. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_simple.csv +5 -0
  45. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_simple_load.yml +19 -0
  46. data/embulk-standards/src/test/resources/org/embulk/standards/preview/file/test/test_simple_previewed.csv +4 -0
  47. data/embulk-test/src/main/java/org/embulk/test/PreviewResultInputPlugin.java +65 -0
  48. data/embulk-test/src/main/java/org/embulk/test/TestingBulkLoader.java +5 -0
  49. data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +59 -2
  50. data/embulk.gemspec +2 -1
  51. data/lib/embulk/command/embulk_run.rb +11 -49
  52. data/lib/embulk/data/new/README.md.vm +106 -0
  53. data/lib/embulk/data/new/{gitignore.erb → gitignore.vm} +3 -3
  54. data/lib/embulk/data/new/java/{build.gradle.erb → build.gradle.vm} +8 -8
  55. data/lib/embulk/data/new/java/{decoder.java.erb → decoder.java.vm} +6 -4
  56. data/lib/embulk/data/new/java/{encoder.java.erb → encoder.java.vm} +7 -5
  57. data/lib/embulk/data/new/java/{file_input.java.erb → file_input.java.vm} +9 -7
  58. data/lib/embulk/data/new/java/{file_output.java.erb → file_output.java.vm} +7 -5
  59. data/lib/embulk/data/new/java/{filter.java.erb → filter.java.vm} +4 -3
  60. data/lib/embulk/data/new/java/{formatter.java.erb → formatter.java.vm} +5 -4
  61. data/lib/embulk/data/new/java/{input.java.erb → input.java.vm} +6 -4
  62. data/lib/embulk/data/new/java/{output.java.erb → output.java.vm} +7 -5
  63. data/lib/embulk/data/new/java/{parser.java.erb → parser.java.vm} +5 -4
  64. data/lib/embulk/data/new/java/plugin_loader.rb.vm +3 -0
  65. data/lib/embulk/data/new/java/test.java.vm +5 -0
  66. data/lib/embulk/data/new/ruby/decoder_guess.rb.vm +25 -0
  67. data/lib/embulk/data/new/ruby/{filter.rb.erb → filter.rb.vm} +2 -2
  68. data/lib/embulk/data/new/ruby/{formatter.rb.erb → formatter.rb.vm} +2 -2
  69. data/lib/embulk/data/new/ruby/gemspec.vm +20 -0
  70. data/lib/embulk/data/new/ruby/{input.rb.erb → input.rb.vm} +10 -10
  71. data/lib/embulk/data/new/ruby/{output.rb.erb → output.rb.vm} +7 -7
  72. data/lib/embulk/data/new/ruby/{parser.rb.erb → parser.rb.vm} +2 -2
  73. data/lib/embulk/data/new/ruby/parser_guess.rb.vm +65 -0
  74. data/lib/embulk/guess/csv.rb +5 -0
  75. data/lib/embulk/version.rb +22 -1
  76. metadata +55 -35
  77. data/lib/embulk/command/embulk_example.rb +0 -33
  78. data/lib/embulk/command/embulk_generate_bin.rb +0 -62
  79. data/lib/embulk/command/embulk_migrate_plugin.rb +0 -244
  80. data/lib/embulk/command/embulk_new_plugin.rb +0 -126
  81. data/lib/embulk/command/embulk_selfupdate.rb +0 -121
  82. data/lib/embulk/data/new/README.md.erb +0 -111
  83. data/lib/embulk/data/new/java/plugin_loader.rb.erb +0 -3
  84. data/lib/embulk/data/new/java/test.java.erb +0 -5
  85. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +0 -25
  86. data/lib/embulk/data/new/ruby/gemspec.erb +0 -20
  87. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +0 -65
@@ -31,7 +31,7 @@ dependencies {
31
31
  compile 'org.slf4j:slf4j-api:1.7.12'
32
32
  compile 'org.jruby:jruby-complete:' + project.jrubyVersion
33
33
  compile 'com.google.code.findbugs:annotations:3.0.0'
34
- compile 'org.yaml:snakeyaml:1.14'
34
+ compile 'org.yaml:snakeyaml:1.18'
35
35
  compile 'javax.validation:validation-api:1.1.0.Final'
36
36
  compile 'org.apache.bval:bval-jsr303:0.5'
37
37
  compile 'io.airlift:slice:0.9'
@@ -45,7 +45,7 @@ dependencies {
45
45
 
46
46
  gems 'rubygems:bundler:1.10.6'
47
47
  gems 'rubygems:msgpack:1.1.0'
48
- gems 'rubygems:liquid:3.0.6'
48
+ gems 'rubygems:liquid:4.0.0'
49
49
  }
50
50
 
51
51
  task unpackGems(type: JRubyPrepare) {
@@ -0,0 +1,109 @@
1
+ package org.embulk;
2
+
3
+ import java.io.IOException;
4
+ import java.net.URL;
5
+ import java.security.CodeSource;
6
+ import java.security.ProtectionDomain;
7
+ import java.util.jar.Attributes;
8
+ import java.util.jar.JarFile;
9
+ import java.util.jar.Manifest;
10
+
11
+ public final class EmbulkVersion
12
+ {
13
+ private EmbulkVersion()
14
+ {
15
+ }
16
+
17
+ // Expecting Embulk is always packaged in the embulk-cli jar whenever the Embulk version is checked in Java.
18
+ static {
19
+ VERSION = getImplementationVersion(getSelfJarManifest(), "[embulk-version-unavailable]");
20
+ }
21
+
22
+ private static Manifest getSelfJarManifest()
23
+ {
24
+ try {
25
+ final ProtectionDomain protectionDomain;
26
+ try {
27
+ protectionDomain = EmbulkVersion.class.getProtectionDomain();
28
+ }
29
+ catch (SecurityException ex) {
30
+ System.err.println("Embulk version unavailable due to ProtectionDomain inaccessible.");
31
+ ex.printStackTrace();
32
+ return null;
33
+ }
34
+
35
+ final CodeSource codeSource = protectionDomain.getCodeSource();
36
+ if (codeSource == null) {
37
+ System.err.println("Embulk version unavailable due to CodeSource unavailable.");
38
+ return null;
39
+ }
40
+
41
+ final URL selfJarUrl = codeSource.getLocation();
42
+ if (selfJarUrl == null) {
43
+ System.err.println("Embulk version unavailable due to the location of CodeSource unavailable.");
44
+ return null;
45
+ }
46
+ else if (!selfJarUrl.getProtocol().equals("file")) {
47
+ System.err.println("Embulk version unavailable as the location of CodeSource is not local.");
48
+ return null;
49
+ }
50
+
51
+ final String selfJarPathString = selfJarUrl.getPath();
52
+ if (selfJarPathString == null) {
53
+ System.err.println("Embulk version unavailable due to the path of CodeSource unavailable.");
54
+ return null;
55
+ }
56
+ else if (selfJarPathString.isEmpty()) {
57
+ System.err.println("Embulk version unavailable due to the path of CodeSource empty.");
58
+ return null;
59
+ }
60
+
61
+ try (final JarFile selfJarFile = new JarFile(selfJarPathString)) {
62
+ try {
63
+ return selfJarFile.getManifest();
64
+ }
65
+ catch (IllegalStateException ex) {
66
+ System.err.println("Embulk version unavailable due to the jar file closed unexpectedly.");
67
+ ex.printStackTrace();
68
+ return null;
69
+ }
70
+ catch (IOException ex) {
71
+ System.err.println("Embulk version unavailable due to failure to get the manifst in the jar file.");
72
+ ex.printStackTrace();
73
+ return null;
74
+ }
75
+ }
76
+ catch (SecurityException ex) {
77
+ System.err.println("Embulk version unavailable due to the jar file inaccessible.");
78
+ ex.printStackTrace();
79
+ return null;
80
+ }
81
+ catch (IOException ex) {
82
+ System.err.println("Embulk version unavailable due to failure to access the jar file.");
83
+ ex.printStackTrace();
84
+ return null;
85
+ }
86
+ }
87
+ catch (Throwable ex) {
88
+ System.err.println("Embulk version unavailable due to an unknown exception.");
89
+ ex.printStackTrace();
90
+ return null;
91
+ }
92
+ }
93
+
94
+ private static String getImplementationVersion(final Manifest manifest, final String defaultVersion)
95
+ {
96
+ if (manifest == null) {
97
+ return defaultVersion;
98
+ }
99
+ final Attributes mainAttributes = manifest.getMainAttributes();
100
+ final String implementationVersion = mainAttributes.getValue(Attributes.Name.IMPLEMENTATION_VERSION);
101
+ if (implementationVersion == null) {
102
+ System.err.println("Embulk version unavailable due to the manifest not containing Implementation-Version.");
103
+ return defaultVersion;
104
+ }
105
+ return implementationVersion;
106
+ }
107
+
108
+ public static final String VERSION;
109
+ }
@@ -55,6 +55,10 @@ public class GuessExecutor
55
55
  @Config("exclude_guess_plugins")
56
56
  @ConfigDefault("[]")
57
57
  public List<PluginType> getExcludeGuessPlugins();
58
+
59
+ @Config("guess_sample_buffer_bytes")
60
+ @ConfigDefault("32768") // 32 * 1024
61
+ public int getSampleBufferBytes();
58
62
  }
59
63
 
60
64
  public static void registerDefaultGuessPluginTo(Binder binder, PluginType type)
@@ -63,6 +67,13 @@ public class GuessExecutor
63
67
  multibinder.addBinding().toInstance(type);
64
68
  }
65
69
 
70
+ // Used by FileInputRunner#guess(..)
71
+ public static ConfigSource createSampleBufferConfigFromExecConfig(ConfigSource execConfig)
72
+ {
73
+ final GuessExecutorTask execTask = execConfig.loadConfig(GuessExecutorTask.class);
74
+ return Exec.newConfigSource().set("sample_buffer_bytes", execTask.getSampleBufferBytes());
75
+ }
76
+
66
77
  @Inject
67
78
  public GuessExecutor(@ForSystemConfig ConfigSource systemConfig,
68
79
  @ForGuess Set<PluginType> defaultGuessPlugins)
@@ -12,9 +12,9 @@ import org.embulk.config.Task;
12
12
  import org.embulk.config.TaskSource;
13
13
  import org.embulk.config.ConfigSource;
14
14
  import org.embulk.config.TaskReport;
15
+ import org.embulk.exec.SamplingParserPlugin.SampleBufferTask;
15
16
  import org.embulk.plugin.PluginType;
16
17
  import org.embulk.spi.Buffer;
17
- import org.embulk.spi.FileInputPlugin;
18
18
  import org.embulk.spi.FileInputRunner;
19
19
  import org.embulk.spi.Schema;
20
20
  import org.embulk.spi.Page;
@@ -26,6 +26,7 @@ import org.embulk.spi.Exec;
26
26
  import org.embulk.spi.ExecSession;
27
27
  import org.embulk.spi.ExecAction;
28
28
  import org.embulk.spi.util.Filters;
29
+ import org.slf4j.Logger;
29
30
 
30
31
  public class PreviewExecutor
31
32
  {
@@ -35,6 +36,10 @@ public class PreviewExecutor
35
36
  public interface PreviewTask
36
37
  extends Task
37
38
  {
39
+ @Config("exec")
40
+ @ConfigDefault("{}")
41
+ public ConfigSource getExecConfig();
42
+
38
43
  @Config("in")
39
44
  @NotNull
40
45
  public ConfigSource getInputConfig();
@@ -52,6 +57,14 @@ public class PreviewExecutor
52
57
  public void setInputTask(TaskSource taskSource);
53
58
  }
54
59
 
60
+ public interface PreviewExecutorTask
61
+ extends Task
62
+ {
63
+ @Config("preview_sample_buffer_bytes")
64
+ @ConfigDefault("32768") // 32 * 1024
65
+ public int getSampleBufferBytes();
66
+ }
67
+
55
68
  @Inject
56
69
  public PreviewExecutor(Injector injector,
57
70
  @ForSystemConfig ConfigSource systemConfig)
@@ -93,7 +106,7 @@ public class PreviewExecutor
93
106
  List<FilterPlugin> filterPlugins = newFilterPlugins(task);
94
107
 
95
108
  if (inputPlugin instanceof FileInputRunner) { // file input runner
96
- Buffer sample = SamplingParserPlugin.runFileInputSampling((FileInputRunner)inputPlugin, config.getNested("in"));
109
+ Buffer sample = SamplingParserPlugin.runFileInputSampling((FileInputRunner)inputPlugin, config.getNested("in"), createSampleBufferConfigFromExecConfig(config.getNested("exec")));
97
110
  FileInputRunner previewRunner = new FileInputRunner(new BufferFileInputPlugin(sample));
98
111
  return doPreview(task, previewRunner, filterPlugins);
99
112
  }
@@ -102,6 +115,12 @@ public class PreviewExecutor
102
115
  }
103
116
  }
104
117
 
118
+ private static ConfigSource createSampleBufferConfigFromExecConfig(ConfigSource execConfig)
119
+ {
120
+ final PreviewExecutorTask execTask = execConfig.loadConfig(PreviewExecutorTask.class);
121
+ return Exec.newConfigSource().set("sample_buffer_bytes", execTask.getSampleBufferBytes());
122
+ }
123
+
105
124
  private PreviewResult doPreview(final PreviewTask task, final InputPlugin input, final List<FilterPlugin> filterPlugins)
106
125
  {
107
126
  try {
@@ -144,16 +163,19 @@ public class PreviewExecutor
144
163
  private static class SamplingPageOutput
145
164
  implements PageOutput
146
165
  {
166
+ private final Logger log = Exec.getLogger(this.getClass());
147
167
  private final int sampleRows;
148
168
  private final Schema schema;
149
169
  private List<Page> pages;
150
170
  private int recordCount;
171
+ private PreviewResult res;
151
172
 
152
173
  public SamplingPageOutput(int sampleRows, Schema schema)
153
174
  {
154
175
  this.sampleRows = sampleRows;
155
176
  this.schema = schema;
156
177
  this.pages = new ArrayList<Page>();
178
+ this.res = null;
157
179
  }
158
180
 
159
181
  public int getRecordCount()
@@ -174,10 +196,14 @@ public class PreviewExecutor
174
196
  @Override
175
197
  public void finish()
176
198
  {
199
+ if (res != null) {
200
+ log.error("PreviewResult recreation will cause a bug. The plugin must call PageOutput#finish() only once.");
201
+ }
202
+
177
203
  if (recordCount == 0) {
178
204
  throw new NoSampleException("No input records to preview");
179
205
  }
180
- PreviewResult res = new PreviewResult(schema, pages);
206
+ res = new PreviewResult(schema, pages);
181
207
  pages = null;
182
208
  throw new PreviewedNoticeError(res);
183
209
  }
@@ -1,8 +1,13 @@
1
1
  package org.embulk.exec;
2
2
 
3
+ import java.text.NumberFormat;
3
4
  import java.util.List;
5
+
4
6
  import com.google.inject.Inject;
5
7
  import com.google.common.base.Preconditions;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.Task;
6
11
  import org.embulk.config.TaskSource;
7
12
  import org.embulk.config.ConfigSource;
8
13
  import org.embulk.config.TaskReport;
@@ -15,6 +20,9 @@ import org.embulk.spi.ParserPlugin;
15
20
  import org.embulk.spi.FileInput;
16
21
  import org.embulk.spi.FileInputRunner;
17
22
  import org.embulk.spi.PageOutput;
23
+ import org.slf4j.Logger;
24
+
25
+ import static java.util.Locale.ENGLISH;
18
26
  import static org.embulk.spi.util.Inputs.each;
19
27
 
20
28
  /*
@@ -25,9 +33,18 @@ public class SamplingParserPlugin
25
33
  {
26
34
  public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig)
27
35
  {
36
+ return runFileInputSampling(runner, inputConfig, Exec.newConfigSource());
37
+ }
38
+
39
+ public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig)
40
+ {
41
+ final SampleBufferTask sampleBufferTask = sampleBufferConfig.loadConfig(SampleBufferTask.class);
42
+
28
43
  // override in.parser.type so that FileInputRunner creates SamplingParserPlugin
29
44
  ConfigSource samplingInputConfig = inputConfig.deepCopy();
30
- samplingInputConfig.getNestedOrSetEmpty("parser").set("type", "system_sampling");
45
+ samplingInputConfig.getNestedOrSetEmpty("parser")
46
+ .set("type", "system_sampling")
47
+ .set("sample_buffer_bytes", sampleBufferTask.getSampleBufferBytes());
31
48
  samplingInputConfig.set("decoders", null);
32
49
 
33
50
  try {
@@ -120,42 +137,59 @@ public class SamplingParserPlugin
120
137
  }
121
138
  }
122
139
 
123
- private final int minSampleSize;
124
- private final int sampleSize;
140
+ private final NumberFormat numberFormat = NumberFormat.getNumberInstance(ENGLISH);
141
+ private final Logger log = Exec.getLogger(this.getClass());
142
+ private final int minSampleBufferBytes;
143
+
144
+ public interface PluginTask
145
+ extends Task, SampleBufferTask
146
+ {
147
+ }
148
+
149
+ public interface SampleBufferTask
150
+ extends Task
151
+ {
152
+ @Config("sample_buffer_bytes")
153
+ @ConfigDefault("32768") // 32 * 1024
154
+ public int getSampleBufferBytes();
155
+ }
125
156
 
126
157
  @Inject
127
158
  public SamplingParserPlugin(@ForSystemConfig ConfigSource systemConfig)
128
159
  {
129
- this.minSampleSize = 40; // empty gzip file is 33 bytes. // TODO get sample size from system config
130
- this.sampleSize = 32*1024; // TODO get sample size from system config. See also GuessExecutor.run.
131
- Preconditions.checkArgument(minSampleSize < sampleSize, "minSampleSize must be smaller than sampleSize");
160
+ this.minSampleBufferBytes = 40; // empty gzip file is 33 bytes. // TODO get sample size from system config
132
161
  }
133
162
 
134
163
  @Override
135
164
  public void transaction(ConfigSource config, ParserPlugin.Control control)
136
165
  {
137
- control.run(Exec.newTaskSource(), null);
166
+ PluginTask task = config.loadConfig(PluginTask.class);
167
+ Preconditions.checkArgument(minSampleBufferBytes < task.getSampleBufferBytes(), "minSampleBufferBytes must be smaller than sample_buffer_bytes");
168
+
169
+ log.info("Try to read {} bytes from input source", numberFormat.format(task.getSampleBufferBytes()));
170
+ control.run(task.dump(), null);
138
171
  }
139
172
 
140
173
  @Override
141
174
  public void run(TaskSource taskSource, Schema schema,
142
175
  FileInput input, PageOutput output)
143
176
  {
144
- Buffer buffer = readSample(input, sampleSize);
177
+ PluginTask task = taskSource.loadTask(PluginTask.class);
178
+ Buffer buffer = readSample(input, task.getSampleBufferBytes());
145
179
  if (!taskSource.get(boolean.class, "force", false)) {
146
- if (buffer.limit() < minSampleSize) {
180
+ if (buffer.limit() < minSampleBufferBytes) {
147
181
  throw new NotEnoughSampleError(buffer.limit());
148
182
  }
149
183
  }
150
184
  throw new SampledNoticeError(buffer);
151
185
  }
152
186
 
153
- public static Buffer readSample(FileInput fileInput, int sampleSize)
187
+ public static Buffer readSample(FileInput fileInput, int sampleBufferBytes)
154
188
  {
155
- return readSample(fileInput, Buffer.allocate(sampleSize), 0, sampleSize);
189
+ return readSample(fileInput, Buffer.allocate(sampleBufferBytes), 0, sampleBufferBytes);
156
190
  }
157
191
 
158
- public static Buffer readSample(FileInput fileInput, Buffer sample, int offset, int sampleSize)
192
+ public static Buffer readSample(FileInput fileInput, Buffer sample, int offset, int sampleBufferBytes)
159
193
  {
160
194
  if (!fileInput.nextFile()) {
161
195
  // no input files
@@ -168,7 +202,7 @@ public class SamplingParserPlugin
168
202
  sample.setBytes(offset, buffer, 0, size);
169
203
  offset += size;
170
204
  buffer.release();
171
- if (offset >= sampleSize) {
205
+ if (offset >= sampleBufferBytes) {
172
206
  break;
173
207
  }
174
208
  }
@@ -17,6 +17,8 @@ import org.embulk.exec.SamplingParserPlugin;
17
17
  import org.embulk.exec.ConfigurableGuessInputPlugin;
18
18
  import org.embulk.exec.NoSampleException;
19
19
 
20
+ import static org.embulk.exec.GuessExecutor.createSampleBufferConfigFromExecConfig;
21
+
20
22
  public class FileInputRunner
21
23
  implements InputPlugin, ConfigurableGuessInputPlugin
22
24
  {
@@ -80,14 +82,15 @@ public class FileInputRunner
80
82
  return guess(Exec.newConfigSource(), config);
81
83
  }
82
84
 
83
- public ConfigDiff guess(ConfigSource execConfig, ConfigSource config)
85
+ public ConfigDiff guess(ConfigSource execConfig, ConfigSource inputConfig)
84
86
  {
85
- Buffer sample = SamplingParserPlugin.runFileInputSampling(this, config);
87
+ final ConfigSource sampleBufferConfig = createSampleBufferConfigFromExecConfig(execConfig);
88
+ final Buffer sample = SamplingParserPlugin.runFileInputSampling(this, inputConfig, sampleBufferConfig);
86
89
  // SamplingParserPlugin.runFileInputSampling throws NoSampleException if there're
87
90
  // no files or all files are smaller than minSampleSize (40 bytes).
88
91
 
89
92
  GuessExecutor guessExecutor = Exec.getInjector().getInstance(GuessExecutor.class);
90
- return guessExecutor.guessParserConfig(sample, config, execConfig);
93
+ return guessExecutor.guessParserConfig(sample, inputConfig, execConfig);
91
94
  }
92
95
 
93
96
  private class RunnerControl
@@ -1,16 +1,12 @@
1
1
  package org.embulk.spi;
2
2
 
3
- import java.io.Serializable;
4
- import java.util.Map;
5
3
  import java.util.List;
6
4
  import java.util.Arrays;
7
5
  import java.util.ArrayList;
8
- import java.util.Comparator;
9
- import java.util.Collections;
10
- import com.google.common.collect.BiMap;
11
- import com.google.common.collect.HashBiMap;
12
6
  import io.airlift.slice.Slice;
13
7
  import io.airlift.slice.Slices;
8
+ import org.embulk.spi.type.Type;
9
+ import org.embulk.spi.type.Types;
14
10
  import org.msgpack.value.Value;
15
11
  import org.msgpack.value.ImmutableValue;
16
12
  import org.embulk.spi.time.Timestamp;
@@ -30,7 +26,8 @@ public class PageBuilder
30
26
  private int count;
31
27
  private int position;
32
28
  private final byte[] nullBitSet;
33
- private final BiMap<String, Integer> stringReferences = HashBiMap.create();
29
+ private final Row row;
30
+ private List<String> stringReferences = new ArrayList<>();
34
31
  private List<ImmutableValue> valueReferences = new ArrayList<>();
35
32
  private int referenceSize;
36
33
  private int nextVariableLengthDataOffset;
@@ -43,6 +40,7 @@ public class PageBuilder
43
40
  this.columnOffsets = PageFormat.columnOffsets(schema);
44
41
  this.nullBitSet = new byte[PageFormat.nullBitSetSize(schema)];
45
42
  Arrays.fill(nullBitSet, (byte) -1);
43
+ this.row = Row.newRow(schema);
46
44
  this.fixedRecordSize = PageFormat.recordHeaderSize(schema) + PageFormat.totalColumnSize(schema);
47
45
  this.nextVariableLengthDataOffset = fixedRecordSize;
48
46
  newBuffer();
@@ -54,7 +52,7 @@ public class PageBuilder
54
52
  this.bufferSlice = Slices.wrappedBuffer(buffer.array(), buffer.offset(), buffer.capacity());
55
53
  this.count = 0;
56
54
  this.position = PageFormat.PAGE_HEADER_SIZE;
57
- this.stringReferences.clear();
55
+ this.stringReferences = new ArrayList<>();
58
56
  this.valueReferences = new ArrayList<>();
59
57
  this.referenceSize = 0;
60
58
  }
@@ -71,12 +69,8 @@ public class PageBuilder
71
69
 
72
70
  public void setNull(int columnIndex)
73
71
  {
74
- nullBitSet[columnIndex >>> 3] |= (1 << (columnIndex & 7));
75
- }
72
+ row.setNull(columnIndex);
76
73
 
77
- private void clearNull(int columnIndex)
78
- {
79
- nullBitSet[columnIndex >>> 3] &= ~(1 << (columnIndex & 7));
80
74
  }
81
75
 
82
76
  public void setBoolean(Column column, boolean value)
@@ -87,8 +81,7 @@ public class PageBuilder
87
81
 
88
82
  public void setBoolean(int columnIndex, boolean value)
89
83
  {
90
- bufferSlice.setByte(getOffset(columnIndex), value ? (byte) 1 : (byte) 0);
91
- clearNull(columnIndex);
84
+ row.setBoolean(columnIndex, value);
92
85
  }
93
86
 
94
87
  public void setLong(Column column, long value)
@@ -99,8 +92,7 @@ public class PageBuilder
99
92
 
100
93
  public void setLong(int columnIndex, long value)
101
94
  {
102
- bufferSlice.setLong(getOffset(columnIndex), value);
103
- clearNull(columnIndex);
95
+ row.setLong(columnIndex, value);
104
96
  }
105
97
 
106
98
  public void setDouble(Column column, double value)
@@ -111,8 +103,7 @@ public class PageBuilder
111
103
 
112
104
  public void setDouble(int columnIndex, double value)
113
105
  {
114
- bufferSlice.setDouble(getOffset(columnIndex), value);
115
- clearNull(columnIndex);
106
+ row.setDouble(columnIndex, value);
116
107
  }
117
108
 
118
109
  public void setString(Column column, String value)
@@ -125,19 +116,10 @@ public class PageBuilder
125
116
  {
126
117
  if (value == null) {
127
118
  setNull(columnIndex);
128
- return;
129
119
  }
130
-
131
- Integer reuseIndex = stringReferences.get(value);
132
- if (reuseIndex != null) {
133
- bufferSlice.setInt(getOffset(columnIndex), reuseIndex);
134
- } else {
135
- int index = stringReferences.size();
136
- stringReferences.put(value, index);
137
- bufferSlice.setInt(getOffset(columnIndex), index);
138
- referenceSize += value.length() * 2 + 4; // assuming size of char = size of byte * 2 + length
120
+ else {
121
+ row.setString(columnIndex, value);
139
122
  }
140
- clearNull(columnIndex);
141
123
  }
142
124
 
143
125
  public void setJson(Column column, Value value)
@@ -150,14 +132,10 @@ public class PageBuilder
150
132
  {
151
133
  if (value == null) {
152
134
  setNull(columnIndex);
153
- return;
154
135
  }
155
-
156
- int index = valueReferences.size();
157
- valueReferences.add(value.immutableValue());
158
- bufferSlice.setInt(getOffset(columnIndex), index);
159
- referenceSize += 256; // TODO how to estimate size of the value?
160
- clearNull(columnIndex);
136
+ else {
137
+ row.setJson(columnIndex, value);
138
+ }
161
139
  }
162
140
 
163
141
  public void setTimestamp(Column column, Timestamp value)
@@ -170,49 +148,76 @@ public class PageBuilder
170
148
  {
171
149
  if (value == null) {
172
150
  setNull(columnIndex);
173
- return;
174
151
  }
152
+ else {
153
+ row.setTimestamp(columnIndex, value);
154
+ }
155
+ }
175
156
 
176
- int offset = getOffset(columnIndex);
177
- bufferSlice.setLong(offset, value.getEpochSecond());
178
- bufferSlice.setInt(offset + 8, value.getNano());
157
+ private void writeNull(int columnIndex)
158
+ {
159
+ nullBitSet[columnIndex >>> 3] |= (1 << (columnIndex & 7));
160
+ }
161
+
162
+ private void clearNull(int columnIndex)
163
+ {
164
+ nullBitSet[columnIndex >>> 3] &= ~(1 << (columnIndex & 7));
165
+ }
166
+
167
+ private void writeBoolean(int columnIndex, boolean value)
168
+ {
169
+ bufferSlice.setByte(getOffset(columnIndex), value ? (byte) 1 : (byte) 0);
179
170
  clearNull(columnIndex);
180
171
  }
181
172
 
182
- private int getOffset(int columnIndex)
173
+ private void writeLong(int columnIndex, long value)
183
174
  {
184
- return position + columnOffsets[columnIndex];
175
+ bufferSlice.setLong(getOffset(columnIndex), value);
176
+ clearNull(columnIndex);
185
177
  }
186
178
 
187
- private static class StringReferenceSortComparator
188
- implements Comparator<Map.Entry<String, Integer>>, Serializable
179
+ private void writeDouble(int columnIndex, double value)
189
180
  {
190
- @Override
191
- public int compare(Map.Entry<String, Integer> e1, Map.Entry<String, Integer> e2)
192
- {
193
- return e1.getValue().compareTo(e2.getValue());
194
- }
181
+ bufferSlice.setDouble(getOffset(columnIndex), value);
182
+ clearNull(columnIndex);
183
+ }
195
184
 
196
- @Override
197
- public boolean equals(Object obj)
198
- {
199
- return obj instanceof StringReferenceSortComparator;
200
- }
185
+ private void writeString(int columnIndex, String value)
186
+ {
187
+ int index = stringReferences.size();
188
+ stringReferences.add(value);
189
+ bufferSlice.setInt(getOffset(columnIndex), index);
190
+ referenceSize += value.length() * 2 + 4; // assuming size of char = size of byte * 2 + length
191
+ clearNull(columnIndex);
201
192
  }
202
193
 
203
- private List<String> getSortedStringReferences()
194
+ private void writeJson(int columnIndex, Value value)
204
195
  {
205
- ArrayList<Map.Entry<String, Integer>> s = new ArrayList<>(stringReferences.entrySet());
206
- Collections.sort(s, new StringReferenceSortComparator());
207
- String[] array = new String[s.size()];
208
- for (int i=0; i < array.length; i++) {
209
- array[i] = s.get(i).getKey();
210
- }
211
- return Arrays.asList(array);
196
+ int index = valueReferences.size();
197
+ valueReferences.add(value.immutableValue());
198
+ bufferSlice.setInt(getOffset(columnIndex), index);
199
+ referenceSize += 256; // TODO how to estimate size of the value?
200
+ clearNull(columnIndex);
201
+ }
202
+
203
+ private void writeTimestamp(int columnIndex, Timestamp value)
204
+ {
205
+ int offset = getOffset(columnIndex);
206
+ bufferSlice.setLong(offset, value.getEpochSecond());
207
+ bufferSlice.setInt(offset + 8, value.getNano());
208
+ clearNull(columnIndex);
209
+ }
210
+
211
+ private int getOffset(int columnIndex)
212
+ {
213
+ return position + columnOffsets[columnIndex];
212
214
  }
213
215
 
214
216
  public void addRecord()
215
217
  {
218
+ // record
219
+ row.write(this);
220
+
216
221
  // record header
217
222
  bufferSlice.setInt(position, nextVariableLengthDataOffset); // nextVariableLengthDataOffset means record size
218
223
  bufferSlice.setBytes(position + 4, nullBitSet);
@@ -237,7 +242,7 @@ public class PageBuilder
237
242
 
238
243
  // flush page
239
244
  Page page = Page.wrap(buffer)
240
- .setStringReferences(getSortedStringReferences())
245
+ .setStringReferences(stringReferences)
241
246
  .setValueReferences(valueReferences);
242
247
  buffer = null;
243
248
  bufferSlice = null;
@@ -270,6 +275,322 @@ public class PageBuilder
270
275
  output.close();
271
276
  }
272
277
 
278
+ /**
279
+ * Row is a container to stage values before adding into reference lists such as |stringReferences|.
280
+ *
281
+ * |Row| works as a buffer against plugins that may add values incorrectly without |PageBuilder#addRecord|.
282
+ * It accepts just one value per column while |PageBuilder| can double-store values regardless of columns.
283
+ * Double-stored values are overwritten.
284
+ */
285
+ private static class Row
286
+ {
287
+ private static Row newRow(Schema schema)
288
+ {
289
+ ColumnValue[] values = new ColumnValue[schema.getColumnCount()];
290
+ for (Column column : schema.getColumns()) {
291
+ values[column.getIndex()] = newValue(column);
292
+ }
293
+ return new Row(values);
294
+ }
295
+
296
+ private static ColumnValue newValue(Column column)
297
+ {
298
+ Type type = column.getType();
299
+ if (type.equals(Types.BOOLEAN)) {
300
+ return new BooleanColumnValue(column);
301
+ }
302
+ else if (type.equals(Types.DOUBLE)) {
303
+ return new DoubleColumnValue(column);
304
+ }
305
+ else if (type.equals(Types.LONG)) {
306
+ return new LongColumnValue(column);
307
+ }
308
+ else if (type.equals(Types.STRING)) {
309
+ return new StringColumnValue(column);
310
+ }
311
+ else if (type.equals(Types.JSON)) {
312
+ return new JsonColumnValue(column);
313
+ }
314
+ else if (type.equals(Types.TIMESTAMP)) {
315
+ return new TimestampColumnValue(column);
316
+ }
317
+ else {
318
+ throw new IllegalStateException("Unsupported type " + type.getName());
319
+ }
320
+ }
321
+
322
+ private final ColumnValue[] values;
323
+
324
+ private Row(ColumnValue[] values)
325
+ {
326
+ this.values = values;
327
+ }
328
+
329
+ private void setNull(int columnIndex)
330
+ {
331
+ values[columnIndex].setNull();
332
+ }
333
+
334
+ private void setBoolean(int columnIndex, boolean value)
335
+ {
336
+ values[columnIndex].setBoolean(value);
337
+ }
338
+
339
+ private void setLong(int columnIndex, long value)
340
+ {
341
+ values[columnIndex].setLong(value);
342
+ }
343
+
344
+ private void setDouble(int columnIndex, double value)
345
+ {
346
+ values[columnIndex].setDouble(value);
347
+ }
348
+
349
+ private void setString(int columnIndex, String value)
350
+ {
351
+ values[columnIndex].setString(value);
352
+ }
353
+
354
+ private void setJson(int columnIndex, Value value)
355
+ {
356
+ values[columnIndex].setJson(value);
357
+ }
358
+
359
+ private void setTimestamp(int columnIndex, Timestamp value)
360
+ {
361
+ values[columnIndex].setTimestamp(value);
362
+ }
363
+
364
+ private void write(PageBuilder pageBuilder)
365
+ {
366
+ for (ColumnValue v : values) {
367
+ v.write(pageBuilder);
368
+ }
369
+ }
370
+ }
371
+
372
+ private interface ColumnValue
373
+ {
374
+ void setBoolean(boolean value);
375
+
376
+ void setLong(long value);
377
+
378
+ void setDouble(double value);
379
+
380
+ void setString(String value);
381
+
382
+ void setJson(Value value);
383
+
384
+ void setTimestamp(Timestamp value);
385
+
386
+ void setNull();
387
+
388
+ void write(PageBuilder pageBuilder);
389
+ }
390
+
391
+ private static abstract class AbstractColumnValue
392
+ implements ColumnValue
393
+ {
394
+ protected final Column column;
395
+ protected boolean isNull;
396
+
397
+ protected AbstractColumnValue(Column column)
398
+ {
399
+ this.column = column;
400
+ }
401
+
402
+ public void setBoolean(boolean value)
403
+ {
404
+ throw new IllegalStateException("Not reach here");
405
+ }
406
+
407
+ public void setLong(long value)
408
+ {
409
+ throw new IllegalStateException("Not reach here");
410
+ }
411
+
412
+ public void setDouble(double value)
413
+ {
414
+ throw new IllegalStateException("Not reach here");
415
+ }
416
+
417
+ public void setString(String value)
418
+ {
419
+ throw new IllegalStateException("Not reach here");
420
+ }
421
+
422
+ public void setJson(Value value)
423
+ {
424
+ throw new IllegalStateException("Not reach here");
425
+ }
426
+
427
+ public void setTimestamp(Timestamp value)
428
+ {
429
+ throw new IllegalStateException("Not reach here");
430
+ }
431
+
432
+ public void setNull()
433
+ {
434
+ isNull = true;
435
+ }
436
+
437
+ public void write(PageBuilder pageBuilder)
438
+ {
439
+ if (!isNull) {
440
+ writeNotNull(pageBuilder);
441
+ }
442
+ else {
443
+ pageBuilder.writeNull(column.getIndex());
444
+ }
445
+ }
446
+
447
+ protected abstract void writeNotNull(PageBuilder pageBuilder);
448
+ }
449
+
450
+ private static class BooleanColumnValue
451
+ extends AbstractColumnValue
452
+ {
453
+ private boolean value;
454
+
455
+ BooleanColumnValue(Column column)
456
+ {
457
+ super(column);
458
+ }
459
+
460
+ @Override
461
+ public void setBoolean(boolean value)
462
+ {
463
+ this.value = value;
464
+ this.isNull = false;
465
+ }
466
+
467
+ @Override
468
+ public void writeNotNull(PageBuilder pageBuilder)
469
+ {
470
+ pageBuilder.writeBoolean(column.getIndex(), value);
471
+ }
472
+ }
473
+
474
+ private static class LongColumnValue
475
+ extends AbstractColumnValue
476
+ {
477
+ private long value;
478
+
479
+ LongColumnValue(Column column)
480
+ {
481
+ super(column);
482
+ }
483
+
484
+ @Override
485
+ public void setLong(long value)
486
+ {
487
+ this.value = value;
488
+ this.isNull = false;
489
+ }
490
+
491
+ @Override
492
+ public void writeNotNull(PageBuilder pageBuilder)
493
+ {
494
+ pageBuilder.writeLong(column.getIndex(), value);
495
+ }
496
+ }
497
+
498
+ private static class DoubleColumnValue
499
+ extends AbstractColumnValue
500
+ {
501
+ private double value;
502
+
503
+ DoubleColumnValue(Column column)
504
+ {
505
+ super(column);
506
+ }
507
+
508
+ @Override
509
+ public void setDouble(double value)
510
+ {
511
+ this.value = value;
512
+ this.isNull = false;
513
+ }
514
+
515
+ @Override
516
+ public void writeNotNull(PageBuilder pageBuilder)
517
+ {
518
+ pageBuilder.writeDouble(column.getIndex(), value);
519
+ }
520
+ }
521
+
522
+ private static class StringColumnValue
523
+ extends AbstractColumnValue
524
+ {
525
+ private String value;
526
+
527
+ StringColumnValue(Column column)
528
+ {
529
+ super(column);
530
+ }
531
+
532
+ @Override
533
+ public void setString(String value)
534
+ {
535
+ this.value = value;
536
+ this.isNull = false;
537
+ }
538
+
539
+ @Override
540
+ public void writeNotNull(PageBuilder pageBuilder)
541
+ {
542
+ pageBuilder.writeString(column.getIndex(), value);
543
+ }
544
+ }
545
+
546
+ private static class JsonColumnValue
547
+ extends AbstractColumnValue
548
+ {
549
+ private Value value;
550
+
551
+ JsonColumnValue(Column column)
552
+ {
553
+ super(column);
554
+ }
555
+
556
+ @Override
557
+ public void setJson(Value value)
558
+ {
559
+ this.value = value;
560
+ this.isNull = false;
561
+ }
562
+
563
+ @Override
564
+ public void writeNotNull(PageBuilder pageBuilder)
565
+ {
566
+ pageBuilder.writeJson(column.getIndex(), value);
567
+ }
568
+ }
569
+
570
+ private static class TimestampColumnValue
571
+ extends AbstractColumnValue
572
+ {
573
+ private Timestamp value;
574
+
575
+ TimestampColumnValue(Column column)
576
+ {
577
+ super(column);
578
+ }
579
+
580
+ @Override
581
+ public void setTimestamp(Timestamp value)
582
+ {
583
+ this.value = value;
584
+ this.isNull = false;
585
+ }
586
+
587
+ @Override
588
+ public void writeNotNull(PageBuilder pageBuilder)
589
+ {
590
+ pageBuilder.writeTimestamp(column.getIndex(), value);
591
+ }
592
+ }
593
+
273
594
  /* TODO for variable-length types
274
595
  private void flushAndTakeOverRemaingData()
275
596
  {