embulk 0.8.15 → 0.8.16

Sign up to get free protection for your applications and to get access to all the features.
Files changed (118) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -1
  3. data/appveyor.yml +8 -0
  4. data/build.gradle +86 -45
  5. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
  6. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
  7. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
  8. data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
  9. data/embulk-docs/src/built-in.rst +34 -0
  10. data/embulk-docs/src/release.rst +1 -0
  11. data/embulk-docs/src/release/release-0.8.16.rst +43 -0
  12. data/embulk-standards/build.gradle +1 -0
  13. data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
  14. data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
  15. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
  16. data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
  17. data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
  18. data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
  19. data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
  20. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
  21. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
  22. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
  23. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
  24. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
  25. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
  26. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
  27. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
  28. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
  29. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
  30. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
  31. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
  32. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
  33. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
  34. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
  35. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
  36. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
  37. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
  38. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
  39. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
  40. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
  41. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
  42. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
  43. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
  44. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
  45. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
  46. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
  47. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
  48. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
  49. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
  50. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
  51. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
  52. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
  53. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
  54. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
  55. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
  56. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
  57. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
  58. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
  59. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
  60. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
  61. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
  62. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
  63. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
  64. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
  65. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
  66. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
  67. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
  68. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
  69. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
  70. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
  71. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
  72. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
  73. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
  74. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
  75. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
  76. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
  77. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
  78. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
  79. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
  80. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
  81. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
  82. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
  83. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
  84. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
  85. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
  86. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
  87. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
  88. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
  89. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
  90. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
  91. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
  92. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
  93. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
  94. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
  95. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
  96. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
  97. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
  98. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
  99. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
  100. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
  101. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
  102. data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
  103. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  104. data/gradle/wrapper/gradle-wrapper.properties +2 -2
  105. data/gradlew +30 -21
  106. data/gradlew.bat +4 -10
  107. data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
  108. data/lib/embulk/data/new/java/build.gradle.erb +5 -3
  109. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
  110. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
  111. data/lib/embulk/data/new/java/gradlew +30 -21
  112. data/lib/embulk/data/new/java/gradlew.bat +4 -10
  113. data/lib/embulk/guess/csv.rb +44 -22
  114. data/lib/embulk/guess/newline.rb +10 -4
  115. data/lib/embulk/guess_plugin.rb +3 -1
  116. data/lib/embulk/java/time_helper.rb +2 -2
  117. data/lib/embulk/version.rb +1 -1
  118. metadata +92 -5
@@ -0,0 +1,4 @@
1
+ 1,32864,20150127
2
+ 2,14824,20150127
3
+ 3,27559,20150128
4
+ 4,11270,20150129
@@ -0,0 +1,18 @@
1
+ type: file
2
+ parser:
3
+ charset: UTF-8
4
+ newline: CRLF
5
+ type: csv
6
+ delimiter: ','
7
+ quote: '"'
8
+ escape: '"'
9
+ trim_if_not_quoted: false
10
+ skip_header_lines: 1
11
+ allow_extra_columns: false
12
+ allow_optional_columns: false
13
+ columns:
14
+ - {name: id, type: long}
15
+ - {name: account, type: long}
16
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
17
+ - {name: purchase, type: string}
18
+ - {name: comment, type: string}
@@ -0,0 +1,3 @@
1
+ type: remove_columns
2
+ remove: [invalid, time, comment]
3
+ accept_unmatched_columns: true
@@ -1,11 +1,14 @@
1
1
  package org.embulk.test;
2
2
 
3
+ import com.google.common.base.Throwables;
3
4
  import com.google.common.collect.ImmutableList;
4
- import com.google.common.base.Optional;
5
+ import com.google.common.collect.Lists;
5
6
  import com.google.common.io.ByteStreams;
6
7
  import com.google.inject.Binder;
7
8
  import com.google.inject.Injector;
8
9
  import com.google.inject.Module;
10
+
11
+ import java.io.BufferedReader;
9
12
  import java.io.IOException;
10
13
  import java.io.InputStream;
11
14
  import java.io.OutputStream;
@@ -15,22 +18,33 @@ import java.nio.file.Path;
15
18
  import java.util.ArrayList;
16
19
  import java.util.Collections;
17
20
  import java.util.List;
21
+
18
22
  import org.embulk.EmbulkEmbed;
19
- import org.embulk.config.Config;
20
23
  import org.embulk.config.ConfigDiff;
21
24
  import org.embulk.config.ConfigLoader;
22
25
  import org.embulk.config.ConfigSource;
26
+ import org.embulk.config.ModelManager;
23
27
  import org.embulk.config.TaskReport;
24
- import org.embulk.exec.ExecutionResult;
28
+ import org.embulk.spi.ColumnConfig;
25
29
  import org.embulk.spi.Schema;
30
+ import org.embulk.spi.SchemaConfig;
26
31
  import org.embulk.spi.TempFileException;
27
32
  import org.embulk.spi.TempFileSpace;
33
+ import org.embulk.spi.type.Type;
28
34
  import org.junit.rules.TestRule;
29
35
  import org.junit.rules.TestWatcher;
30
36
  import org.junit.runner.Description;
31
37
  import org.junit.runners.model.Statement;
32
38
  import static com.google.common.base.Preconditions.checkArgument;
39
+ import static java.nio.charset.StandardCharsets.UTF_8;
40
+ import static java.nio.file.Files.newBufferedReader;
41
+ import static java.util.Locale.ENGLISH;
42
+ import static com.google.common.base.Preconditions.checkNotNull;
43
+ import static com.google.common.base.Preconditions.checkState;
44
+ import static java.nio.charset.StandardCharsets.UTF_8;
45
+ import static java.nio.file.Files.newBufferedReader;
33
46
  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
47
+ import static org.embulk.test.EmbulkTests.copyResource;
34
48
 
35
49
  public class TestingEmbulk
36
50
  implements TestRule
@@ -152,6 +166,10 @@ public class TestingEmbulk
152
166
  .fromYamlString(EmbulkTests.readResource(name));
153
167
  }
154
168
 
169
+ private static final List<String> SUPPORTED_TYPES = ImmutableList.of(
170
+ "boolean", "long", "double", "string", "timestamp", "json"
171
+ );
172
+
155
173
  public static interface RunResult
156
174
  {
157
175
  ConfigDiff getConfigDiff();
@@ -167,38 +185,341 @@ public class TestingEmbulk
167
185
  List<TaskReport> getOutputTaskReports();
168
186
  }
169
187
 
170
- public RunResult runInput(ConfigSource inConfig, Path outputPath)
171
- throws IOException
188
+ public class InputBuilder
172
189
  {
173
- String fileName = outputPath.getFileName().toString();
174
- checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
175
- Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
190
+ private ConfigSource inConfig = null;
191
+ private List<ConfigSource> filtersConfig = ImmutableList.of();
192
+ private ConfigSource execConfig = newConfig();
193
+ private Path outputPath = null;
176
194
 
177
- Files.createDirectories(dir);
195
+ private InputBuilder()
196
+ { }
178
197
 
179
- ConfigSource execConfig = newConfig()
180
- .set("min_output_tasks", 1);
198
+ public InputBuilder in(ConfigSource inConfig)
199
+ {
200
+ checkNotNull(inConfig, "inConfig");
201
+ this.inConfig = inConfig.deepCopy();
202
+ return this;
203
+ }
181
204
 
182
- ConfigSource outConfig = newConfig()
183
- .set("type", "file")
184
- .set("path_prefix", dir.resolve("fragments_").toString())
185
- .set("file_ext", "csv")
186
- .set("formatter", newConfig()
205
+ public InputBuilder filters(List<ConfigSource> filtersConfig)
206
+ {
207
+ checkNotNull(filtersConfig, "filtersConfig");
208
+ ImmutableList.Builder<ConfigSource> builder = ImmutableList.builder();
209
+ for (ConfigSource filter : filtersConfig) {
210
+ builder.add(filter.deepCopy());
211
+ }
212
+ this.filtersConfig = builder.build();
213
+ return this;
214
+ }
215
+
216
+ public InputBuilder exec(ConfigSource execConfig)
217
+ {
218
+ checkNotNull(execConfig, "execConfig");
219
+ this.execConfig = execConfig.deepCopy();
220
+ return this;
221
+ }
222
+
223
+ public InputBuilder outputPath(Path outputPath)
224
+ {
225
+ checkNotNull(outputPath, "outputPath");
226
+ this.outputPath = outputPath;
227
+ return this;
228
+ }
229
+
230
+ public ConfigDiff guess()
231
+ {
232
+ checkState(inConfig != null, "in config must be set");
233
+
234
+ // config = {exec: execConfig, in: inConfig}
235
+ ConfigSource config = newConfig()
236
+ .set("exec", execConfig)
237
+ .set("in", inConfig)
238
+ .set("filters", filtersConfig);
239
+
240
+ // embed.guess returns GuessExecutor.ConfigDiff
241
+ return embed.guess(config).getNested("in");
242
+ }
243
+
244
+ public RunResult run()
245
+ throws IOException
246
+ {
247
+ checkState(inConfig != null, "in config must be set");
248
+ checkState(outputPath != null, "outputPath must be set");
249
+
250
+ String fileName = outputPath.getFileName().toString();
251
+ checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
252
+ Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
253
+
254
+ Files.createDirectories(dir);
255
+
256
+ // exec: config
257
+ execConfig.set("min_output_tasks", 1);
258
+
259
+ // out: config
260
+ ConfigSource outConfig = newConfig()
261
+ .set("type", "file")
262
+ .set("path_prefix", dir.resolve("fragments_").toString())
263
+ .set("file_ext", "csv")
264
+ .set("formatter", newConfig()
265
+ .set("type", "csv")
266
+ .set("header_line", false)
267
+ .set("newline", "LF"));
268
+
269
+ // combine exec:, out: and in:
270
+ ConfigSource config = newConfig()
271
+ .set("exec", execConfig)
272
+ .set("in", inConfig)
273
+ .set("filters", filtersConfig)
274
+ .set("out", outConfig);
275
+
276
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
277
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
278
+ RunResult result = (RunResult) embed.run(config);
279
+
280
+ return buildRunResultWithOutput(result, dir, outputPath);
281
+ }
282
+ }
283
+
284
+ public class ParserBuilder
285
+ {
286
+ private ConfigSource parserConfig = newConfig();
287
+ private ConfigSource execConfig = newConfig();
288
+ private Path inputPath = null;
289
+ private Path outputPath = null;
290
+
291
+ private ParserBuilder()
292
+ { }
293
+
294
+ public ParserBuilder parser(ConfigSource parserConfig)
295
+ {
296
+ checkNotNull(parserConfig, "parserConfig");
297
+ this.parserConfig = parserConfig.deepCopy();
298
+ return this;
299
+ }
300
+
301
+ public ParserBuilder exec(ConfigSource execConfig)
302
+ {
303
+ checkNotNull(execConfig, "execConfig");
304
+ this.execConfig = execConfig.deepCopy();
305
+ return this;
306
+ }
307
+
308
+ public ParserBuilder inputPath(Path inputPath)
309
+ {
310
+ checkNotNull(inputPath, "inputPath");
311
+ this.inputPath = inputPath;
312
+ return this;
313
+ }
314
+
315
+ public ParserBuilder inputResource(String resourceName)
316
+ throws IOException
317
+ {
318
+ checkNotNull(resourceName, "resourceName");
319
+ Path path = createTempFile("csv");
320
+ copyResource(resourceName, path);
321
+ return inputPath(path);
322
+ }
323
+
324
+ public ParserBuilder outputPath(Path outputPath)
325
+ {
326
+ checkNotNull(outputPath, "outputPath");
327
+ this.outputPath = outputPath;
328
+ return this;
329
+ }
330
+
331
+ public ConfigDiff guess()
332
+ {
333
+ checkState(inputPath != null, "inputPath must be set");
334
+
335
+ // in: config
336
+ ConfigSource inConfig = newConfig()
337
+ .set("type", "file")
338
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
339
+ inConfig.set("parser", parserConfig);
340
+
341
+ // config = {exec: execConfig, in: inConfig}
342
+ ConfigSource config = newConfig()
343
+ .set("exec", execConfig)
344
+ .set("in", inConfig);
345
+
346
+ // embed.guess calls GuessExecutor and returns ConfigDiff
347
+ return embed.guess(config).getNested("in").getNested("parser");
348
+ }
349
+
350
+ public RunResult run()
351
+ throws IOException
352
+ {
353
+ checkState(parserConfig != null, "parser config must be set");
354
+ checkState(inputPath != null, "inputPath must be set");
355
+ checkState(outputPath != null, "outputPath must be set");
356
+
357
+ String fileName = outputPath.getFileName().toString();
358
+ checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
359
+ Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
360
+
361
+ Files.createDirectories(dir);
362
+
363
+ // in: config
364
+ ConfigSource inConfig = newConfig()
365
+ .set("type", "file")
366
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
367
+ inConfig.set("parser", parserConfig);
368
+
369
+ // exec: config
370
+ execConfig.set("min_output_tasks", 1);
371
+
372
+ // out: config
373
+ ConfigSource outConfig = newConfig()
374
+ .set("type", "file")
375
+ .set("path_prefix", dir.resolve("fragments_").toString())
376
+ .set("file_ext", "csv")
377
+ .set("formatter", newConfig()
378
+ .set("type", "csv")
379
+ .set("header_line", false)
380
+ .set("newline", "LF"));
381
+
382
+ // config = {exec: execConfig, in: inConfig, out: outConfig}
383
+ ConfigSource config = newConfig()
384
+ .set("exec", execConfig)
385
+ .set("in", inConfig)
386
+ .set("out", outConfig);
387
+
388
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
389
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
390
+ RunResult result = (RunResult) embed.run(config);
391
+
392
+ return buildRunResultWithOutput(result, dir, outputPath);
393
+ }
394
+ }
395
+
396
+ public class OutputBuilder
397
+ {
398
+ private ConfigSource outConfig = null;
399
+ private ConfigSource execConfig = newConfig();
400
+ private Path inputPath;
401
+ private SchemaConfig inputSchema;
402
+
403
+ public OutputBuilder()
404
+ { }
405
+
406
+ public OutputBuilder out(ConfigSource outConfig)
407
+ {
408
+ checkNotNull(outConfig, "outConfig");
409
+ this.outConfig = outConfig;
410
+ return this;
411
+ }
412
+
413
+ public OutputBuilder exec(ConfigSource execConfig)
414
+ {
415
+ checkNotNull(execConfig, "execConfig");
416
+ this.execConfig = execConfig;
417
+ return this;
418
+ }
419
+
420
+ public OutputBuilder inputPath(Path inputPath)
421
+ {
422
+ checkNotNull(inputPath, "inputPath");
423
+ this.inputPath = inputPath;
424
+ return this;
425
+ }
426
+
427
+ public OutputBuilder inputResource(String resourceName)
428
+ throws IOException
429
+ {
430
+ checkNotNull(resourceName, "resourceName");
431
+ Path path = createTempFile("csv");
432
+ copyResource(resourceName, path);
433
+ return inputPath(path);
434
+ }
435
+
436
+ public OutputBuilder inputSchema(SchemaConfig inputSchema)
437
+ {
438
+ checkNotNull(inputSchema, "inputSchema");
439
+ this.inputSchema = inputSchema;
440
+ return this;
441
+ }
442
+
443
+ public RunResult run()
444
+ throws IOException
445
+ {
446
+ checkState(outConfig != null, "out config must be set");
447
+ checkState(inputPath != null, "inputPath must be set");
448
+
449
+ String fileName = inputPath.toAbsolutePath().toString();
450
+ checkArgument(fileName.endsWith(".csv"), "inputPath must end with .csv");
451
+
452
+ // exec: config
453
+ execConfig.set("min_output_tasks", 1);
454
+
455
+ // in: config
456
+ ConfigSource inConfig = newConfig()
457
+ .set("type", "file")
458
+ .set("path_prefix", fileName)
459
+ .set("parser", newParserConfig());
460
+
461
+ // config = {exec: execConfig, in: inConfig, out: outConfig}
462
+ ConfigSource config = newConfig()
463
+ .set("exec", execConfig)
464
+ .set("in", inConfig)
465
+ .set("out", outConfig);
466
+
467
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
468
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
469
+ return (RunResult) embed.run(config);
470
+ }
471
+
472
+ private ConfigSource newParserConfig()
473
+ {
474
+ return newConfig()
475
+ .set("charset", "UTF-8")
476
+ .set("newline", "LF")
187
477
  .set("type", "csv")
188
- .set("header_line", false)
189
- .set("newline", "LF"));
478
+ .set("delimiter", ",")
479
+ .set("quote", "\"")
480
+ .set("escape", "\"")
481
+ .set("columns", newSchemaConfig());
482
+ }
190
483
 
191
- ConfigSource config = newConfig()
192
- .set("exec", execConfig)
193
- .set("in", inConfig)
194
- .set("out", outConfig);
484
+ private SchemaConfig newSchemaConfig()
485
+ {
486
+ ImmutableList.Builder<ColumnConfig> schema = ImmutableList.builder();
487
+ try (BufferedReader reader = newBufferedReader(inputPath, UTF_8)) {
488
+ for (String column : reader.readLine().split(",")) {
489
+ ColumnConfig columnConfig = newColumnConfig(column);
490
+ if (columnConfig != null) {
491
+ schema.add(columnConfig);
492
+ }
493
+ }
494
+ return new SchemaConfig(schema.build());
495
+ }
496
+ catch (IOException e) {
497
+ throw Throwables.propagate(e);
498
+ }
499
+ }
195
500
 
196
- // embed.run returns TestingBulkLoader.TestingExecutionResult because
197
- RunResult result = (RunResult) embed.run(config);
501
+ private ColumnConfig newColumnConfig(String column)
502
+ {
503
+ String[] tuple = column.split(":", 2);
504
+ checkArgument(tuple.length == 2, "tuple must be a pair of column name and type");
505
+ String type = tuple[1];
506
+ if (!SUPPORTED_TYPES.contains(type)) {
507
+ throw new IllegalArgumentException(String.format(ENGLISH,
508
+ "Unknown column type %s. Supported types are boolean, long, double, string, timestamp and json: %s",
509
+ tuple[1], column));
510
+ }
511
+ return new ColumnConfig(newConfig()
512
+ .set("name", tuple[0])
513
+ .set("type", type));
514
+ }
515
+ }
198
516
 
517
+ private RunResult buildRunResultWithOutput(RunResult result, Path outputDir, Path outputPath)
518
+ throws IOException
519
+ {
199
520
  try (OutputStream out = Files.newOutputStream(outputPath)) {
200
521
  List<Path> fragments = new ArrayList<Path>();
201
- try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "fragments_*.csv")) {
522
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(outputDir, "fragments_*.csv")) {
202
523
  for (Path fragment : stream) {
203
524
  fragments.add(fragment);
204
525
  }
@@ -214,9 +535,118 @@ public class TestingEmbulk
214
535
  return result;
215
536
  }
216
537
 
217
- // TODO add runOutput(ConfigSource outConfig, Path inputPath) where inputPath is a path to a CSV file
218
- // whose column types can be naturally guessed using csv guess plugin. Callers use EmbulkTests.copyResource
219
- // to copy a resource file to a temp file before calling it.
538
+ public InputBuilder inputBuilder()
539
+ {
540
+ return new InputBuilder();
541
+ }
542
+
543
+ public ParserBuilder parserBuilder()
544
+ {
545
+ return new ParserBuilder();
546
+ }
547
+
548
+ public OutputBuilder outputBuilder()
549
+ {
550
+ return new OutputBuilder();
551
+ }
552
+
553
+ public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath)
554
+ throws IOException
555
+ {
556
+ return parserBuilder()
557
+ .parser(parserConfig)
558
+ .inputPath(inputPath)
559
+ .outputPath(outputPath)
560
+ .run();
561
+ }
562
+
563
+ public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath, ConfigSource execConfig)
564
+ throws IOException
565
+ {
566
+ return parserBuilder()
567
+ .parser(parserConfig)
568
+ .inputPath(inputPath)
569
+ .outputPath(outputPath)
570
+ .exec(execConfig)
571
+ .run();
572
+ }
573
+
574
+ public RunResult runInput(ConfigSource inConfig, Path outputPath)
575
+ throws IOException
576
+ {
577
+ return inputBuilder()
578
+ .in(inConfig)
579
+ .outputPath(outputPath)
580
+ .run();
581
+ }
582
+
583
+ public RunResult runInput(ConfigSource inConfig, Path outputPath, ConfigSource execConfig)
584
+ throws IOException
585
+ {
586
+ return inputBuilder()
587
+ .exec(execConfig)
588
+ .in(inConfig)
589
+ .outputPath(outputPath)
590
+ .run();
591
+ }
592
+
593
+ public RunResult runOutput(ConfigSource outConfig, Path inputPath)
594
+ throws IOException
595
+ {
596
+ return outputBuilder()
597
+ .out(outConfig)
598
+ .inputPath(inputPath)
599
+ .run();
600
+ }
601
+
602
+ public RunResult runOutput(ConfigSource outConfig, Path inputPath, ConfigSource execConfig)
603
+ throws IOException
604
+ {
605
+ return outputBuilder()
606
+ .exec(execConfig)
607
+ .out(outConfig)
608
+ .inputPath(inputPath)
609
+ .run();
610
+ }
611
+
612
+ public ConfigDiff guessInput(ConfigSource inSeedConfig)
613
+ {
614
+ return inputBuilder()
615
+ .in(inSeedConfig)
616
+ .guess();
617
+ }
618
+
619
+ public ConfigDiff guessInput(ConfigSource inSeedConfig, ConfigSource execConfig)
620
+ {
621
+ return inputBuilder()
622
+ .exec(execConfig)
623
+ .in(inSeedConfig)
624
+ .guess();
625
+ }
626
+
627
+ public ConfigDiff guessParser(Path inputPath)
628
+ {
629
+ return parserBuilder()
630
+ .inputPath(inputPath)
631
+ .guess();
632
+ }
633
+
634
+ public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath)
635
+ {
636
+ return parserBuilder()
637
+ .parser(parserSeedConfig)
638
+ .inputPath(inputPath)
639
+ .guess();
640
+ }
641
+
642
+ public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath, ConfigSource execConfig)
643
+ {
644
+ return parserBuilder()
645
+ .parser(parserSeedConfig)
646
+ .inputPath(inputPath)
647
+ .exec(execConfig)
648
+ .guess();
649
+ }
220
650
 
221
651
  // TODO add runFilter(ConfigSource filterConfig, Path inputPath, Path outputPath) where inputPath is a path to
222
652
  // a CSV file whose column types can be naturally guessed using csv guess plugin.