embulk 0.8.15-java → 0.8.16-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -1
  3. data/appveyor.yml +8 -0
  4. data/build.gradle +86 -45
  5. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
  6. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
  7. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
  8. data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
  9. data/embulk-docs/src/built-in.rst +34 -0
  10. data/embulk-docs/src/release.rst +1 -0
  11. data/embulk-docs/src/release/release-0.8.16.rst +43 -0
  12. data/embulk-standards/build.gradle +1 -0
  13. data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
  14. data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
  15. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
  16. data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
  17. data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
  18. data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
  19. data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
  20. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
  21. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
  22. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
  23. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
  24. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
  25. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
  26. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
  27. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
  28. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
  29. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
  30. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
  31. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
  32. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
  33. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
  34. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
  35. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
  36. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
  37. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
  38. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
  39. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
  40. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
  41. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
  42. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
  43. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
  44. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
  45. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
  46. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
  47. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
  48. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
  49. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
  50. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
  51. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
  52. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
  53. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
  54. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
  55. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
  56. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
  57. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
  58. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
  59. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
  60. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
  61. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
  62. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
  63. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
  64. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
  65. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
  66. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
  67. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
  68. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
  69. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
  70. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
  71. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
  72. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
  73. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
  74. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
  75. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
  76. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
  77. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
  78. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
  79. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
  80. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
  81. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
  82. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
  83. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
  84. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
  85. data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
  86. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
  87. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
  88. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
  89. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
  90. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
  91. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
  92. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
  93. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
  94. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
  95. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
  96. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
  97. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
  98. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
  99. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
  100. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
  101. data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
  102. data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
  103. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  104. data/gradle/wrapper/gradle-wrapper.properties +2 -2
  105. data/gradlew +30 -21
  106. data/gradlew.bat +4 -10
  107. data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
  108. data/lib/embulk/data/new/java/build.gradle.erb +5 -3
  109. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
  110. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
  111. data/lib/embulk/data/new/java/gradlew +30 -21
  112. data/lib/embulk/data/new/java/gradlew.bat +4 -10
  113. data/lib/embulk/guess/csv.rb +44 -22
  114. data/lib/embulk/guess/newline.rb +10 -4
  115. data/lib/embulk/guess_plugin.rb +3 -1
  116. data/lib/embulk/java/time_helper.rb +2 -2
  117. data/lib/embulk/version.rb +1 -1
  118. metadata +92 -5
@@ -0,0 +1,4 @@
1
+ 1,32864,20150127
2
+ 2,14824,20150127
3
+ 3,27559,20150128
4
+ 4,11270,20150129
@@ -0,0 +1,18 @@
1
+ type: file
2
+ parser:
3
+ charset: UTF-8
4
+ newline: CRLF
5
+ type: csv
6
+ delimiter: ','
7
+ quote: '"'
8
+ escape: '"'
9
+ trim_if_not_quoted: false
10
+ skip_header_lines: 1
11
+ allow_extra_columns: false
12
+ allow_optional_columns: false
13
+ columns:
14
+ - {name: id, type: long}
15
+ - {name: account, type: long}
16
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
17
+ - {name: purchase, type: string}
18
+ - {name: comment, type: string}
@@ -0,0 +1,3 @@
1
+ type: remove_columns
2
+ remove: [invalid, time, comment]
3
+ accept_unmatched_columns: true
@@ -1,11 +1,14 @@
1
1
  package org.embulk.test;
2
2
 
3
+ import com.google.common.base.Throwables;
3
4
  import com.google.common.collect.ImmutableList;
4
- import com.google.common.base.Optional;
5
+ import com.google.common.collect.Lists;
5
6
  import com.google.common.io.ByteStreams;
6
7
  import com.google.inject.Binder;
7
8
  import com.google.inject.Injector;
8
9
  import com.google.inject.Module;
10
+
11
+ import java.io.BufferedReader;
9
12
  import java.io.IOException;
10
13
  import java.io.InputStream;
11
14
  import java.io.OutputStream;
@@ -15,22 +18,33 @@ import java.nio.file.Path;
15
18
  import java.util.ArrayList;
16
19
  import java.util.Collections;
17
20
  import java.util.List;
21
+
18
22
  import org.embulk.EmbulkEmbed;
19
- import org.embulk.config.Config;
20
23
  import org.embulk.config.ConfigDiff;
21
24
  import org.embulk.config.ConfigLoader;
22
25
  import org.embulk.config.ConfigSource;
26
+ import org.embulk.config.ModelManager;
23
27
  import org.embulk.config.TaskReport;
24
- import org.embulk.exec.ExecutionResult;
28
+ import org.embulk.spi.ColumnConfig;
25
29
  import org.embulk.spi.Schema;
30
+ import org.embulk.spi.SchemaConfig;
26
31
  import org.embulk.spi.TempFileException;
27
32
  import org.embulk.spi.TempFileSpace;
33
+ import org.embulk.spi.type.Type;
28
34
  import org.junit.rules.TestRule;
29
35
  import org.junit.rules.TestWatcher;
30
36
  import org.junit.runner.Description;
31
37
  import org.junit.runners.model.Statement;
32
38
  import static com.google.common.base.Preconditions.checkArgument;
39
+ import static java.nio.charset.StandardCharsets.UTF_8;
40
+ import static java.nio.file.Files.newBufferedReader;
41
+ import static java.util.Locale.ENGLISH;
42
+ import static com.google.common.base.Preconditions.checkNotNull;
43
+ import static com.google.common.base.Preconditions.checkState;
44
+ import static java.nio.charset.StandardCharsets.UTF_8;
45
+ import static java.nio.file.Files.newBufferedReader;
33
46
  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
47
+ import static org.embulk.test.EmbulkTests.copyResource;
34
48
 
35
49
  public class TestingEmbulk
36
50
  implements TestRule
@@ -152,6 +166,10 @@ public class TestingEmbulk
152
166
  .fromYamlString(EmbulkTests.readResource(name));
153
167
  }
154
168
 
169
+ private static final List<String> SUPPORTED_TYPES = ImmutableList.of(
170
+ "boolean", "long", "double", "string", "timestamp", "json"
171
+ );
172
+
155
173
  public static interface RunResult
156
174
  {
157
175
  ConfigDiff getConfigDiff();
@@ -167,38 +185,341 @@ public class TestingEmbulk
167
185
  List<TaskReport> getOutputTaskReports();
168
186
  }
169
187
 
170
- public RunResult runInput(ConfigSource inConfig, Path outputPath)
171
- throws IOException
188
+ public class InputBuilder
172
189
  {
173
- String fileName = outputPath.getFileName().toString();
174
- checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
175
- Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
190
+ private ConfigSource inConfig = null;
191
+ private List<ConfigSource> filtersConfig = ImmutableList.of();
192
+ private ConfigSource execConfig = newConfig();
193
+ private Path outputPath = null;
176
194
 
177
- Files.createDirectories(dir);
195
+ private InputBuilder()
196
+ { }
178
197
 
179
- ConfigSource execConfig = newConfig()
180
- .set("min_output_tasks", 1);
198
+ public InputBuilder in(ConfigSource inConfig)
199
+ {
200
+ checkNotNull(inConfig, "inConfig");
201
+ this.inConfig = inConfig.deepCopy();
202
+ return this;
203
+ }
181
204
 
182
- ConfigSource outConfig = newConfig()
183
- .set("type", "file")
184
- .set("path_prefix", dir.resolve("fragments_").toString())
185
- .set("file_ext", "csv")
186
- .set("formatter", newConfig()
205
+ public InputBuilder filters(List<ConfigSource> filtersConfig)
206
+ {
207
+ checkNotNull(filtersConfig, "filtersConfig");
208
+ ImmutableList.Builder<ConfigSource> builder = ImmutableList.builder();
209
+ for (ConfigSource filter : filtersConfig) {
210
+ builder.add(filter.deepCopy());
211
+ }
212
+ this.filtersConfig = builder.build();
213
+ return this;
214
+ }
215
+
216
+ public InputBuilder exec(ConfigSource execConfig)
217
+ {
218
+ checkNotNull(execConfig, "execConfig");
219
+ this.execConfig = execConfig.deepCopy();
220
+ return this;
221
+ }
222
+
223
+ public InputBuilder outputPath(Path outputPath)
224
+ {
225
+ checkNotNull(outputPath, "outputPath");
226
+ this.outputPath = outputPath;
227
+ return this;
228
+ }
229
+
230
+ public ConfigDiff guess()
231
+ {
232
+ checkState(inConfig != null, "in config must be set");
233
+
234
+ // config = {exec: execConfig, in: inConfig}
235
+ ConfigSource config = newConfig()
236
+ .set("exec", execConfig)
237
+ .set("in", inConfig)
238
+ .set("filters", filtersConfig);
239
+
240
+ // embed.guess returns GuessExecutor.ConfigDiff
241
+ return embed.guess(config).getNested("in");
242
+ }
243
+
244
+ public RunResult run()
245
+ throws IOException
246
+ {
247
+ checkState(inConfig != null, "in config must be set");
248
+ checkState(outputPath != null, "outputPath must be set");
249
+
250
+ String fileName = outputPath.getFileName().toString();
251
+ checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
252
+ Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
253
+
254
+ Files.createDirectories(dir);
255
+
256
+ // exec: config
257
+ execConfig.set("min_output_tasks", 1);
258
+
259
+ // out: config
260
+ ConfigSource outConfig = newConfig()
261
+ .set("type", "file")
262
+ .set("path_prefix", dir.resolve("fragments_").toString())
263
+ .set("file_ext", "csv")
264
+ .set("formatter", newConfig()
265
+ .set("type", "csv")
266
+ .set("header_line", false)
267
+ .set("newline", "LF"));
268
+
269
+ // combine exec:, out: and in:
270
+ ConfigSource config = newConfig()
271
+ .set("exec", execConfig)
272
+ .set("in", inConfig)
273
+ .set("filters", filtersConfig)
274
+ .set("out", outConfig);
275
+
276
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
277
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
278
+ RunResult result = (RunResult) embed.run(config);
279
+
280
+ return buildRunResultWithOutput(result, dir, outputPath);
281
+ }
282
+ }
283
+
284
+ public class ParserBuilder
285
+ {
286
+ private ConfigSource parserConfig = newConfig();
287
+ private ConfigSource execConfig = newConfig();
288
+ private Path inputPath = null;
289
+ private Path outputPath = null;
290
+
291
+ private ParserBuilder()
292
+ { }
293
+
294
+ public ParserBuilder parser(ConfigSource parserConfig)
295
+ {
296
+ checkNotNull(parserConfig, "parserConfig");
297
+ this.parserConfig = parserConfig.deepCopy();
298
+ return this;
299
+ }
300
+
301
+ public ParserBuilder exec(ConfigSource execConfig)
302
+ {
303
+ checkNotNull(execConfig, "execConfig");
304
+ this.execConfig = execConfig.deepCopy();
305
+ return this;
306
+ }
307
+
308
+ public ParserBuilder inputPath(Path inputPath)
309
+ {
310
+ checkNotNull(inputPath, "inputPath");
311
+ this.inputPath = inputPath;
312
+ return this;
313
+ }
314
+
315
+ public ParserBuilder inputResource(String resourceName)
316
+ throws IOException
317
+ {
318
+ checkNotNull(resourceName, "resourceName");
319
+ Path path = createTempFile("csv");
320
+ copyResource(resourceName, path);
321
+ return inputPath(path);
322
+ }
323
+
324
+ public ParserBuilder outputPath(Path outputPath)
325
+ {
326
+ checkNotNull(outputPath, "outputPath");
327
+ this.outputPath = outputPath;
328
+ return this;
329
+ }
330
+
331
+ public ConfigDiff guess()
332
+ {
333
+ checkState(inputPath != null, "inputPath must be set");
334
+
335
+ // in: config
336
+ ConfigSource inConfig = newConfig()
337
+ .set("type", "file")
338
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
339
+ inConfig.set("parser", parserConfig);
340
+
341
+ // config = {exec: execConfig, in: inConfig}
342
+ ConfigSource config = newConfig()
343
+ .set("exec", execConfig)
344
+ .set("in", inConfig);
345
+
346
+ // embed.guess calls GuessExecutor and returns ConfigDiff
347
+ return embed.guess(config).getNested("in").getNested("parser");
348
+ }
349
+
350
+ public RunResult run()
351
+ throws IOException
352
+ {
353
+ checkState(parserConfig != null, "parser config must be set");
354
+ checkState(inputPath != null, "inputPath must be set");
355
+ checkState(outputPath != null, "outputPath must be set");
356
+
357
+ String fileName = outputPath.getFileName().toString();
358
+ checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
359
+ Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
360
+
361
+ Files.createDirectories(dir);
362
+
363
+ // in: config
364
+ ConfigSource inConfig = newConfig()
365
+ .set("type", "file")
366
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
367
+ inConfig.set("parser", parserConfig);
368
+
369
+ // exec: config
370
+ execConfig.set("min_output_tasks", 1);
371
+
372
+ // out: config
373
+ ConfigSource outConfig = newConfig()
374
+ .set("type", "file")
375
+ .set("path_prefix", dir.resolve("fragments_").toString())
376
+ .set("file_ext", "csv")
377
+ .set("formatter", newConfig()
378
+ .set("type", "csv")
379
+ .set("header_line", false)
380
+ .set("newline", "LF"));
381
+
382
+ // config = {exec: execConfig, in: inConfig, out: outConfig}
383
+ ConfigSource config = newConfig()
384
+ .set("exec", execConfig)
385
+ .set("in", inConfig)
386
+ .set("out", outConfig);
387
+
388
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
389
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
390
+ RunResult result = (RunResult) embed.run(config);
391
+
392
+ return buildRunResultWithOutput(result, dir, outputPath);
393
+ }
394
+ }
395
+
396
+ public class OutputBuilder
397
+ {
398
+ private ConfigSource outConfig = null;
399
+ private ConfigSource execConfig = newConfig();
400
+ private Path inputPath;
401
+ private SchemaConfig inputSchema;
402
+
403
+ public OutputBuilder()
404
+ { }
405
+
406
+ public OutputBuilder out(ConfigSource outConfig)
407
+ {
408
+ checkNotNull(outConfig, "outConfig");
409
+ this.outConfig = outConfig;
410
+ return this;
411
+ }
412
+
413
+ public OutputBuilder exec(ConfigSource execConfig)
414
+ {
415
+ checkNotNull(execConfig, "execConfig");
416
+ this.execConfig = execConfig;
417
+ return this;
418
+ }
419
+
420
+ public OutputBuilder inputPath(Path inputPath)
421
+ {
422
+ checkNotNull(inputPath, "inputPath");
423
+ this.inputPath = inputPath;
424
+ return this;
425
+ }
426
+
427
+ public OutputBuilder inputResource(String resourceName)
428
+ throws IOException
429
+ {
430
+ checkNotNull(resourceName, "resourceName");
431
+ Path path = createTempFile("csv");
432
+ copyResource(resourceName, path);
433
+ return inputPath(path);
434
+ }
435
+
436
+ public OutputBuilder inputSchema(SchemaConfig inputSchema)
437
+ {
438
+ checkNotNull(inputSchema, "inputSchema");
439
+ this.inputSchema = inputSchema;
440
+ return this;
441
+ }
442
+
443
+ public RunResult run()
444
+ throws IOException
445
+ {
446
+ checkState(outConfig != null, "out config must be set");
447
+ checkState(inputPath != null, "inputPath must be set");
448
+
449
+ String fileName = inputPath.toAbsolutePath().toString();
450
+ checkArgument(fileName.endsWith(".csv"), "inputPath must end with .csv");
451
+
452
+ // exec: config
453
+ execConfig.set("min_output_tasks", 1);
454
+
455
+ // in: config
456
+ ConfigSource inConfig = newConfig()
457
+ .set("type", "file")
458
+ .set("path_prefix", fileName)
459
+ .set("parser", newParserConfig());
460
+
461
+ // config = {exec: execConfig, in: inConfig, out: outConfig}
462
+ ConfigSource config = newConfig()
463
+ .set("exec", execConfig)
464
+ .set("in", inConfig)
465
+ .set("out", outConfig);
466
+
467
+ // embed.run returns TestingBulkLoader.TestingExecutionResult because
468
+ // LoaderState.buildExecuteResultWithWarningException is overridden.
469
+ return (RunResult) embed.run(config);
470
+ }
471
+
472
+ private ConfigSource newParserConfig()
473
+ {
474
+ return newConfig()
475
+ .set("charset", "UTF-8")
476
+ .set("newline", "LF")
187
477
  .set("type", "csv")
188
- .set("header_line", false)
189
- .set("newline", "LF"));
478
+ .set("delimiter", ",")
479
+ .set("quote", "\"")
480
+ .set("escape", "\"")
481
+ .set("columns", newSchemaConfig());
482
+ }
190
483
 
191
- ConfigSource config = newConfig()
192
- .set("exec", execConfig)
193
- .set("in", inConfig)
194
- .set("out", outConfig);
484
+ private SchemaConfig newSchemaConfig()
485
+ {
486
+ ImmutableList.Builder<ColumnConfig> schema = ImmutableList.builder();
487
+ try (BufferedReader reader = newBufferedReader(inputPath, UTF_8)) {
488
+ for (String column : reader.readLine().split(",")) {
489
+ ColumnConfig columnConfig = newColumnConfig(column);
490
+ if (columnConfig != null) {
491
+ schema.add(columnConfig);
492
+ }
493
+ }
494
+ return new SchemaConfig(schema.build());
495
+ }
496
+ catch (IOException e) {
497
+ throw Throwables.propagate(e);
498
+ }
499
+ }
195
500
 
196
- // embed.run returns TestingBulkLoader.TestingExecutionResult because
197
- RunResult result = (RunResult) embed.run(config);
501
+ private ColumnConfig newColumnConfig(String column)
502
+ {
503
+ String[] tuple = column.split(":", 2);
504
+ checkArgument(tuple.length == 2, "tuple must be a pair of column name and type");
505
+ String type = tuple[1];
506
+ if (!SUPPORTED_TYPES.contains(type)) {
507
+ throw new IllegalArgumentException(String.format(ENGLISH,
508
+ "Unknown column type %s. Supported types are boolean, long, double, string, timestamp and json: %s",
509
+ tuple[1], column));
510
+ }
511
+ return new ColumnConfig(newConfig()
512
+ .set("name", tuple[0])
513
+ .set("type", type));
514
+ }
515
+ }
198
516
 
517
+ private RunResult buildRunResultWithOutput(RunResult result, Path outputDir, Path outputPath)
518
+ throws IOException
519
+ {
199
520
  try (OutputStream out = Files.newOutputStream(outputPath)) {
200
521
  List<Path> fragments = new ArrayList<Path>();
201
- try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "fragments_*.csv")) {
522
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(outputDir, "fragments_*.csv")) {
202
523
  for (Path fragment : stream) {
203
524
  fragments.add(fragment);
204
525
  }
@@ -214,9 +535,118 @@ public class TestingEmbulk
214
535
  return result;
215
536
  }
216
537
 
217
- // TODO add runOutput(ConfigSource outConfig, Path inputPath) where inputPath is a path to a CSV file
218
- // whose column types can be naturally guessed using csv guess plugin. Callers use EmbulkTests.copyResource
219
- // to copy a resource file to a temp file before calling it.
538
+ public InputBuilder inputBuilder()
539
+ {
540
+ return new InputBuilder();
541
+ }
542
+
543
+ public ParserBuilder parserBuilder()
544
+ {
545
+ return new ParserBuilder();
546
+ }
547
+
548
+ public OutputBuilder outputBuilder()
549
+ {
550
+ return new OutputBuilder();
551
+ }
552
+
553
+ public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath)
554
+ throws IOException
555
+ {
556
+ return parserBuilder()
557
+ .parser(parserConfig)
558
+ .inputPath(inputPath)
559
+ .outputPath(outputPath)
560
+ .run();
561
+ }
562
+
563
+ public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath, ConfigSource execConfig)
564
+ throws IOException
565
+ {
566
+ return parserBuilder()
567
+ .parser(parserConfig)
568
+ .inputPath(inputPath)
569
+ .outputPath(outputPath)
570
+ .exec(execConfig)
571
+ .run();
572
+ }
573
+
574
+ public RunResult runInput(ConfigSource inConfig, Path outputPath)
575
+ throws IOException
576
+ {
577
+ return inputBuilder()
578
+ .in(inConfig)
579
+ .outputPath(outputPath)
580
+ .run();
581
+ }
582
+
583
+ public RunResult runInput(ConfigSource inConfig, Path outputPath, ConfigSource execConfig)
584
+ throws IOException
585
+ {
586
+ return inputBuilder()
587
+ .exec(execConfig)
588
+ .in(inConfig)
589
+ .outputPath(outputPath)
590
+ .run();
591
+ }
592
+
593
+ public RunResult runOutput(ConfigSource outConfig, Path inputPath)
594
+ throws IOException
595
+ {
596
+ return outputBuilder()
597
+ .out(outConfig)
598
+ .inputPath(inputPath)
599
+ .run();
600
+ }
601
+
602
+ public RunResult runOutput(ConfigSource outConfig, Path inputPath, ConfigSource execConfig)
603
+ throws IOException
604
+ {
605
+ return outputBuilder()
606
+ .exec(execConfig)
607
+ .out(outConfig)
608
+ .inputPath(inputPath)
609
+ .run();
610
+ }
611
+
612
+ public ConfigDiff guessInput(ConfigSource inSeedConfig)
613
+ {
614
+ return inputBuilder()
615
+ .in(inSeedConfig)
616
+ .guess();
617
+ }
618
+
619
+ public ConfigDiff guessInput(ConfigSource inSeedConfig, ConfigSource execConfig)
620
+ {
621
+ return inputBuilder()
622
+ .exec(execConfig)
623
+ .in(inSeedConfig)
624
+ .guess();
625
+ }
626
+
627
+ public ConfigDiff guessParser(Path inputPath)
628
+ {
629
+ return parserBuilder()
630
+ .inputPath(inputPath)
631
+ .guess();
632
+ }
633
+
634
+ public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath)
635
+ {
636
+ return parserBuilder()
637
+ .parser(parserSeedConfig)
638
+ .inputPath(inputPath)
639
+ .guess();
640
+ }
641
+
642
+ public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath, ConfigSource execConfig)
643
+ {
644
+ return parserBuilder()
645
+ .parser(parserSeedConfig)
646
+ .inputPath(inputPath)
647
+ .exec(execConfig)
648
+ .guess();
649
+ }
220
650
 
221
651
  // TODO add runFilter(ConfigSource filterConfig, Path inputPath, Path outputPath) where inputPath is a path to
222
652
  // a CSV file whose column types can be naturally guessed using csv guess plugin.