embulk 0.8.15 → 0.8.16
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/appveyor.yml +8 -0
- data/build.gradle +86 -45
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
- data/embulk-docs/src/built-in.rst +34 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.16.rst +43 -0
- data/embulk-standards/build.gradle +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/gradlew +30 -21
- data/gradlew.bat +4 -10
- data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
- data/lib/embulk/data/new/java/build.gradle.erb +5 -3
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/lib/embulk/data/new/java/gradlew +30 -21
- data/lib/embulk/data/new/java/gradlew.bat +4 -10
- data/lib/embulk/guess/csv.rb +44 -22
- data/lib/embulk/guess/newline.rb +10 -4
- data/lib/embulk/guess_plugin.rb +3 -1
- data/lib/embulk/java/time_helper.rb +2 -2
- data/lib/embulk/version.rb +1 -1
- metadata +92 -5
data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
type: file
|
2
|
+
parser:
|
3
|
+
charset: UTF-8
|
4
|
+
newline: CRLF
|
5
|
+
type: csv
|
6
|
+
delimiter: ','
|
7
|
+
quote: '"'
|
8
|
+
escape: '"'
|
9
|
+
trim_if_not_quoted: false
|
10
|
+
skip_header_lines: 1
|
11
|
+
allow_extra_columns: false
|
12
|
+
allow_optional_columns: false
|
13
|
+
columns:
|
14
|
+
- {name: id, type: long}
|
15
|
+
- {name: account, type: long}
|
16
|
+
- {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
|
17
|
+
- {name: purchase, type: string}
|
18
|
+
- {name: comment, type: string}
|
@@ -1,11 +1,14 @@
|
|
1
1
|
package org.embulk.test;
|
2
2
|
|
3
|
+
import com.google.common.base.Throwables;
|
3
4
|
import com.google.common.collect.ImmutableList;
|
4
|
-
import com.google.common.
|
5
|
+
import com.google.common.collect.Lists;
|
5
6
|
import com.google.common.io.ByteStreams;
|
6
7
|
import com.google.inject.Binder;
|
7
8
|
import com.google.inject.Injector;
|
8
9
|
import com.google.inject.Module;
|
10
|
+
|
11
|
+
import java.io.BufferedReader;
|
9
12
|
import java.io.IOException;
|
10
13
|
import java.io.InputStream;
|
11
14
|
import java.io.OutputStream;
|
@@ -15,22 +18,33 @@ import java.nio.file.Path;
|
|
15
18
|
import java.util.ArrayList;
|
16
19
|
import java.util.Collections;
|
17
20
|
import java.util.List;
|
21
|
+
|
18
22
|
import org.embulk.EmbulkEmbed;
|
19
|
-
import org.embulk.config.Config;
|
20
23
|
import org.embulk.config.ConfigDiff;
|
21
24
|
import org.embulk.config.ConfigLoader;
|
22
25
|
import org.embulk.config.ConfigSource;
|
26
|
+
import org.embulk.config.ModelManager;
|
23
27
|
import org.embulk.config.TaskReport;
|
24
|
-
import org.embulk.
|
28
|
+
import org.embulk.spi.ColumnConfig;
|
25
29
|
import org.embulk.spi.Schema;
|
30
|
+
import org.embulk.spi.SchemaConfig;
|
26
31
|
import org.embulk.spi.TempFileException;
|
27
32
|
import org.embulk.spi.TempFileSpace;
|
33
|
+
import org.embulk.spi.type.Type;
|
28
34
|
import org.junit.rules.TestRule;
|
29
35
|
import org.junit.rules.TestWatcher;
|
30
36
|
import org.junit.runner.Description;
|
31
37
|
import org.junit.runners.model.Statement;
|
32
38
|
import static com.google.common.base.Preconditions.checkArgument;
|
39
|
+
import static java.nio.charset.StandardCharsets.UTF_8;
|
40
|
+
import static java.nio.file.Files.newBufferedReader;
|
41
|
+
import static java.util.Locale.ENGLISH;
|
42
|
+
import static com.google.common.base.Preconditions.checkNotNull;
|
43
|
+
import static com.google.common.base.Preconditions.checkState;
|
44
|
+
import static java.nio.charset.StandardCharsets.UTF_8;
|
45
|
+
import static java.nio.file.Files.newBufferedReader;
|
33
46
|
import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
|
47
|
+
import static org.embulk.test.EmbulkTests.copyResource;
|
34
48
|
|
35
49
|
public class TestingEmbulk
|
36
50
|
implements TestRule
|
@@ -152,6 +166,10 @@ public class TestingEmbulk
|
|
152
166
|
.fromYamlString(EmbulkTests.readResource(name));
|
153
167
|
}
|
154
168
|
|
169
|
+
private static final List<String> SUPPORTED_TYPES = ImmutableList.of(
|
170
|
+
"boolean", "long", "double", "string", "timestamp", "json"
|
171
|
+
);
|
172
|
+
|
155
173
|
public static interface RunResult
|
156
174
|
{
|
157
175
|
ConfigDiff getConfigDiff();
|
@@ -167,38 +185,341 @@ public class TestingEmbulk
|
|
167
185
|
List<TaskReport> getOutputTaskReports();
|
168
186
|
}
|
169
187
|
|
170
|
-
public
|
171
|
-
throws IOException
|
188
|
+
public class InputBuilder
|
172
189
|
{
|
173
|
-
|
174
|
-
|
175
|
-
|
190
|
+
private ConfigSource inConfig = null;
|
191
|
+
private List<ConfigSource> filtersConfig = ImmutableList.of();
|
192
|
+
private ConfigSource execConfig = newConfig();
|
193
|
+
private Path outputPath = null;
|
176
194
|
|
177
|
-
|
195
|
+
private InputBuilder()
|
196
|
+
{ }
|
178
197
|
|
179
|
-
|
180
|
-
|
198
|
+
public InputBuilder in(ConfigSource inConfig)
|
199
|
+
{
|
200
|
+
checkNotNull(inConfig, "inConfig");
|
201
|
+
this.inConfig = inConfig.deepCopy();
|
202
|
+
return this;
|
203
|
+
}
|
181
204
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
.
|
186
|
-
|
205
|
+
public InputBuilder filters(List<ConfigSource> filtersConfig)
|
206
|
+
{
|
207
|
+
checkNotNull(filtersConfig, "filtersConfig");
|
208
|
+
ImmutableList.Builder<ConfigSource> builder = ImmutableList.builder();
|
209
|
+
for (ConfigSource filter : filtersConfig) {
|
210
|
+
builder.add(filter.deepCopy());
|
211
|
+
}
|
212
|
+
this.filtersConfig = builder.build();
|
213
|
+
return this;
|
214
|
+
}
|
215
|
+
|
216
|
+
public InputBuilder exec(ConfigSource execConfig)
|
217
|
+
{
|
218
|
+
checkNotNull(execConfig, "execConfig");
|
219
|
+
this.execConfig = execConfig.deepCopy();
|
220
|
+
return this;
|
221
|
+
}
|
222
|
+
|
223
|
+
public InputBuilder outputPath(Path outputPath)
|
224
|
+
{
|
225
|
+
checkNotNull(outputPath, "outputPath");
|
226
|
+
this.outputPath = outputPath;
|
227
|
+
return this;
|
228
|
+
}
|
229
|
+
|
230
|
+
public ConfigDiff guess()
|
231
|
+
{
|
232
|
+
checkState(inConfig != null, "in config must be set");
|
233
|
+
|
234
|
+
// config = {exec: execConfig, in: inConfig}
|
235
|
+
ConfigSource config = newConfig()
|
236
|
+
.set("exec", execConfig)
|
237
|
+
.set("in", inConfig)
|
238
|
+
.set("filters", filtersConfig);
|
239
|
+
|
240
|
+
// embed.guess returns GuessExecutor.ConfigDiff
|
241
|
+
return embed.guess(config).getNested("in");
|
242
|
+
}
|
243
|
+
|
244
|
+
public RunResult run()
|
245
|
+
throws IOException
|
246
|
+
{
|
247
|
+
checkState(inConfig != null, "in config must be set");
|
248
|
+
checkState(outputPath != null, "outputPath must be set");
|
249
|
+
|
250
|
+
String fileName = outputPath.getFileName().toString();
|
251
|
+
checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
|
252
|
+
Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
|
253
|
+
|
254
|
+
Files.createDirectories(dir);
|
255
|
+
|
256
|
+
// exec: config
|
257
|
+
execConfig.set("min_output_tasks", 1);
|
258
|
+
|
259
|
+
// out: config
|
260
|
+
ConfigSource outConfig = newConfig()
|
261
|
+
.set("type", "file")
|
262
|
+
.set("path_prefix", dir.resolve("fragments_").toString())
|
263
|
+
.set("file_ext", "csv")
|
264
|
+
.set("formatter", newConfig()
|
265
|
+
.set("type", "csv")
|
266
|
+
.set("header_line", false)
|
267
|
+
.set("newline", "LF"));
|
268
|
+
|
269
|
+
// combine exec:, out: and in:
|
270
|
+
ConfigSource config = newConfig()
|
271
|
+
.set("exec", execConfig)
|
272
|
+
.set("in", inConfig)
|
273
|
+
.set("filters", filtersConfig)
|
274
|
+
.set("out", outConfig);
|
275
|
+
|
276
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
277
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
278
|
+
RunResult result = (RunResult) embed.run(config);
|
279
|
+
|
280
|
+
return buildRunResultWithOutput(result, dir, outputPath);
|
281
|
+
}
|
282
|
+
}
|
283
|
+
|
284
|
+
public class ParserBuilder
|
285
|
+
{
|
286
|
+
private ConfigSource parserConfig = newConfig();
|
287
|
+
private ConfigSource execConfig = newConfig();
|
288
|
+
private Path inputPath = null;
|
289
|
+
private Path outputPath = null;
|
290
|
+
|
291
|
+
private ParserBuilder()
|
292
|
+
{ }
|
293
|
+
|
294
|
+
public ParserBuilder parser(ConfigSource parserConfig)
|
295
|
+
{
|
296
|
+
checkNotNull(parserConfig, "parserConfig");
|
297
|
+
this.parserConfig = parserConfig.deepCopy();
|
298
|
+
return this;
|
299
|
+
}
|
300
|
+
|
301
|
+
public ParserBuilder exec(ConfigSource execConfig)
|
302
|
+
{
|
303
|
+
checkNotNull(execConfig, "execConfig");
|
304
|
+
this.execConfig = execConfig.deepCopy();
|
305
|
+
return this;
|
306
|
+
}
|
307
|
+
|
308
|
+
public ParserBuilder inputPath(Path inputPath)
|
309
|
+
{
|
310
|
+
checkNotNull(inputPath, "inputPath");
|
311
|
+
this.inputPath = inputPath;
|
312
|
+
return this;
|
313
|
+
}
|
314
|
+
|
315
|
+
public ParserBuilder inputResource(String resourceName)
|
316
|
+
throws IOException
|
317
|
+
{
|
318
|
+
checkNotNull(resourceName, "resourceName");
|
319
|
+
Path path = createTempFile("csv");
|
320
|
+
copyResource(resourceName, path);
|
321
|
+
return inputPath(path);
|
322
|
+
}
|
323
|
+
|
324
|
+
public ParserBuilder outputPath(Path outputPath)
|
325
|
+
{
|
326
|
+
checkNotNull(outputPath, "outputPath");
|
327
|
+
this.outputPath = outputPath;
|
328
|
+
return this;
|
329
|
+
}
|
330
|
+
|
331
|
+
public ConfigDiff guess()
|
332
|
+
{
|
333
|
+
checkState(inputPath != null, "inputPath must be set");
|
334
|
+
|
335
|
+
// in: config
|
336
|
+
ConfigSource inConfig = newConfig()
|
337
|
+
.set("type", "file")
|
338
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
339
|
+
inConfig.set("parser", parserConfig);
|
340
|
+
|
341
|
+
// config = {exec: execConfig, in: inConfig}
|
342
|
+
ConfigSource config = newConfig()
|
343
|
+
.set("exec", execConfig)
|
344
|
+
.set("in", inConfig);
|
345
|
+
|
346
|
+
// embed.guess calls GuessExecutor and returns ConfigDiff
|
347
|
+
return embed.guess(config).getNested("in").getNested("parser");
|
348
|
+
}
|
349
|
+
|
350
|
+
public RunResult run()
|
351
|
+
throws IOException
|
352
|
+
{
|
353
|
+
checkState(parserConfig != null, "parser config must be set");
|
354
|
+
checkState(inputPath != null, "inputPath must be set");
|
355
|
+
checkState(outputPath != null, "outputPath must be set");
|
356
|
+
|
357
|
+
String fileName = outputPath.getFileName().toString();
|
358
|
+
checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
|
359
|
+
Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
|
360
|
+
|
361
|
+
Files.createDirectories(dir);
|
362
|
+
|
363
|
+
// in: config
|
364
|
+
ConfigSource inConfig = newConfig()
|
365
|
+
.set("type", "file")
|
366
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
367
|
+
inConfig.set("parser", parserConfig);
|
368
|
+
|
369
|
+
// exec: config
|
370
|
+
execConfig.set("min_output_tasks", 1);
|
371
|
+
|
372
|
+
// out: config
|
373
|
+
ConfigSource outConfig = newConfig()
|
374
|
+
.set("type", "file")
|
375
|
+
.set("path_prefix", dir.resolve("fragments_").toString())
|
376
|
+
.set("file_ext", "csv")
|
377
|
+
.set("formatter", newConfig()
|
378
|
+
.set("type", "csv")
|
379
|
+
.set("header_line", false)
|
380
|
+
.set("newline", "LF"));
|
381
|
+
|
382
|
+
// config = {exec: execConfig, in: inConfig, out: outConfig}
|
383
|
+
ConfigSource config = newConfig()
|
384
|
+
.set("exec", execConfig)
|
385
|
+
.set("in", inConfig)
|
386
|
+
.set("out", outConfig);
|
387
|
+
|
388
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
389
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
390
|
+
RunResult result = (RunResult) embed.run(config);
|
391
|
+
|
392
|
+
return buildRunResultWithOutput(result, dir, outputPath);
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
public class OutputBuilder
|
397
|
+
{
|
398
|
+
private ConfigSource outConfig = null;
|
399
|
+
private ConfigSource execConfig = newConfig();
|
400
|
+
private Path inputPath;
|
401
|
+
private SchemaConfig inputSchema;
|
402
|
+
|
403
|
+
public OutputBuilder()
|
404
|
+
{ }
|
405
|
+
|
406
|
+
public OutputBuilder out(ConfigSource outConfig)
|
407
|
+
{
|
408
|
+
checkNotNull(outConfig, "outConfig");
|
409
|
+
this.outConfig = outConfig;
|
410
|
+
return this;
|
411
|
+
}
|
412
|
+
|
413
|
+
public OutputBuilder exec(ConfigSource execConfig)
|
414
|
+
{
|
415
|
+
checkNotNull(execConfig, "execConfig");
|
416
|
+
this.execConfig = execConfig;
|
417
|
+
return this;
|
418
|
+
}
|
419
|
+
|
420
|
+
public OutputBuilder inputPath(Path inputPath)
|
421
|
+
{
|
422
|
+
checkNotNull(inputPath, "inputPath");
|
423
|
+
this.inputPath = inputPath;
|
424
|
+
return this;
|
425
|
+
}
|
426
|
+
|
427
|
+
public OutputBuilder inputResource(String resourceName)
|
428
|
+
throws IOException
|
429
|
+
{
|
430
|
+
checkNotNull(resourceName, "resourceName");
|
431
|
+
Path path = createTempFile("csv");
|
432
|
+
copyResource(resourceName, path);
|
433
|
+
return inputPath(path);
|
434
|
+
}
|
435
|
+
|
436
|
+
public OutputBuilder inputSchema(SchemaConfig inputSchema)
|
437
|
+
{
|
438
|
+
checkNotNull(inputSchema, "inputSchema");
|
439
|
+
this.inputSchema = inputSchema;
|
440
|
+
return this;
|
441
|
+
}
|
442
|
+
|
443
|
+
public RunResult run()
|
444
|
+
throws IOException
|
445
|
+
{
|
446
|
+
checkState(outConfig != null, "out config must be set");
|
447
|
+
checkState(inputPath != null, "inputPath must be set");
|
448
|
+
|
449
|
+
String fileName = inputPath.toAbsolutePath().toString();
|
450
|
+
checkArgument(fileName.endsWith(".csv"), "inputPath must end with .csv");
|
451
|
+
|
452
|
+
// exec: config
|
453
|
+
execConfig.set("min_output_tasks", 1);
|
454
|
+
|
455
|
+
// in: config
|
456
|
+
ConfigSource inConfig = newConfig()
|
457
|
+
.set("type", "file")
|
458
|
+
.set("path_prefix", fileName)
|
459
|
+
.set("parser", newParserConfig());
|
460
|
+
|
461
|
+
// config = {exec: execConfig, in: inConfig, out: outConfig}
|
462
|
+
ConfigSource config = newConfig()
|
463
|
+
.set("exec", execConfig)
|
464
|
+
.set("in", inConfig)
|
465
|
+
.set("out", outConfig);
|
466
|
+
|
467
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
468
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
469
|
+
return (RunResult) embed.run(config);
|
470
|
+
}
|
471
|
+
|
472
|
+
private ConfigSource newParserConfig()
|
473
|
+
{
|
474
|
+
return newConfig()
|
475
|
+
.set("charset", "UTF-8")
|
476
|
+
.set("newline", "LF")
|
187
477
|
.set("type", "csv")
|
188
|
-
.set("
|
189
|
-
.set("
|
478
|
+
.set("delimiter", ",")
|
479
|
+
.set("quote", "\"")
|
480
|
+
.set("escape", "\"")
|
481
|
+
.set("columns", newSchemaConfig());
|
482
|
+
}
|
190
483
|
|
191
|
-
|
192
|
-
|
193
|
-
.
|
194
|
-
|
484
|
+
private SchemaConfig newSchemaConfig()
|
485
|
+
{
|
486
|
+
ImmutableList.Builder<ColumnConfig> schema = ImmutableList.builder();
|
487
|
+
try (BufferedReader reader = newBufferedReader(inputPath, UTF_8)) {
|
488
|
+
for (String column : reader.readLine().split(",")) {
|
489
|
+
ColumnConfig columnConfig = newColumnConfig(column);
|
490
|
+
if (columnConfig != null) {
|
491
|
+
schema.add(columnConfig);
|
492
|
+
}
|
493
|
+
}
|
494
|
+
return new SchemaConfig(schema.build());
|
495
|
+
}
|
496
|
+
catch (IOException e) {
|
497
|
+
throw Throwables.propagate(e);
|
498
|
+
}
|
499
|
+
}
|
195
500
|
|
196
|
-
|
197
|
-
|
501
|
+
private ColumnConfig newColumnConfig(String column)
|
502
|
+
{
|
503
|
+
String[] tuple = column.split(":", 2);
|
504
|
+
checkArgument(tuple.length == 2, "tuple must be a pair of column name and type");
|
505
|
+
String type = tuple[1];
|
506
|
+
if (!SUPPORTED_TYPES.contains(type)) {
|
507
|
+
throw new IllegalArgumentException(String.format(ENGLISH,
|
508
|
+
"Unknown column type %s. Supported types are boolean, long, double, string, timestamp and json: %s",
|
509
|
+
tuple[1], column));
|
510
|
+
}
|
511
|
+
return new ColumnConfig(newConfig()
|
512
|
+
.set("name", tuple[0])
|
513
|
+
.set("type", type));
|
514
|
+
}
|
515
|
+
}
|
198
516
|
|
517
|
+
private RunResult buildRunResultWithOutput(RunResult result, Path outputDir, Path outputPath)
|
518
|
+
throws IOException
|
519
|
+
{
|
199
520
|
try (OutputStream out = Files.newOutputStream(outputPath)) {
|
200
521
|
List<Path> fragments = new ArrayList<Path>();
|
201
|
-
try (DirectoryStream<Path> stream = Files.newDirectoryStream(
|
522
|
+
try (DirectoryStream<Path> stream = Files.newDirectoryStream(outputDir, "fragments_*.csv")) {
|
202
523
|
for (Path fragment : stream) {
|
203
524
|
fragments.add(fragment);
|
204
525
|
}
|
@@ -214,9 +535,118 @@ public class TestingEmbulk
|
|
214
535
|
return result;
|
215
536
|
}
|
216
537
|
|
217
|
-
|
218
|
-
|
219
|
-
|
538
|
+
public InputBuilder inputBuilder()
|
539
|
+
{
|
540
|
+
return new InputBuilder();
|
541
|
+
}
|
542
|
+
|
543
|
+
public ParserBuilder parserBuilder()
|
544
|
+
{
|
545
|
+
return new ParserBuilder();
|
546
|
+
}
|
547
|
+
|
548
|
+
public OutputBuilder outputBuilder()
|
549
|
+
{
|
550
|
+
return new OutputBuilder();
|
551
|
+
}
|
552
|
+
|
553
|
+
public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath)
|
554
|
+
throws IOException
|
555
|
+
{
|
556
|
+
return parserBuilder()
|
557
|
+
.parser(parserConfig)
|
558
|
+
.inputPath(inputPath)
|
559
|
+
.outputPath(outputPath)
|
560
|
+
.run();
|
561
|
+
}
|
562
|
+
|
563
|
+
public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath, ConfigSource execConfig)
|
564
|
+
throws IOException
|
565
|
+
{
|
566
|
+
return parserBuilder()
|
567
|
+
.parser(parserConfig)
|
568
|
+
.inputPath(inputPath)
|
569
|
+
.outputPath(outputPath)
|
570
|
+
.exec(execConfig)
|
571
|
+
.run();
|
572
|
+
}
|
573
|
+
|
574
|
+
public RunResult runInput(ConfigSource inConfig, Path outputPath)
|
575
|
+
throws IOException
|
576
|
+
{
|
577
|
+
return inputBuilder()
|
578
|
+
.in(inConfig)
|
579
|
+
.outputPath(outputPath)
|
580
|
+
.run();
|
581
|
+
}
|
582
|
+
|
583
|
+
public RunResult runInput(ConfigSource inConfig, Path outputPath, ConfigSource execConfig)
|
584
|
+
throws IOException
|
585
|
+
{
|
586
|
+
return inputBuilder()
|
587
|
+
.exec(execConfig)
|
588
|
+
.in(inConfig)
|
589
|
+
.outputPath(outputPath)
|
590
|
+
.run();
|
591
|
+
}
|
592
|
+
|
593
|
+
public RunResult runOutput(ConfigSource outConfig, Path inputPath)
|
594
|
+
throws IOException
|
595
|
+
{
|
596
|
+
return outputBuilder()
|
597
|
+
.out(outConfig)
|
598
|
+
.inputPath(inputPath)
|
599
|
+
.run();
|
600
|
+
}
|
601
|
+
|
602
|
+
public RunResult runOutput(ConfigSource outConfig, Path inputPath, ConfigSource execConfig)
|
603
|
+
throws IOException
|
604
|
+
{
|
605
|
+
return outputBuilder()
|
606
|
+
.exec(execConfig)
|
607
|
+
.out(outConfig)
|
608
|
+
.inputPath(inputPath)
|
609
|
+
.run();
|
610
|
+
}
|
611
|
+
|
612
|
+
public ConfigDiff guessInput(ConfigSource inSeedConfig)
|
613
|
+
{
|
614
|
+
return inputBuilder()
|
615
|
+
.in(inSeedConfig)
|
616
|
+
.guess();
|
617
|
+
}
|
618
|
+
|
619
|
+
public ConfigDiff guessInput(ConfigSource inSeedConfig, ConfigSource execConfig)
|
620
|
+
{
|
621
|
+
return inputBuilder()
|
622
|
+
.exec(execConfig)
|
623
|
+
.in(inSeedConfig)
|
624
|
+
.guess();
|
625
|
+
}
|
626
|
+
|
627
|
+
public ConfigDiff guessParser(Path inputPath)
|
628
|
+
{
|
629
|
+
return parserBuilder()
|
630
|
+
.inputPath(inputPath)
|
631
|
+
.guess();
|
632
|
+
}
|
633
|
+
|
634
|
+
public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath)
|
635
|
+
{
|
636
|
+
return parserBuilder()
|
637
|
+
.parser(parserSeedConfig)
|
638
|
+
.inputPath(inputPath)
|
639
|
+
.guess();
|
640
|
+
}
|
641
|
+
|
642
|
+
public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath, ConfigSource execConfig)
|
643
|
+
{
|
644
|
+
return parserBuilder()
|
645
|
+
.parser(parserSeedConfig)
|
646
|
+
.inputPath(inputPath)
|
647
|
+
.exec(execConfig)
|
648
|
+
.guess();
|
649
|
+
}
|
220
650
|
|
221
651
|
// TODO add runFilter(ConfigSource filterConfig, Path inputPath, Path outputPath) where inputPath is a path to
|
222
652
|
// a CSV file whose column types can be naturally guessed using csv guess plugin.
|