embulk 0.8.15-java → 0.8.16-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/appveyor.yml +8 -0
- data/build.gradle +86 -45
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
- data/embulk-docs/src/built-in.rst +34 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.16.rst +43 -0
- data/embulk-standards/build.gradle +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/gradlew +30 -21
- data/gradlew.bat +4 -10
- data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
- data/lib/embulk/data/new/java/build.gradle.erb +5 -3
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/lib/embulk/data/new/java/gradlew +30 -21
- data/lib/embulk/data/new/java/gradlew.bat +4 -10
- data/lib/embulk/guess/csv.rb +44 -22
- data/lib/embulk/guess/newline.rb +10 -4
- data/lib/embulk/guess_plugin.rb +3 -1
- data/lib/embulk/java/time_helper.rb +2 -2
- data/lib/embulk/version.rb +1 -1
- metadata +92 -5
data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Input ("in:") section for the remove_columns filter test: a file input
# parsed as CSV with one header line and five typed columns.
type: file
parser:
  charset: UTF-8
  newline: CRLF
  type: csv
  delimiter: ','
  quote: '"'
  escape: '"'
  trim_if_not_quoted: false
  skip_header_lines: 1
  allow_extra_columns: false
  allow_optional_columns: false
  columns:
  - {name: id, type: long}
  - {name: account, type: long}
  - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
  - {name: purchase, type: string}
  - {name: comment, type: string}
|
@@ -1,11 +1,14 @@
|
|
1
1
|
package org.embulk.test;
|
2
2
|
|
3
|
+
import com.google.common.base.Throwables;
|
3
4
|
import com.google.common.collect.ImmutableList;
|
4
|
-
import com.google.common.
|
5
|
+
import com.google.common.collect.Lists;
|
5
6
|
import com.google.common.io.ByteStreams;
|
6
7
|
import com.google.inject.Binder;
|
7
8
|
import com.google.inject.Injector;
|
8
9
|
import com.google.inject.Module;
|
10
|
+
|
11
|
+
import java.io.BufferedReader;
|
9
12
|
import java.io.IOException;
|
10
13
|
import java.io.InputStream;
|
11
14
|
import java.io.OutputStream;
|
@@ -15,22 +18,33 @@ import java.nio.file.Path;
|
|
15
18
|
import java.util.ArrayList;
|
16
19
|
import java.util.Collections;
|
17
20
|
import java.util.List;
|
21
|
+
|
18
22
|
import org.embulk.EmbulkEmbed;
|
19
|
-
import org.embulk.config.Config;
|
20
23
|
import org.embulk.config.ConfigDiff;
|
21
24
|
import org.embulk.config.ConfigLoader;
|
22
25
|
import org.embulk.config.ConfigSource;
|
26
|
+
import org.embulk.config.ModelManager;
|
23
27
|
import org.embulk.config.TaskReport;
|
24
|
-
import org.embulk.
|
28
|
+
import org.embulk.spi.ColumnConfig;
|
25
29
|
import org.embulk.spi.Schema;
|
30
|
+
import org.embulk.spi.SchemaConfig;
|
26
31
|
import org.embulk.spi.TempFileException;
|
27
32
|
import org.embulk.spi.TempFileSpace;
|
33
|
+
import org.embulk.spi.type.Type;
|
28
34
|
import org.junit.rules.TestRule;
|
29
35
|
import org.junit.rules.TestWatcher;
|
30
36
|
import org.junit.runner.Description;
|
31
37
|
import org.junit.runners.model.Statement;
|
32
38
|
import static com.google.common.base.Preconditions.checkArgument;
|
39
|
+
import static java.nio.charset.StandardCharsets.UTF_8;
|
40
|
+
import static java.nio.file.Files.newBufferedReader;
|
41
|
+
import static java.util.Locale.ENGLISH;
|
42
|
+
import static com.google.common.base.Preconditions.checkNotNull;
|
43
|
+
import static com.google.common.base.Preconditions.checkState;
|
44
|
+
import static java.nio.charset.StandardCharsets.UTF_8;
|
45
|
+
import static java.nio.file.Files.newBufferedReader;
|
33
46
|
import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
|
47
|
+
import static org.embulk.test.EmbulkTests.copyResource;
|
34
48
|
|
35
49
|
public class TestingEmbulk
|
36
50
|
implements TestRule
|
@@ -152,6 +166,10 @@ public class TestingEmbulk
|
|
152
166
|
.fromYamlString(EmbulkTests.readResource(name));
|
153
167
|
}
|
154
168
|
|
169
|
+
private static final List<String> SUPPORTED_TYPES = ImmutableList.of(
|
170
|
+
"boolean", "long", "double", "string", "timestamp", "json"
|
171
|
+
);
|
172
|
+
|
155
173
|
public static interface RunResult
|
156
174
|
{
|
157
175
|
ConfigDiff getConfigDiff();
|
@@ -167,38 +185,341 @@ public class TestingEmbulk
|
|
167
185
|
List<TaskReport> getOutputTaskReports();
|
168
186
|
}
|
169
187
|
|
170
|
-
public
|
171
|
-
throws IOException
|
188
|
+
public class InputBuilder
|
172
189
|
{
|
173
|
-
|
174
|
-
|
175
|
-
|
190
|
+
private ConfigSource inConfig = null;
|
191
|
+
private List<ConfigSource> filtersConfig = ImmutableList.of();
|
192
|
+
private ConfigSource execConfig = newConfig();
|
193
|
+
private Path outputPath = null;
|
176
194
|
|
177
|
-
|
195
|
+
private InputBuilder()
|
196
|
+
{ }
|
178
197
|
|
179
|
-
|
180
|
-
|
198
|
+
public InputBuilder in(ConfigSource inConfig)
|
199
|
+
{
|
200
|
+
checkNotNull(inConfig, "inConfig");
|
201
|
+
this.inConfig = inConfig.deepCopy();
|
202
|
+
return this;
|
203
|
+
}
|
181
204
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
.
|
186
|
-
|
205
|
+
public InputBuilder filters(List<ConfigSource> filtersConfig)
|
206
|
+
{
|
207
|
+
checkNotNull(filtersConfig, "filtersConfig");
|
208
|
+
ImmutableList.Builder<ConfigSource> builder = ImmutableList.builder();
|
209
|
+
for (ConfigSource filter : filtersConfig) {
|
210
|
+
builder.add(filter.deepCopy());
|
211
|
+
}
|
212
|
+
this.filtersConfig = builder.build();
|
213
|
+
return this;
|
214
|
+
}
|
215
|
+
|
216
|
+
public InputBuilder exec(ConfigSource execConfig)
|
217
|
+
{
|
218
|
+
checkNotNull(execConfig, "execConfig");
|
219
|
+
this.execConfig = execConfig.deepCopy();
|
220
|
+
return this;
|
221
|
+
}
|
222
|
+
|
223
|
+
public InputBuilder outputPath(Path outputPath)
|
224
|
+
{
|
225
|
+
checkNotNull(outputPath, "outputPath");
|
226
|
+
this.outputPath = outputPath;
|
227
|
+
return this;
|
228
|
+
}
|
229
|
+
|
230
|
+
public ConfigDiff guess()
|
231
|
+
{
|
232
|
+
checkState(inConfig != null, "in config must be set");
|
233
|
+
|
234
|
+
// config = {exec: execConfig, in: inConfig}
|
235
|
+
ConfigSource config = newConfig()
|
236
|
+
.set("exec", execConfig)
|
237
|
+
.set("in", inConfig)
|
238
|
+
.set("filters", filtersConfig);
|
239
|
+
|
240
|
+
// embed.guess returns GuessExecutor.ConfigDiff
|
241
|
+
return embed.guess(config).getNested("in");
|
242
|
+
}
|
243
|
+
|
244
|
+
public RunResult run()
|
245
|
+
throws IOException
|
246
|
+
{
|
247
|
+
checkState(inConfig != null, "in config must be set");
|
248
|
+
checkState(outputPath != null, "outputPath must be set");
|
249
|
+
|
250
|
+
String fileName = outputPath.getFileName().toString();
|
251
|
+
checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
|
252
|
+
Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
|
253
|
+
|
254
|
+
Files.createDirectories(dir);
|
255
|
+
|
256
|
+
// exec: config
|
257
|
+
execConfig.set("min_output_tasks", 1);
|
258
|
+
|
259
|
+
// out: config
|
260
|
+
ConfigSource outConfig = newConfig()
|
261
|
+
.set("type", "file")
|
262
|
+
.set("path_prefix", dir.resolve("fragments_").toString())
|
263
|
+
.set("file_ext", "csv")
|
264
|
+
.set("formatter", newConfig()
|
265
|
+
.set("type", "csv")
|
266
|
+
.set("header_line", false)
|
267
|
+
.set("newline", "LF"));
|
268
|
+
|
269
|
+
// combine exec:, out: and in:
|
270
|
+
ConfigSource config = newConfig()
|
271
|
+
.set("exec", execConfig)
|
272
|
+
.set("in", inConfig)
|
273
|
+
.set("filters", filtersConfig)
|
274
|
+
.set("out", outConfig);
|
275
|
+
|
276
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
277
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
278
|
+
RunResult result = (RunResult) embed.run(config);
|
279
|
+
|
280
|
+
return buildRunResultWithOutput(result, dir, outputPath);
|
281
|
+
}
|
282
|
+
}
|
283
|
+
|
284
|
+
public class ParserBuilder
|
285
|
+
{
|
286
|
+
private ConfigSource parserConfig = newConfig();
|
287
|
+
private ConfigSource execConfig = newConfig();
|
288
|
+
private Path inputPath = null;
|
289
|
+
private Path outputPath = null;
|
290
|
+
|
291
|
+
private ParserBuilder()
|
292
|
+
{ }
|
293
|
+
|
294
|
+
public ParserBuilder parser(ConfigSource parserConfig)
|
295
|
+
{
|
296
|
+
checkNotNull(parserConfig, "parserConfig");
|
297
|
+
this.parserConfig = parserConfig.deepCopy();
|
298
|
+
return this;
|
299
|
+
}
|
300
|
+
|
301
|
+
public ParserBuilder exec(ConfigSource execConfig)
|
302
|
+
{
|
303
|
+
checkNotNull(execConfig, "execConfig");
|
304
|
+
this.execConfig = execConfig.deepCopy();
|
305
|
+
return this;
|
306
|
+
}
|
307
|
+
|
308
|
+
public ParserBuilder inputPath(Path inputPath)
|
309
|
+
{
|
310
|
+
checkNotNull(inputPath, "inputPath");
|
311
|
+
this.inputPath = inputPath;
|
312
|
+
return this;
|
313
|
+
}
|
314
|
+
|
315
|
+
public ParserBuilder inputResource(String resourceName)
|
316
|
+
throws IOException
|
317
|
+
{
|
318
|
+
checkNotNull(resourceName, "resourceName");
|
319
|
+
Path path = createTempFile("csv");
|
320
|
+
copyResource(resourceName, path);
|
321
|
+
return inputPath(path);
|
322
|
+
}
|
323
|
+
|
324
|
+
public ParserBuilder outputPath(Path outputPath)
|
325
|
+
{
|
326
|
+
checkNotNull(outputPath, "outputPath");
|
327
|
+
this.outputPath = outputPath;
|
328
|
+
return this;
|
329
|
+
}
|
330
|
+
|
331
|
+
public ConfigDiff guess()
|
332
|
+
{
|
333
|
+
checkState(inputPath != null, "inputPath must be set");
|
334
|
+
|
335
|
+
// in: config
|
336
|
+
ConfigSource inConfig = newConfig()
|
337
|
+
.set("type", "file")
|
338
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
339
|
+
inConfig.set("parser", parserConfig);
|
340
|
+
|
341
|
+
// config = {exec: execConfig, in: inConfig}
|
342
|
+
ConfigSource config = newConfig()
|
343
|
+
.set("exec", execConfig)
|
344
|
+
.set("in", inConfig);
|
345
|
+
|
346
|
+
// embed.guess calls GuessExecutor and returns ConfigDiff
|
347
|
+
return embed.guess(config).getNested("in").getNested("parser");
|
348
|
+
}
|
349
|
+
|
350
|
+
public RunResult run()
|
351
|
+
throws IOException
|
352
|
+
{
|
353
|
+
checkState(parserConfig != null, "parser config must be set");
|
354
|
+
checkState(inputPath != null, "inputPath must be set");
|
355
|
+
checkState(outputPath != null, "outputPath must be set");
|
356
|
+
|
357
|
+
String fileName = outputPath.getFileName().toString();
|
358
|
+
checkArgument(fileName.endsWith(".csv"), "outputPath must end with .csv");
|
359
|
+
Path dir = outputPath.getParent().resolve(fileName.substring(0, fileName.length() - 4));
|
360
|
+
|
361
|
+
Files.createDirectories(dir);
|
362
|
+
|
363
|
+
// in: config
|
364
|
+
ConfigSource inConfig = newConfig()
|
365
|
+
.set("type", "file")
|
366
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
367
|
+
inConfig.set("parser", parserConfig);
|
368
|
+
|
369
|
+
// exec: config
|
370
|
+
execConfig.set("min_output_tasks", 1);
|
371
|
+
|
372
|
+
// out: config
|
373
|
+
ConfigSource outConfig = newConfig()
|
374
|
+
.set("type", "file")
|
375
|
+
.set("path_prefix", dir.resolve("fragments_").toString())
|
376
|
+
.set("file_ext", "csv")
|
377
|
+
.set("formatter", newConfig()
|
378
|
+
.set("type", "csv")
|
379
|
+
.set("header_line", false)
|
380
|
+
.set("newline", "LF"));
|
381
|
+
|
382
|
+
// config = {exec: execConfig, in: inConfig, out: outConfig}
|
383
|
+
ConfigSource config = newConfig()
|
384
|
+
.set("exec", execConfig)
|
385
|
+
.set("in", inConfig)
|
386
|
+
.set("out", outConfig);
|
387
|
+
|
388
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
389
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
390
|
+
RunResult result = (RunResult) embed.run(config);
|
391
|
+
|
392
|
+
return buildRunResultWithOutput(result, dir, outputPath);
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
public class OutputBuilder
|
397
|
+
{
|
398
|
+
private ConfigSource outConfig = null;
|
399
|
+
private ConfigSource execConfig = newConfig();
|
400
|
+
private Path inputPath;
|
401
|
+
private SchemaConfig inputSchema;
|
402
|
+
|
403
|
+
public OutputBuilder()
|
404
|
+
{ }
|
405
|
+
|
406
|
+
public OutputBuilder out(ConfigSource outConfig)
|
407
|
+
{
|
408
|
+
checkNotNull(outConfig, "outConfig");
|
409
|
+
this.outConfig = outConfig;
|
410
|
+
return this;
|
411
|
+
}
|
412
|
+
|
413
|
+
public OutputBuilder exec(ConfigSource execConfig)
|
414
|
+
{
|
415
|
+
checkNotNull(execConfig, "execConfig");
|
416
|
+
this.execConfig = execConfig;
|
417
|
+
return this;
|
418
|
+
}
|
419
|
+
|
420
|
+
public OutputBuilder inputPath(Path inputPath)
|
421
|
+
{
|
422
|
+
checkNotNull(inputPath, "inputPath");
|
423
|
+
this.inputPath = inputPath;
|
424
|
+
return this;
|
425
|
+
}
|
426
|
+
|
427
|
+
public OutputBuilder inputResource(String resourceName)
|
428
|
+
throws IOException
|
429
|
+
{
|
430
|
+
checkNotNull(resourceName, "resourceName");
|
431
|
+
Path path = createTempFile("csv");
|
432
|
+
copyResource(resourceName, path);
|
433
|
+
return inputPath(path);
|
434
|
+
}
|
435
|
+
|
436
|
+
public OutputBuilder inputSchema(SchemaConfig inputSchema)
|
437
|
+
{
|
438
|
+
checkNotNull(inputSchema, "inputSchema");
|
439
|
+
this.inputSchema = inputSchema;
|
440
|
+
return this;
|
441
|
+
}
|
442
|
+
|
443
|
+
public RunResult run()
|
444
|
+
throws IOException
|
445
|
+
{
|
446
|
+
checkState(outConfig != null, "out config must be set");
|
447
|
+
checkState(inputPath != null, "inputPath must be set");
|
448
|
+
|
449
|
+
String fileName = inputPath.toAbsolutePath().toString();
|
450
|
+
checkArgument(fileName.endsWith(".csv"), "inputPath must end with .csv");
|
451
|
+
|
452
|
+
// exec: config
|
453
|
+
execConfig.set("min_output_tasks", 1);
|
454
|
+
|
455
|
+
// in: config
|
456
|
+
ConfigSource inConfig = newConfig()
|
457
|
+
.set("type", "file")
|
458
|
+
.set("path_prefix", fileName)
|
459
|
+
.set("parser", newParserConfig());
|
460
|
+
|
461
|
+
// config = {exec: execConfig, in: inConfig, out: outConfig}
|
462
|
+
ConfigSource config = newConfig()
|
463
|
+
.set("exec", execConfig)
|
464
|
+
.set("in", inConfig)
|
465
|
+
.set("out", outConfig);
|
466
|
+
|
467
|
+
// embed.run returns TestingBulkLoader.TestingExecutionResult because
|
468
|
+
// LoaderState.buildExecuteResultWithWarningException is overridden.
|
469
|
+
return (RunResult) embed.run(config);
|
470
|
+
}
|
471
|
+
|
472
|
+
private ConfigSource newParserConfig()
|
473
|
+
{
|
474
|
+
return newConfig()
|
475
|
+
.set("charset", "UTF-8")
|
476
|
+
.set("newline", "LF")
|
187
477
|
.set("type", "csv")
|
188
|
-
.set("
|
189
|
-
.set("
|
478
|
+
.set("delimiter", ",")
|
479
|
+
.set("quote", "\"")
|
480
|
+
.set("escape", "\"")
|
481
|
+
.set("columns", newSchemaConfig());
|
482
|
+
}
|
190
483
|
|
191
|
-
|
192
|
-
|
193
|
-
.
|
194
|
-
|
484
|
+
private SchemaConfig newSchemaConfig()
|
485
|
+
{
|
486
|
+
ImmutableList.Builder<ColumnConfig> schema = ImmutableList.builder();
|
487
|
+
try (BufferedReader reader = newBufferedReader(inputPath, UTF_8)) {
|
488
|
+
for (String column : reader.readLine().split(",")) {
|
489
|
+
ColumnConfig columnConfig = newColumnConfig(column);
|
490
|
+
if (columnConfig != null) {
|
491
|
+
schema.add(columnConfig);
|
492
|
+
}
|
493
|
+
}
|
494
|
+
return new SchemaConfig(schema.build());
|
495
|
+
}
|
496
|
+
catch (IOException e) {
|
497
|
+
throw Throwables.propagate(e);
|
498
|
+
}
|
499
|
+
}
|
195
500
|
|
196
|
-
|
197
|
-
|
501
|
+
private ColumnConfig newColumnConfig(String column)
|
502
|
+
{
|
503
|
+
String[] tuple = column.split(":", 2);
|
504
|
+
checkArgument(tuple.length == 2, "tuple must be a pair of column name and type");
|
505
|
+
String type = tuple[1];
|
506
|
+
if (!SUPPORTED_TYPES.contains(type)) {
|
507
|
+
throw new IllegalArgumentException(String.format(ENGLISH,
|
508
|
+
"Unknown column type %s. Supported types are boolean, long, double, string, timestamp and json: %s",
|
509
|
+
tuple[1], column));
|
510
|
+
}
|
511
|
+
return new ColumnConfig(newConfig()
|
512
|
+
.set("name", tuple[0])
|
513
|
+
.set("type", type));
|
514
|
+
}
|
515
|
+
}
|
198
516
|
|
517
|
+
private RunResult buildRunResultWithOutput(RunResult result, Path outputDir, Path outputPath)
|
518
|
+
throws IOException
|
519
|
+
{
|
199
520
|
try (OutputStream out = Files.newOutputStream(outputPath)) {
|
200
521
|
List<Path> fragments = new ArrayList<Path>();
|
201
|
-
try (DirectoryStream<Path> stream = Files.newDirectoryStream(
|
522
|
+
try (DirectoryStream<Path> stream = Files.newDirectoryStream(outputDir, "fragments_*.csv")) {
|
202
523
|
for (Path fragment : stream) {
|
203
524
|
fragments.add(fragment);
|
204
525
|
}
|
@@ -214,9 +535,118 @@ public class TestingEmbulk
|
|
214
535
|
return result;
|
215
536
|
}
|
216
537
|
|
217
|
-
|
218
|
-
|
219
|
-
|
538
|
+
public InputBuilder inputBuilder()
|
539
|
+
{
|
540
|
+
return new InputBuilder();
|
541
|
+
}
|
542
|
+
|
543
|
+
public ParserBuilder parserBuilder()
|
544
|
+
{
|
545
|
+
return new ParserBuilder();
|
546
|
+
}
|
547
|
+
|
548
|
+
public OutputBuilder outputBuilder()
|
549
|
+
{
|
550
|
+
return new OutputBuilder();
|
551
|
+
}
|
552
|
+
|
553
|
+
public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath)
|
554
|
+
throws IOException
|
555
|
+
{
|
556
|
+
return parserBuilder()
|
557
|
+
.parser(parserConfig)
|
558
|
+
.inputPath(inputPath)
|
559
|
+
.outputPath(outputPath)
|
560
|
+
.run();
|
561
|
+
}
|
562
|
+
|
563
|
+
public RunResult runParser(ConfigSource parserConfig, Path inputPath, Path outputPath, ConfigSource execConfig)
|
564
|
+
throws IOException
|
565
|
+
{
|
566
|
+
return parserBuilder()
|
567
|
+
.parser(parserConfig)
|
568
|
+
.inputPath(inputPath)
|
569
|
+
.outputPath(outputPath)
|
570
|
+
.exec(execConfig)
|
571
|
+
.run();
|
572
|
+
}
|
573
|
+
|
574
|
+
public RunResult runInput(ConfigSource inConfig, Path outputPath)
|
575
|
+
throws IOException
|
576
|
+
{
|
577
|
+
return inputBuilder()
|
578
|
+
.in(inConfig)
|
579
|
+
.outputPath(outputPath)
|
580
|
+
.run();
|
581
|
+
}
|
582
|
+
|
583
|
+
public RunResult runInput(ConfigSource inConfig, Path outputPath, ConfigSource execConfig)
|
584
|
+
throws IOException
|
585
|
+
{
|
586
|
+
return inputBuilder()
|
587
|
+
.exec(execConfig)
|
588
|
+
.in(inConfig)
|
589
|
+
.outputPath(outputPath)
|
590
|
+
.run();
|
591
|
+
}
|
592
|
+
|
593
|
+
public RunResult runOutput(ConfigSource outConfig, Path inputPath)
|
594
|
+
throws IOException
|
595
|
+
{
|
596
|
+
return outputBuilder()
|
597
|
+
.out(outConfig)
|
598
|
+
.inputPath(inputPath)
|
599
|
+
.run();
|
600
|
+
}
|
601
|
+
|
602
|
+
public RunResult runOutput(ConfigSource outConfig, Path inputPath, ConfigSource execConfig)
|
603
|
+
throws IOException
|
604
|
+
{
|
605
|
+
return outputBuilder()
|
606
|
+
.exec(execConfig)
|
607
|
+
.out(outConfig)
|
608
|
+
.inputPath(inputPath)
|
609
|
+
.run();
|
610
|
+
}
|
611
|
+
|
612
|
+
public ConfigDiff guessInput(ConfigSource inSeedConfig)
|
613
|
+
{
|
614
|
+
return inputBuilder()
|
615
|
+
.in(inSeedConfig)
|
616
|
+
.guess();
|
617
|
+
}
|
618
|
+
|
619
|
+
public ConfigDiff guessInput(ConfigSource inSeedConfig, ConfigSource execConfig)
|
620
|
+
{
|
621
|
+
return inputBuilder()
|
622
|
+
.exec(execConfig)
|
623
|
+
.in(inSeedConfig)
|
624
|
+
.guess();
|
625
|
+
}
|
626
|
+
|
627
|
+
public ConfigDiff guessParser(Path inputPath)
|
628
|
+
{
|
629
|
+
return parserBuilder()
|
630
|
+
.inputPath(inputPath)
|
631
|
+
.guess();
|
632
|
+
}
|
633
|
+
|
634
|
+
public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath)
|
635
|
+
{
|
636
|
+
return parserBuilder()
|
637
|
+
.parser(parserSeedConfig)
|
638
|
+
.inputPath(inputPath)
|
639
|
+
.guess();
|
640
|
+
}
|
641
|
+
|
642
|
+
public ConfigDiff guessParser(ConfigSource parserSeedConfig, Path inputPath, ConfigSource execConfig)
|
643
|
+
{
|
644
|
+
return parserBuilder()
|
645
|
+
.parser(parserSeedConfig)
|
646
|
+
.inputPath(inputPath)
|
647
|
+
.exec(execConfig)
|
648
|
+
.guess();
|
649
|
+
}
|
220
650
|
|
221
651
|
// TODO add runFilter(ConfigSource filterConfig, Path inputPath, Path outputPath) where inputPath is a path to
|
222
652
|
// a CSV file whose column types can be naturally guessed using csv guess plugin.
|