embulk 0.8.15-java → 0.8.16-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/appveyor.yml +8 -0
- data/build.gradle +86 -45
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
- data/embulk-docs/src/built-in.rst +34 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.16.rst +43 -0
- data/embulk-standards/build.gradle +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/gradlew +30 -21
- data/gradlew.bat +4 -10
- data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
- data/lib/embulk/data/new/java/build.gradle.erb +5 -3
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/lib/embulk/data/new/java/gradlew +30 -21
- data/lib/embulk/data/new/java/gradlew.bat +4 -10
- data/lib/embulk/guess/csv.rb +44 -22
- data/lib/embulk/guess/newline.rb +10 -4
- data/lib/embulk/guess_plugin.rb +3 -1
- data/lib/embulk/java/time_helper.rb +2 -2
- data/lib/embulk/version.rb +1 -1
- metadata +92 -5
@@ -835,6 +835,40 @@ Columns: not recommended
|
|
835
835
|
.. hint::
|
836
836
|
``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
|
837
837
|
|
838
|
+
|
839
|
+
Remove columns filter plugin
|
840
|
+
-----------------------------
|
841
|
+
|
842
|
+
The ``remove_columns`` filter plugin removes columns from schema.
|
843
|
+
|
844
|
+
Options
|
845
|
+
~~~~~~~~
|
846
|
+
|
847
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
848
|
+
| name | type | description | required? |
|
849
|
+
+==========================+==========+============================================================+=======================+
|
850
|
+
| remove | array | An array of names of columns that it removes from schema. | ``[]`` by default |
|
851
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
852
|
+
| keep | array | An array of names of columns that it keeps in schema. | ``[]`` by default |
|
853
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
854
|
+
| accept_unmatched_columns | boolean | If true, skip columns that aren't included in schemas. | ``false`` by default |
|
855
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
856
|
+
|
857
|
+
|
858
|
+
The ``remove`` and ``keep`` options are mutually exclusive: exactly one of them must be specified.
|
859
|
+
|
860
|
+
Example
|
861
|
+
~~~~~~~~
|
862
|
+
|
863
|
+
.. code-block:: yaml
|
864
|
+
|
865
|
+
# This configuration removes "_c0" and "_c1" named columns from schema.
|
866
|
+
filters:
|
867
|
+
...
|
868
|
+
- type: remove_columns
|
869
|
+
remove: ["_c0", "_c1"]
|
870
|
+
|
871
|
+
|
838
872
|
Local executor plugin
|
839
873
|
----------------------
|
840
874
|
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
Release 0.8.16
|
2
|
+
==================================
|
3
|
+
|
4
|
+
General Changes
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added remove_columns filter plugin [#530]
|
8
|
+
|
9
|
+
* http://www.embulk.org/docs/built-in.html#remove_columns-filter-plugin
|
10
|
+
|
11
|
+
* Supported timestamp format "%Q". (@hiroyuki-sato) [#468, #531]
|
12
|
+
|
13
|
+
* Improved csv guess plugin:
|
14
|
+
|
15
|
+
* Added semicolon as a delimiter candidate suggested by the csv guess plugin. [#527]
|
16
|
+
|
17
|
+
* Enabled suggesting for a few rows [#533]
|
18
|
+
|
19
|
+
* Enabled suggesting for a single column [#540]
|
20
|
+
|
21
|
+
* Changed guessing and removed its minimum size limitation of 40 bytes. [#518]
|
22
|
+
|
23
|
+
* Refactored and introduced TestingEmbulk#{Input,Parser,Output}Builder to embulk-test. [#513, #514, #526]
|
24
|
+
|
25
|
+
* Fixed PageBuilder to avoid NullPointerException. [#535]
|
26
|
+
|
27
|
+
* Fixed ResumableInputStream to avoid NullPointerException. [#472]
|
28
|
+
|
29
|
+
* Fixed TaskValidationException to inherit ConfigException. [#520]
|
30
|
+
|
31
|
+
* Fixed build.gradle to use Task.doLast instead of Task.leftShift. [#536]
|
32
|
+
|
33
|
+
* Fixed build failure on AppVeyor by FileNotFoundException. [#537]
|
34
|
+
|
35
|
+
* Added updateJRuby task to make it easy to upgrade version of JRuby. [#538]
|
36
|
+
|
37
|
+
* Upgraded Gradle to v3.2.1. [#528]
|
38
|
+
|
39
|
+
* Release notes: https://docs.gradle.org/3.2.1/release-notes
|
40
|
+
|
41
|
+
Release Date
|
42
|
+
------------------
|
43
|
+
2017-01-27
|
@@ -0,0 +1,268 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import com.google.common.collect.ImmutableList;
|
5
|
+
import com.google.common.collect.ImmutableMap;
|
6
|
+
import com.google.inject.Inject;
|
7
|
+
import org.embulk.config.Config;
|
8
|
+
import org.embulk.config.ConfigDefault;
|
9
|
+
import org.embulk.config.ConfigException;
|
10
|
+
import org.embulk.config.ConfigSource;
|
11
|
+
import org.embulk.config.Task;
|
12
|
+
import org.embulk.config.TaskSource;
|
13
|
+
import org.embulk.spi.Column;
|
14
|
+
import org.embulk.spi.ColumnVisitor;
|
15
|
+
import org.embulk.spi.Exec;
|
16
|
+
import org.embulk.spi.FilterPlugin;
|
17
|
+
import org.embulk.spi.Page;
|
18
|
+
import org.embulk.spi.PageBuilder;
|
19
|
+
import org.embulk.spi.PageOutput;
|
20
|
+
import org.embulk.spi.PageReader;
|
21
|
+
import org.embulk.spi.Schema;
|
22
|
+
import org.embulk.spi.SchemaConfigException;
|
23
|
+
import org.slf4j.Logger;
|
24
|
+
|
25
|
+
import java.util.List;
|
26
|
+
import java.util.Map;
|
27
|
+
import java.util.HashMap;
|
28
|
+
|
29
|
+
import static java.util.Locale.ENGLISH;
|
30
|
+
import static org.embulk.spi.Exec.getBufferAllocator;
|
31
|
+
|
32
|
+
public class RemoveColumnsFilterPlugin
|
33
|
+
implements FilterPlugin
|
34
|
+
{
|
35
|
+
public interface PluginTask
|
36
|
+
extends Task
|
37
|
+
{
|
38
|
+
@Config("remove")
|
39
|
+
@ConfigDefault("null")
|
40
|
+
public Optional<List<String>> getRemove();
|
41
|
+
|
42
|
+
// TODO remove_pattern option
|
43
|
+
|
44
|
+
@Config("keep")
|
45
|
+
@ConfigDefault("null")
|
46
|
+
public Optional<List<String>> getKeep();
|
47
|
+
|
48
|
+
// TODO keep_pattern option
|
49
|
+
|
50
|
+
@Config("accept_unmatched_columns")
|
51
|
+
@ConfigDefault("false")
|
52
|
+
public boolean getAcceptUnmatchedColumns();
|
53
|
+
|
54
|
+
public void setIndexMapping(int[] mapping);
|
55
|
+
public int[] getIndexMapping();
|
56
|
+
}
|
57
|
+
|
58
|
+
private final Logger LOG;
|
59
|
+
|
60
|
+
@Inject
|
61
|
+
public RemoveColumnsFilterPlugin()
|
62
|
+
{
|
63
|
+
LOG = Exec.getLogger(getClass());
|
64
|
+
}
|
65
|
+
|
66
|
+
@Override
|
67
|
+
public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control)
|
68
|
+
{
|
69
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
70
|
+
|
71
|
+
// validate remove: and keep:
|
72
|
+
if (task.getRemove().isPresent() && task.getKeep().isPresent()) {
|
73
|
+
throw new ConfigException("remove: and keep: must not be multi-select");
|
74
|
+
}
|
75
|
+
if (!task.getRemove().isPresent() && !task.getKeep().isPresent()) {
|
76
|
+
throw new ConfigException("Must require remove: or keep:");
|
77
|
+
}
|
78
|
+
|
79
|
+
boolean acceptUnmatchedColumns = task.getAcceptUnmatchedColumns();
|
80
|
+
|
81
|
+
ImmutableList.Builder<Column> outputColumns = ImmutableList.builder();
|
82
|
+
int index = 0;
|
83
|
+
int[] indexMapping = new int[inputSchema.size()];
|
84
|
+
for (int i = 0; i < indexMapping.length; i++) {
|
85
|
+
indexMapping[i] = -1;
|
86
|
+
}
|
87
|
+
if (task.getRemove().isPresent()) { // specify remove:
|
88
|
+
List<String> removeColumns = getExistentColumns(inputSchema, task.getRemove().get(), acceptUnmatchedColumns);
|
89
|
+
for (Column column : inputSchema.getColumns()) {
|
90
|
+
if (!removeColumns.contains(column.getName())) {
|
91
|
+
outputColumns.add(new Column(index, column.getName(), column.getType()));
|
92
|
+
indexMapping[column.getIndex()] = index;
|
93
|
+
index++;
|
94
|
+
}
|
95
|
+
}
|
96
|
+
}
|
97
|
+
else { // specify keep:
|
98
|
+
List<String> keepColumns = getExistentColumns(inputSchema, task.getKeep().get(), acceptUnmatchedColumns);
|
99
|
+
for (Column column : inputSchema.getColumns()) {
|
100
|
+
if (keepColumns.contains(column.getName())) {
|
101
|
+
outputColumns.add(new Column(index, column.getName(), column.getType()));
|
102
|
+
indexMapping[column.getIndex()] = index;
|
103
|
+
index++;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
}
|
107
|
+
|
108
|
+
task.setIndexMapping(indexMapping);
|
109
|
+
control.run(task.dump(), new Schema(outputColumns.build()));
|
110
|
+
}
|
111
|
+
|
112
|
+
private List<String> getExistentColumns(Schema schema, List<String> specifiedColumns, boolean acceptUnmatch)
|
113
|
+
{
|
114
|
+
ImmutableList.Builder<String> existentColumns = ImmutableList.builder();
|
115
|
+
for (String column : specifiedColumns) {
|
116
|
+
try {
|
117
|
+
schema.lookupColumn(column);
|
118
|
+
existentColumns.add(column);
|
119
|
+
}
|
120
|
+
catch (SchemaConfigException e) {
|
121
|
+
if (!acceptUnmatch) {
|
122
|
+
throw new ConfigException(String.format(ENGLISH, "Column '%s' doesn't exist in the schema", column));
|
123
|
+
}
|
124
|
+
}
|
125
|
+
}
|
126
|
+
return existentColumns.build();
|
127
|
+
}
|
128
|
+
|
129
|
+
@Override
|
130
|
+
public PageOutput open(TaskSource taskSource, Schema inputSchema,
|
131
|
+
Schema outputSchema, PageOutput output)
|
132
|
+
{
|
133
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
134
|
+
PageReader pageReader = new PageReader(inputSchema);
|
135
|
+
PageBuilder pageBuilder = new PageBuilder(getBufferAllocator(), outputSchema, output);
|
136
|
+
return new PageConverter(pageReader, pageBuilder, task.getIndexMapping());
|
137
|
+
}
|
138
|
+
|
139
|
+
static class PageConverter
|
140
|
+
implements PageOutput
|
141
|
+
{
|
142
|
+
private final PageReader pageReader;
|
143
|
+
private final PageBuilder pageBuilder;
|
144
|
+
private final int[] indexMapping;
|
145
|
+
|
146
|
+
PageConverter(PageReader pageReader, PageBuilder pageBuilder, int[] indexMapping)
|
147
|
+
{
|
148
|
+
this.pageReader = pageReader;
|
149
|
+
this.pageBuilder = pageBuilder;
|
150
|
+
this.indexMapping = indexMapping;
|
151
|
+
}
|
152
|
+
|
153
|
+
@Override
|
154
|
+
public void add(Page page)
|
155
|
+
{
|
156
|
+
pageReader.setPage(page);
|
157
|
+
while (pageReader.nextRecord()) {
|
158
|
+
pageReader.getSchema().visitColumns(new ColumnVisitor() {
|
159
|
+
@Override
|
160
|
+
public void booleanColumn(Column inputColumn)
|
161
|
+
{
|
162
|
+
int index = indexMapping[inputColumn.getIndex()];
|
163
|
+
if (index >= 0) {
|
164
|
+
if (pageReader.isNull(inputColumn)) {
|
165
|
+
pageBuilder.setNull(index);
|
166
|
+
}
|
167
|
+
else {
|
168
|
+
pageBuilder.setBoolean(index, pageReader.getBoolean(inputColumn));
|
169
|
+
}
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
@Override
|
174
|
+
public void longColumn(Column inputColumn)
|
175
|
+
{
|
176
|
+
int index = indexMapping[inputColumn.getIndex()];
|
177
|
+
if (index >= 0) {
|
178
|
+
if (pageReader.isNull(inputColumn)) {
|
179
|
+
pageBuilder.setNull(index);
|
180
|
+
}
|
181
|
+
else {
|
182
|
+
pageBuilder.setLong(index, pageReader.getLong(inputColumn));
|
183
|
+
}
|
184
|
+
}
|
185
|
+
}
|
186
|
+
|
187
|
+
@Override
|
188
|
+
public void doubleColumn(Column inputColumn)
|
189
|
+
{
|
190
|
+
int index = indexMapping[inputColumn.getIndex()];
|
191
|
+
if (index >= 0) {
|
192
|
+
if (pageReader.isNull(inputColumn)) {
|
193
|
+
pageBuilder.setNull(index);
|
194
|
+
}
|
195
|
+
else {
|
196
|
+
pageBuilder.setDouble(index, pageReader.getDouble(inputColumn));
|
197
|
+
}
|
198
|
+
}
|
199
|
+
}
|
200
|
+
|
201
|
+
@Override
|
202
|
+
public void stringColumn(Column inputColumn)
|
203
|
+
{
|
204
|
+
int index = indexMapping[inputColumn.getIndex()];
|
205
|
+
if (index >= 0) {
|
206
|
+
if (pageReader.isNull(inputColumn)) {
|
207
|
+
pageBuilder.setNull(index);
|
208
|
+
}
|
209
|
+
else {
|
210
|
+
pageBuilder.setString(index, pageReader.getString(inputColumn));
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
@Override
|
216
|
+
public void timestampColumn(Column inputColumn)
|
217
|
+
{
|
218
|
+
int index = indexMapping[inputColumn.getIndex()];
|
219
|
+
if (index >= 0) {
|
220
|
+
if (pageReader.isNull(inputColumn)) {
|
221
|
+
pageBuilder.setNull(index);
|
222
|
+
}
|
223
|
+
else {
|
224
|
+
pageBuilder.setTimestamp(index, pageReader.getTimestamp(inputColumn));
|
225
|
+
}
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
@Override
|
230
|
+
public void jsonColumn(Column inputColumn)
|
231
|
+
{
|
232
|
+
int index = indexMapping[inputColumn.getIndex()];
|
233
|
+
if (index >= 0) {
|
234
|
+
if (pageReader.isNull(inputColumn)) {
|
235
|
+
pageBuilder.setNull(index);
|
236
|
+
}
|
237
|
+
else {
|
238
|
+
pageBuilder.setJson(index, pageReader.getJson(inputColumn));
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
});
|
243
|
+
pageBuilder.addRecord();
|
244
|
+
}
|
245
|
+
}
|
246
|
+
|
247
|
+
private Map<String, Integer> newColumnIndex(Schema schema)
|
248
|
+
{
|
249
|
+
ImmutableMap.Builder<String, Integer> builder = ImmutableMap.builder();
|
250
|
+
for (Column column : schema.getColumns()) {
|
251
|
+
builder.put(column.getName(), column.getIndex());
|
252
|
+
}
|
253
|
+
return builder.build();
|
254
|
+
}
|
255
|
+
|
256
|
+
@Override
|
257
|
+
public void finish()
|
258
|
+
{
|
259
|
+
pageBuilder.finish();
|
260
|
+
}
|
261
|
+
|
262
|
+
@Override
|
263
|
+
public void close()
|
264
|
+
{
|
265
|
+
pageBuilder.close();
|
266
|
+
}
|
267
|
+
}
|
268
|
+
}
|
@@ -30,6 +30,19 @@ import java.util.regex.PatternSyntaxException;
|
|
30
30
|
import javax.validation.constraints.Min;
|
31
31
|
import javax.validation.constraints.Size;
|
32
32
|
|
33
|
+
/**
|
34
|
+
* |RenameFilterPlugin| renames column names.
|
35
|
+
*
|
36
|
+
* NOTE: This filter should always behave in the same way for the same configuration.
|
37
|
+
* Changes in its behavior confuse users who are working with the same configuration.
|
38
|
+
*
|
39
|
+
* Even when a buggy behavior is found, fix it by:
|
40
|
+
* 1) Adding a new option, and
|
41
|
+
* 2) Implementing a new behavior in the new option.
|
42
|
+
*
|
43
|
+
* Keep the buggy behavior with the old configuration except for fatal failures so
|
44
|
+
* that users are not confused.
|
45
|
+
*/
|
33
46
|
public class RenameFilterPlugin
|
34
47
|
implements FilterPlugin
|
35
48
|
{
|
@@ -47,6 +47,7 @@ public class StandardPluginModule
|
|
47
47
|
|
48
48
|
// filter plugins
|
49
49
|
registerPluginTo(binder, FilterPlugin.class, "rename", RenameFilterPlugin.class);
|
50
|
+
registerPluginTo(binder, FilterPlugin.class, "remove_columns", RemoveColumnsFilterPlugin.class);
|
50
51
|
|
51
52
|
// default guess plugins
|
52
53
|
registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
|
@@ -0,0 +1,121 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import org.embulk.config.ConfigException;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.exec.PartialExecutionException;
|
7
|
+
import org.embulk.test.TestingEmbulk;
|
8
|
+
import org.junit.Rule;
|
9
|
+
import org.junit.Test;
|
10
|
+
|
11
|
+
import java.io.IOException;
|
12
|
+
import java.nio.file.Path;
|
13
|
+
|
14
|
+
import static org.embulk.test.EmbulkTests.copyResource;
|
15
|
+
import static org.embulk.test.EmbulkTests.readResource;
|
16
|
+
import static org.embulk.test.EmbulkTests.readSortedFile;
|
17
|
+
import static org.hamcrest.Matchers.is;
|
18
|
+
import static org.junit.Assert.assertThat;
|
19
|
+
import static org.junit.Assert.assertTrue;
|
20
|
+
import static org.junit.Assert.fail;
|
21
|
+
|
22
|
+
public class TestRemoveColumnsFilterPlugin
|
23
|
+
{
|
24
|
+
private static final String RESOURCE_NAME_PREFIX = "org/embulk/standards/remove_columns/test/";
|
25
|
+
|
26
|
+
@Rule
|
27
|
+
public TestingEmbulk embulk = TestingEmbulk.builder().build();
|
28
|
+
|
29
|
+
@Test
|
30
|
+
public void useKeepOption()
|
31
|
+
throws Exception
|
32
|
+
{
|
33
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_filter.yml",
|
34
|
+
"test_keep.csv", "test_keep_expected.csv");
|
35
|
+
}
|
36
|
+
|
37
|
+
@Test
|
38
|
+
public void useKeepWithAcceptUnmatched()
|
39
|
+
throws Exception
|
40
|
+
{
|
41
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_with_unmatched_filter.yml",
|
42
|
+
"test_keep.csv", "test_keep_expected.csv");
|
43
|
+
}
|
44
|
+
|
45
|
+
@Test
|
46
|
+
public void useKeepWithoutAcceptUnmatched()
|
47
|
+
throws Exception
|
48
|
+
{
|
49
|
+
try {
|
50
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_without_unmatched_filter.yml",
|
51
|
+
"test_keep.csv", "test_keep_expected.csv");
|
52
|
+
fail();
|
53
|
+
}
|
54
|
+
catch (PartialExecutionException ex) {
|
55
|
+
assertTrue(ex.getCause() instanceof ConfigException);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
@Test
|
60
|
+
public void useKeepWithDuplicatedColumnNames()
|
61
|
+
throws Exception
|
62
|
+
{
|
63
|
+
assertRecordsByResource(embulk, "test_keep_with_duplicated_column_names_in.yml", "test_keep_with_duplicated_column_names.yml",
|
64
|
+
"test_keep_with_duplicated_column_names.csv", "test_keep_with_duplicated_column_names_expected.csv");
|
65
|
+
}
|
66
|
+
|
67
|
+
@Test
|
68
|
+
public void useRemove()
|
69
|
+
throws Exception
|
70
|
+
{
|
71
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_filter.yml",
|
72
|
+
"test_remove.csv", "test_remove_expected.csv");
|
73
|
+
}
|
74
|
+
|
75
|
+
@Test
|
76
|
+
public void useRemoveWithAcceptUnmatched()
|
77
|
+
throws Exception
|
78
|
+
{
|
79
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_with_unmatched_filter.yml",
|
80
|
+
"test_remove.csv", "test_remove_expected.csv");
|
81
|
+
}
|
82
|
+
|
83
|
+
@Test
|
84
|
+
public void useRemoveWithoutAcceptUnmatched()
|
85
|
+
throws Exception
|
86
|
+
{
|
87
|
+
try {
|
88
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_without_unmatched_filter.yml",
|
89
|
+
"test_remove.csv", "test_remove_expected.csv");
|
90
|
+
fail();
|
91
|
+
}
|
92
|
+
catch (PartialExecutionException ex) {
|
93
|
+
assertTrue(ex.getCause() instanceof ConfigException);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
static void assertRecordsByResource(TestingEmbulk embulk,
|
98
|
+
String inConfigYamlResourceName, String filterConfigYamlResourceName,
|
99
|
+
String sourceCsvResourceName, String resultCsvResourceName)
|
100
|
+
throws IOException
|
101
|
+
{
|
102
|
+
Path inputPath = embulk.createTempFile("csv");
|
103
|
+
Path outputPath = embulk.createTempFile("csv");
|
104
|
+
|
105
|
+
// in: config
|
106
|
+
copyResource(RESOURCE_NAME_PREFIX + sourceCsvResourceName, inputPath);
|
107
|
+
ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
|
108
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
109
|
+
|
110
|
+
// remove_columns filter config
|
111
|
+
ConfigSource filterConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + filterConfigYamlResourceName);
|
112
|
+
|
113
|
+
TestingEmbulk.RunResult result = embulk.inputBuilder()
|
114
|
+
.in(inConfig)
|
115
|
+
.filters(ImmutableList.of(filterConfig))
|
116
|
+
.outputPath(outputPath)
|
117
|
+
.run();
|
118
|
+
|
119
|
+
assertThat(readSortedFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
|
120
|
+
}
|
121
|
+
}
|