embulk 0.8.15 → 0.8.16
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/appveyor.yml +8 -0
- data/build.gradle +86 -45
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +43 -4
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +38 -1
- data/embulk-docs/src/built-in.rst +34 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.16.rst +43 -0
- data/embulk-standards/build.gradle +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RemoveColumnsFilterPlugin.java +268 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +13 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRemoveColumnsFilterPlugin.java +121 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +8 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvAllStringsGuessPlugin.java +38 -0
- data/embulk-standards/src/test/java/org/embulk/standards/guess/TestCsvGuessPlugin.java +229 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_int_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_and_header_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_rows_with_trim_needed_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row.csv +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_1_string_single_column_row_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_int_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header.csv +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_and_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows.csv +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_2_string_single_column_rows_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_backslash_escape_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_int_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_semicolon_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_single_quote_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_guessed.yml +12 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_string_single_column_with_header_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_guessed.yml +16 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv/test/test_tab_delimiter_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_guessed.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/guess/csv_all_strings/test/test_simple_seed.yml +1 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_duplicated_column_names_in.yml +17 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_keep_without_unmatched_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove.csv +5 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_expected.csv +4 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_filter.yml +2 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_in.yml +18 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_with_unmatched_filter.yml +3 -0
- data/embulk-standards/src/test/resources/org/embulk/standards/remove_columns/test/test_remove_without_unmatched_filter.yml +2 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +458 -28
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/gradlew +30 -21
- data/gradlew.bat +4 -10
- data/lib/embulk/command/embulk_migrate_plugin.rb +2 -2
- data/lib/embulk/data/new/java/build.gradle.erb +5 -3
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/lib/embulk/data/new/java/gradlew +30 -21
- data/lib/embulk/data/new/java/gradlew.bat +4 -10
- data/lib/embulk/guess/csv.rb +44 -22
- data/lib/embulk/guess/newline.rb +10 -4
- data/lib/embulk/guess_plugin.rb +3 -1
- data/lib/embulk/java/time_helper.rb +2 -2
- data/lib/embulk/version.rb +1 -1
- metadata +92 -5
@@ -835,6 +835,40 @@ Columns: not recommended
|
|
835
835
|
.. hint::
|
836
836
|
``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
|
837
837
|
|
838
|
+
|
839
|
+
Remove columns filter plugin
|
840
|
+
-----------------------------
|
841
|
+
|
842
|
+
The ``remove_columns`` filter plugin removes columns from schema.
|
843
|
+
|
844
|
+
Options
|
845
|
+
~~~~~~~~
|
846
|
+
|
847
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
848
|
+
| name | type | description | required? |
|
849
|
+
+==========================+==========+============================================================+=======================+
|
850
|
+
| remove | array | An array of names of columns that it removes from schema. | ``[]`` by default |
|
851
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
852
|
+
| keep | array | An array of names of columns that it keeps in schema. | ``[]`` by default |
|
853
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
854
|
+
| accept_unmatched_columns | boolean | If true, skip columns that aren't included in schemas. | ``false`` by default |
|
855
|
+
+--------------------------+----------+------------------------------------------------------------+-----------------------+
|
856
|
+
|
857
|
+
|
858
|
+
The ``remove`` and ``keep`` options are mutually exclusive: specify exactly one of them.
|
859
|
+
|
860
|
+
Example
|
861
|
+
~~~~~~~~
|
862
|
+
|
863
|
+
.. code-block:: yaml
|
864
|
+
|
865
|
+
# This configuration removes the columns named "_c0" and "_c1" from the schema.
|
866
|
+
filters:
|
867
|
+
...
|
868
|
+
- type: remove_columns
|
869
|
+
remove: ["_c0", "_c1"]
|
870
|
+
|
871
|
+
|
838
872
|
Local executor plugin
|
839
873
|
----------------------
|
840
874
|
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
Release 0.8.16
|
2
|
+
==================================
|
3
|
+
|
4
|
+
General Changes
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added remove_columns filter plugin [#530]
|
8
|
+
|
9
|
+
* http://www.embulk.org/docs/built-in.html#remove_columns-filter-plugin
|
10
|
+
|
11
|
+
* Supported timestamp format "%Q". (@hiroyuki-sato) [#468, #531]
|
12
|
+
|
13
|
+
* Improved csv guess plugin:
|
14
|
+
|
15
|
+
* Added semicolon as a delimiter candidate in csv guess plugin. [#527]
|
16
|
+
|
17
|
+
* Enabled suggesting for a few rows [#533]
|
18
|
+
|
19
|
+
* Enabled suggesting for a single column [#540]
|
20
|
+
|
21
|
+
* Changed and removed the limitation of a minimum size of 40 bytes for guessing. [#518]
|
22
|
+
|
23
|
+
* Refactored and introduced TestingEmbulk#{Input,Parser,Output}Builder to embulk-test. [#513, #514, #526]
|
24
|
+
|
25
|
+
* Fixed PageBuilder to avoid NullPointerException. [#535]
|
26
|
+
|
27
|
+
* Fixed ResumableInputStream to avoid NullPointerException. [#472]
|
28
|
+
|
29
|
+
* Fixed TaskValidationException to inherit ConfigException. [#520]
|
30
|
+
|
31
|
+
* Fixed build.gradle to use Task.doLast instead of Task.leftShift. [#536]
|
32
|
+
|
33
|
+
* Fixed build failure on AppVeyor by FileNotFoundException. [#537]
|
34
|
+
|
35
|
+
* Added updateJRuby task to make it easy to upgrade version of JRuby. [#538]
|
36
|
+
|
37
|
+
* Upgraded Gradle to v3.2.1. [#528]
|
38
|
+
|
39
|
+
* Release notes: https://docs.gradle.org/3.2.1/release-notes
|
40
|
+
|
41
|
+
Release Date
|
42
|
+
------------------
|
43
|
+
2017-01-27
|
@@ -0,0 +1,268 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import com.google.common.collect.ImmutableList;
|
5
|
+
import com.google.common.collect.ImmutableMap;
|
6
|
+
import com.google.inject.Inject;
|
7
|
+
import org.embulk.config.Config;
|
8
|
+
import org.embulk.config.ConfigDefault;
|
9
|
+
import org.embulk.config.ConfigException;
|
10
|
+
import org.embulk.config.ConfigSource;
|
11
|
+
import org.embulk.config.Task;
|
12
|
+
import org.embulk.config.TaskSource;
|
13
|
+
import org.embulk.spi.Column;
|
14
|
+
import org.embulk.spi.ColumnVisitor;
|
15
|
+
import org.embulk.spi.Exec;
|
16
|
+
import org.embulk.spi.FilterPlugin;
|
17
|
+
import org.embulk.spi.Page;
|
18
|
+
import org.embulk.spi.PageBuilder;
|
19
|
+
import org.embulk.spi.PageOutput;
|
20
|
+
import org.embulk.spi.PageReader;
|
21
|
+
import org.embulk.spi.Schema;
|
22
|
+
import org.embulk.spi.SchemaConfigException;
|
23
|
+
import org.slf4j.Logger;
|
24
|
+
|
25
|
+
import java.util.List;
|
26
|
+
import java.util.Map;
|
27
|
+
import java.util.HashMap;
|
28
|
+
|
29
|
+
import static java.util.Locale.ENGLISH;
|
30
|
+
import static org.embulk.spi.Exec.getBufferAllocator;
|
31
|
+
|
32
|
+
public class RemoveColumnsFilterPlugin
|
33
|
+
implements FilterPlugin
|
34
|
+
{
|
35
|
+
public interface PluginTask
|
36
|
+
extends Task
|
37
|
+
{
|
38
|
+
@Config("remove")
|
39
|
+
@ConfigDefault("null")
|
40
|
+
public Optional<List<String>> getRemove();
|
41
|
+
|
42
|
+
// TODO remove_pattern option
|
43
|
+
|
44
|
+
@Config("keep")
|
45
|
+
@ConfigDefault("null")
|
46
|
+
public Optional<List<String>> getKeep();
|
47
|
+
|
48
|
+
// TODO keep_pattern option
|
49
|
+
|
50
|
+
@Config("accept_unmatched_columns")
|
51
|
+
@ConfigDefault("false")
|
52
|
+
public boolean getAcceptUnmatchedColumns();
|
53
|
+
|
54
|
+
public void setIndexMapping(int[] mapping);
|
55
|
+
public int[] getIndexMapping();
|
56
|
+
}
|
57
|
+
|
58
|
+
private final Logger LOG;
|
59
|
+
|
60
|
+
@Inject
|
61
|
+
public RemoveColumnsFilterPlugin()
|
62
|
+
{
|
63
|
+
LOG = Exec.getLogger(getClass());
|
64
|
+
}
|
65
|
+
|
66
|
+
@Override
|
67
|
+
public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control)
|
68
|
+
{
|
69
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
70
|
+
|
71
|
+
// validate remove: and keep:
|
72
|
+
if (task.getRemove().isPresent() && task.getKeep().isPresent()) {
|
73
|
+
throw new ConfigException("remove: and keep: must not be multi-select");
|
74
|
+
}
|
75
|
+
if (!task.getRemove().isPresent() && !task.getKeep().isPresent()) {
|
76
|
+
throw new ConfigException("Must require remove: or keep:");
|
77
|
+
}
|
78
|
+
|
79
|
+
boolean acceptUnmatchedColumns = task.getAcceptUnmatchedColumns();
|
80
|
+
|
81
|
+
ImmutableList.Builder<Column> outputColumns = ImmutableList.builder();
|
82
|
+
int index = 0;
|
83
|
+
int[] indexMapping = new int[inputSchema.size()];
|
84
|
+
for (int i = 0; i < indexMapping.length; i++) {
|
85
|
+
indexMapping[i] = -1;
|
86
|
+
}
|
87
|
+
if (task.getRemove().isPresent()) { // specify remove:
|
88
|
+
List<String> removeColumns = getExistentColumns(inputSchema, task.getRemove().get(), acceptUnmatchedColumns);
|
89
|
+
for (Column column : inputSchema.getColumns()) {
|
90
|
+
if (!removeColumns.contains(column.getName())) {
|
91
|
+
outputColumns.add(new Column(index, column.getName(), column.getType()));
|
92
|
+
indexMapping[column.getIndex()] = index;
|
93
|
+
index++;
|
94
|
+
}
|
95
|
+
}
|
96
|
+
}
|
97
|
+
else { // specify keep:
|
98
|
+
List<String> keepColumns = getExistentColumns(inputSchema, task.getKeep().get(), acceptUnmatchedColumns);
|
99
|
+
for (Column column : inputSchema.getColumns()) {
|
100
|
+
if (keepColumns.contains(column.getName())) {
|
101
|
+
outputColumns.add(new Column(index, column.getName(), column.getType()));
|
102
|
+
indexMapping[column.getIndex()] = index;
|
103
|
+
index++;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
}
|
107
|
+
|
108
|
+
task.setIndexMapping(indexMapping);
|
109
|
+
control.run(task.dump(), new Schema(outputColumns.build()));
|
110
|
+
}
|
111
|
+
|
112
|
+
private List<String> getExistentColumns(Schema schema, List<String> specifiedColumns, boolean acceptUnmatch)
|
113
|
+
{
|
114
|
+
ImmutableList.Builder<String> existentColumns = ImmutableList.builder();
|
115
|
+
for (String column : specifiedColumns) {
|
116
|
+
try {
|
117
|
+
schema.lookupColumn(column);
|
118
|
+
existentColumns.add(column);
|
119
|
+
}
|
120
|
+
catch (SchemaConfigException e) {
|
121
|
+
if (!acceptUnmatch) {
|
122
|
+
throw new ConfigException(String.format(ENGLISH, "Column '%s' doesn't exist in the schema", column));
|
123
|
+
}
|
124
|
+
}
|
125
|
+
}
|
126
|
+
return existentColumns.build();
|
127
|
+
}
|
128
|
+
|
129
|
+
@Override
|
130
|
+
public PageOutput open(TaskSource taskSource, Schema inputSchema,
|
131
|
+
Schema outputSchema, PageOutput output)
|
132
|
+
{
|
133
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
134
|
+
PageReader pageReader = new PageReader(inputSchema);
|
135
|
+
PageBuilder pageBuilder = new PageBuilder(getBufferAllocator(), outputSchema, output);
|
136
|
+
return new PageConverter(pageReader, pageBuilder, task.getIndexMapping());
|
137
|
+
}
|
138
|
+
|
139
|
+
static class PageConverter
|
140
|
+
implements PageOutput
|
141
|
+
{
|
142
|
+
private final PageReader pageReader;
|
143
|
+
private final PageBuilder pageBuilder;
|
144
|
+
private final int[] indexMapping;
|
145
|
+
|
146
|
+
PageConverter(PageReader pageReader, PageBuilder pageBuilder, int[] indexMapping)
|
147
|
+
{
|
148
|
+
this.pageReader = pageReader;
|
149
|
+
this.pageBuilder = pageBuilder;
|
150
|
+
this.indexMapping = indexMapping;
|
151
|
+
}
|
152
|
+
|
153
|
+
@Override
|
154
|
+
public void add(Page page)
|
155
|
+
{
|
156
|
+
pageReader.setPage(page);
|
157
|
+
while (pageReader.nextRecord()) {
|
158
|
+
pageReader.getSchema().visitColumns(new ColumnVisitor() {
|
159
|
+
@Override
|
160
|
+
public void booleanColumn(Column inputColumn)
|
161
|
+
{
|
162
|
+
int index = indexMapping[inputColumn.getIndex()];
|
163
|
+
if (index >= 0) {
|
164
|
+
if (pageReader.isNull(inputColumn)) {
|
165
|
+
pageBuilder.setNull(index);
|
166
|
+
}
|
167
|
+
else {
|
168
|
+
pageBuilder.setBoolean(index, pageReader.getBoolean(inputColumn));
|
169
|
+
}
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
@Override
|
174
|
+
public void longColumn(Column inputColumn)
|
175
|
+
{
|
176
|
+
int index = indexMapping[inputColumn.getIndex()];
|
177
|
+
if (index >= 0) {
|
178
|
+
if (pageReader.isNull(inputColumn)) {
|
179
|
+
pageBuilder.setNull(index);
|
180
|
+
}
|
181
|
+
else {
|
182
|
+
pageBuilder.setLong(index, pageReader.getLong(inputColumn));
|
183
|
+
}
|
184
|
+
}
|
185
|
+
}
|
186
|
+
|
187
|
+
@Override
|
188
|
+
public void doubleColumn(Column inputColumn)
|
189
|
+
{
|
190
|
+
int index = indexMapping[inputColumn.getIndex()];
|
191
|
+
if (index >= 0) {
|
192
|
+
if (pageReader.isNull(inputColumn)) {
|
193
|
+
pageBuilder.setNull(index);
|
194
|
+
}
|
195
|
+
else {
|
196
|
+
pageBuilder.setDouble(index, pageReader.getDouble(inputColumn));
|
197
|
+
}
|
198
|
+
}
|
199
|
+
}
|
200
|
+
|
201
|
+
@Override
|
202
|
+
public void stringColumn(Column inputColumn)
|
203
|
+
{
|
204
|
+
int index = indexMapping[inputColumn.getIndex()];
|
205
|
+
if (index >= 0) {
|
206
|
+
if (pageReader.isNull(inputColumn)) {
|
207
|
+
pageBuilder.setNull(index);
|
208
|
+
}
|
209
|
+
else {
|
210
|
+
pageBuilder.setString(index, pageReader.getString(inputColumn));
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
@Override
|
216
|
+
public void timestampColumn(Column inputColumn)
|
217
|
+
{
|
218
|
+
int index = indexMapping[inputColumn.getIndex()];
|
219
|
+
if (index >= 0) {
|
220
|
+
if (pageReader.isNull(inputColumn)) {
|
221
|
+
pageBuilder.setNull(index);
|
222
|
+
}
|
223
|
+
else {
|
224
|
+
pageBuilder.setTimestamp(index, pageReader.getTimestamp(inputColumn));
|
225
|
+
}
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
@Override
|
230
|
+
public void jsonColumn(Column inputColumn)
|
231
|
+
{
|
232
|
+
int index = indexMapping[inputColumn.getIndex()];
|
233
|
+
if (index >= 0) {
|
234
|
+
if (pageReader.isNull(inputColumn)) {
|
235
|
+
pageBuilder.setNull(index);
|
236
|
+
}
|
237
|
+
else {
|
238
|
+
pageBuilder.setJson(index, pageReader.getJson(inputColumn));
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
});
|
243
|
+
pageBuilder.addRecord();
|
244
|
+
}
|
245
|
+
}
|
246
|
+
|
247
|
+
private Map<String, Integer> newColumnIndex(Schema schema)
|
248
|
+
{
|
249
|
+
ImmutableMap.Builder<String, Integer> builder = ImmutableMap.builder();
|
250
|
+
for (Column column : schema.getColumns()) {
|
251
|
+
builder.put(column.getName(), column.getIndex());
|
252
|
+
}
|
253
|
+
return builder.build();
|
254
|
+
}
|
255
|
+
|
256
|
+
@Override
|
257
|
+
public void finish()
|
258
|
+
{
|
259
|
+
pageBuilder.finish();
|
260
|
+
}
|
261
|
+
|
262
|
+
@Override
|
263
|
+
public void close()
|
264
|
+
{
|
265
|
+
pageBuilder.close();
|
266
|
+
}
|
267
|
+
}
|
268
|
+
}
|
@@ -30,6 +30,19 @@ import java.util.regex.PatternSyntaxException;
|
|
30
30
|
import javax.validation.constraints.Min;
|
31
31
|
import javax.validation.constraints.Size;
|
32
32
|
|
33
|
+
/**
|
34
|
+
* |RenameFilterPlugin| renames column names.
|
35
|
+
*
|
36
|
+
* NOTE: This filter should always behave in the same way for the same configuration.
|
37
|
+
* Changes in its behavior confuse users who are working with the same configuration.
|
38
|
+
*
|
39
|
+
* Even when a buggy behavior is found, fix it by:
|
40
|
+
* 1) Adding a new option, and
|
41
|
+
* 2) Implementing a new behavior in the new option.
|
42
|
+
*
|
43
|
+
* Keep the buggy behavior with the old configuration except for fatal failures so
|
44
|
+
* that users are not confused.
|
45
|
+
*/
|
33
46
|
public class RenameFilterPlugin
|
34
47
|
implements FilterPlugin
|
35
48
|
{
|
@@ -47,6 +47,7 @@ public class StandardPluginModule
|
|
47
47
|
|
48
48
|
// filter plugins
|
49
49
|
registerPluginTo(binder, FilterPlugin.class, "rename", RenameFilterPlugin.class);
|
50
|
+
registerPluginTo(binder, FilterPlugin.class, "remove_columns", RemoveColumnsFilterPlugin.class);
|
50
51
|
|
51
52
|
// default guess plugins
|
52
53
|
registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
|
@@ -0,0 +1,121 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import org.embulk.config.ConfigException;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.exec.PartialExecutionException;
|
7
|
+
import org.embulk.test.TestingEmbulk;
|
8
|
+
import org.junit.Rule;
|
9
|
+
import org.junit.Test;
|
10
|
+
|
11
|
+
import java.io.IOException;
|
12
|
+
import java.nio.file.Path;
|
13
|
+
|
14
|
+
import static org.embulk.test.EmbulkTests.copyResource;
|
15
|
+
import static org.embulk.test.EmbulkTests.readResource;
|
16
|
+
import static org.embulk.test.EmbulkTests.readSortedFile;
|
17
|
+
import static org.hamcrest.Matchers.is;
|
18
|
+
import static org.junit.Assert.assertThat;
|
19
|
+
import static org.junit.Assert.assertTrue;
|
20
|
+
import static org.junit.Assert.fail;
|
21
|
+
|
22
|
+
public class TestRemoveColumnsFilterPlugin
|
23
|
+
{
|
24
|
+
private static final String RESOURCE_NAME_PREFIX = "org/embulk/standards/remove_columns/test/";
|
25
|
+
|
26
|
+
@Rule
|
27
|
+
public TestingEmbulk embulk = TestingEmbulk.builder().build();
|
28
|
+
|
29
|
+
@Test
|
30
|
+
public void useKeepOption()
|
31
|
+
throws Exception
|
32
|
+
{
|
33
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_filter.yml",
|
34
|
+
"test_keep.csv", "test_keep_expected.csv");
|
35
|
+
}
|
36
|
+
|
37
|
+
@Test
|
38
|
+
public void useKeepWithAcceptUnmatched()
|
39
|
+
throws Exception
|
40
|
+
{
|
41
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_with_unmatched_filter.yml",
|
42
|
+
"test_keep.csv", "test_keep_expected.csv");
|
43
|
+
}
|
44
|
+
|
45
|
+
@Test
|
46
|
+
public void useKeepWithoutAcceptUnmatched()
|
47
|
+
throws Exception
|
48
|
+
{
|
49
|
+
try {
|
50
|
+
assertRecordsByResource(embulk, "test_keep_in.yml", "test_keep_without_unmatched_filter.yml",
|
51
|
+
"test_keep.csv", "test_keep_expected.csv");
|
52
|
+
fail();
|
53
|
+
}
|
54
|
+
catch (PartialExecutionException ex) {
|
55
|
+
assertTrue(ex.getCause() instanceof ConfigException);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
@Test
|
60
|
+
public void useKeepWithDuplicatedColumnNames()
|
61
|
+
throws Exception
|
62
|
+
{
|
63
|
+
assertRecordsByResource(embulk, "test_keep_with_duplicated_column_names_in.yml", "test_keep_with_duplicated_column_names.yml",
|
64
|
+
"test_keep_with_duplicated_column_names.csv", "test_keep_with_duplicated_column_names_expected.csv");
|
65
|
+
}
|
66
|
+
|
67
|
+
@Test
|
68
|
+
public void useRemove()
|
69
|
+
throws Exception
|
70
|
+
{
|
71
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_filter.yml",
|
72
|
+
"test_remove.csv", "test_remove_expected.csv");
|
73
|
+
}
|
74
|
+
|
75
|
+
@Test
|
76
|
+
public void useRemoveWithAcceptUnmatched()
|
77
|
+
throws Exception
|
78
|
+
{
|
79
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_with_unmatched_filter.yml",
|
80
|
+
"test_remove.csv", "test_remove_expected.csv");
|
81
|
+
}
|
82
|
+
|
83
|
+
@Test
|
84
|
+
public void useRemoveWithoutAcceptUnmatched()
|
85
|
+
throws Exception
|
86
|
+
{
|
87
|
+
try {
|
88
|
+
assertRecordsByResource(embulk, "test_remove_in.yml", "test_remove_without_unmatched_filter.yml",
|
89
|
+
"test_remove.csv", "test_remove_expected.csv");
|
90
|
+
fail();
|
91
|
+
}
|
92
|
+
catch (PartialExecutionException ex) {
|
93
|
+
assertTrue(ex.getCause() instanceof ConfigException);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
static void assertRecordsByResource(TestingEmbulk embulk,
|
98
|
+
String inConfigYamlResourceName, String filterConfigYamlResourceName,
|
99
|
+
String sourceCsvResourceName, String resultCsvResourceName)
|
100
|
+
throws IOException
|
101
|
+
{
|
102
|
+
Path inputPath = embulk.createTempFile("csv");
|
103
|
+
Path outputPath = embulk.createTempFile("csv");
|
104
|
+
|
105
|
+
// in: config
|
106
|
+
copyResource(RESOURCE_NAME_PREFIX + sourceCsvResourceName, inputPath);
|
107
|
+
ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
|
108
|
+
.set("path_prefix", inputPath.toAbsolutePath().toString());
|
109
|
+
|
110
|
+
// remove_columns filter config
|
111
|
+
ConfigSource filterConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + filterConfigYamlResourceName);
|
112
|
+
|
113
|
+
TestingEmbulk.RunResult result = embulk.inputBuilder()
|
114
|
+
.in(inConfig)
|
115
|
+
.filters(ImmutableList.of(filterConfig))
|
116
|
+
.outputPath(outputPath)
|
117
|
+
.run();
|
118
|
+
|
119
|
+
assertThat(readSortedFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
|
120
|
+
}
|
121
|
+
}
|