embulk-filter-calcite 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +4 -0
  4. data/CHANGELOG.md +3 -0
  5. data/README.md +61 -0
  6. data/build.gradle +112 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  10. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  11. data/gradlew +160 -0
  12. data/gradlew.bat +90 -0
  13. data/lib/embulk/filter/calcite.rb +3 -0
  14. data/src/main/java/org/embulk/filter/calcite/CalciteFilterPlugin.java +309 -0
  15. data/src/main/java/org/embulk/filter/calcite/PageConverter.java +119 -0
  16. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageEnumerator.java +56 -0
  17. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageFieldType.java +44 -0
  18. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageSchema.java +26 -0
  19. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageSchemaFactory.java +26 -0
  20. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageTable.java +69 -0
  21. data/src/main/java/org/embulk/filter/calcite/getter/FilterColumnGetterFactory.java +45 -0
  22. data/src/main/java/org/embulk/filter/calcite/getter/UTCTimestampColumnGetter.java +41 -0
  23. data/src/test/java/org/embulk/filter/calcite/TestCalciteFilterPlugin.java +96 -0
  24. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_expected.csv +4 -0
  25. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_filter.yml +2 -0
  26. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_in.yml +18 -0
  27. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_source.csv +5 -0
  28. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_expected.csv +4 -0
  29. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_filter.yml +2 -0
  30. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_in.yml +18 -0
  31. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_source.csv +5 -0
  32. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_expected.csv +4 -0
  33. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_filter.yml +2 -0
  34. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_in.yml +18 -0
  35. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_source.csv +5 -0
  36. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_expected.csv +2 -0
  37. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_filter.yml +2 -0
  38. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_in.yml +18 -0
  39. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_source.csv +5 -0
  40. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_expected.csv +2 -0
  41. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_filter.yml +2 -0
  42. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_in.yml +18 -0
  43. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_source.csv +5 -0
  44. metadata +137 -0
@@ -0,0 +1,119 @@
1
+ package org.embulk.filter.calcite;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.ColumnVisitor;
5
+ import org.embulk.spi.PageReader;
6
+ import org.embulk.spi.Schema;
7
+
8
+ import java.math.BigDecimal;
9
+ import java.util.TimeZone;
10
+
11
+ /**
12
+ * This class converts Embulk's Page values into Calcite's row types. It refers to
13
+ * org.apache.calcite.adapter.csv.CsvEnumerator.
14
+ */
15
+ public class PageConverter
16
+ implements ColumnVisitor
17
+ {
18
+ private final TimeZone defaultTimeZone;
19
+ private final Object[] row;
20
+ private PageReader pageReader;
21
+
22
+ public PageConverter(Schema schema, TimeZone defaultTimeZone)
23
+ {
24
+ this.defaultTimeZone = defaultTimeZone;
25
+ this.row = new Object[schema.getColumnCount()];
26
+ }
27
+
28
+ public Object[] getRow()
29
+ {
30
+ return row;
31
+ }
32
+
33
+ public void setPageReader(PageReader pageReader)
34
+ {
35
+ this.pageReader = pageReader;
36
+ }
37
+
38
+ @Override
39
+ public void booleanColumn(Column column)
40
+ {
41
+ // Embulk's boolean is converted into Java's boolean
42
+ int i = column.getIndex();
43
+ if (pageReader.isNull(i)) {
44
+ row[i] = null;
45
+ }
46
+ else {
47
+ row[i] = pageReader.getBoolean(i);
48
+ }
49
+ }
50
+
51
+ @Override
52
+ public void longColumn(Column column)
53
+ {
54
+ // Embulk's long is converted into long type
55
+ int i = column.getIndex();
56
+ if (pageReader.isNull(i)) {
57
+ row[i] = null;
58
+ }
59
+ else {
60
+ row[i] = pageReader.getLong(i);
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void doubleColumn(Column column)
66
+ {
67
+ // Embulk's double is converted into java.math.BigDecimal
68
+ int i = column.getIndex();
69
+ if (pageReader.isNull(i)) {
70
+ row[i] = null;
71
+ }
72
+ else {
73
+ row[i] = new BigDecimal(pageReader.getDouble(i));
74
+ }
75
+ }
76
+
77
+ @Override
78
+ public void stringColumn(Column column)
79
+ {
80
+ // Embulk's string is converted into java.lang.String
81
+ int i = column.getIndex();
82
+ if (pageReader.isNull(i)) {
83
+ row[i] = null;
84
+ }
85
+ else {
86
+ row[i] = pageReader.getString(i);
87
+ }
88
+ }
89
+
90
+ @Override
91
+ public void timestampColumn(Column column)
92
+ {
93
+ int i = column.getIndex();
94
+ if (pageReader.isNull(i)) {
95
+ row[i] = null;
96
+ }
97
+ else {
98
+ // Embulk's timestamp is converted into java.sql.Timestmap
99
+ org.embulk.spi.time.Timestamp timestamp = pageReader.getTimestamp(i);
100
+ long milliseconds = timestamp.getEpochSecond() * 1000 + timestamp.getNano() / 1000000;
101
+ java.sql.Timestamp ts = new java.sql.Timestamp(milliseconds);
102
+ ts.setNanos(timestamp.getNano());
103
+ row[i] = ts;
104
+ }
105
+ }
106
+
107
+ @Override
108
+ public void jsonColumn(Column column)
109
+ {
110
+ // Embulk's json is converted into Java's string
111
+ int i = column.getIndex();
112
+ if (pageReader.isNull(i)) {
113
+ row[i] = null;
114
+ }
115
+ else {
116
+ row[i] = pageReader.getJson(i).toJson();
117
+ }
118
+ }
119
+ }
@@ -0,0 +1,56 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.linq4j.Enumerator;
4
+ import org.embulk.filter.calcite.PageConverter;
5
+ import org.embulk.spi.Page;
6
+ import org.embulk.spi.PageReader;
7
+ import org.embulk.spi.Schema;
8
+
9
+ public class PageEnumerator
10
+ implements Enumerator<Object[]>
11
+ {
12
+ private final Schema schema;
13
+ private final PageConverter pageConverter;
14
+ private final PageReader pageReader;
15
+
16
+ public PageEnumerator(Schema schema, PageConverter pageConverter)
17
+ {
18
+ this.schema = schema;
19
+ this.pageReader = new PageReader(schema);
20
+ this.pageConverter = pageConverter;
21
+ }
22
+
23
+ public void setPage(Page page)
24
+ {
25
+ this.pageReader.setPage(page);
26
+ this.pageConverter.setPageReader(pageReader);
27
+ }
28
+
29
+ @Override
30
+ public Object[] current()
31
+ {
32
+ // this is called from org.apache.calcite.linq4j.EnumerableDefaults
33
+ schema.visitColumns(pageConverter);
34
+ return pageConverter.getRow();
35
+ }
36
+
37
+ @Override
38
+ public boolean moveNext()
39
+ {
40
+ return pageReader.nextRecord();
41
+ }
42
+
43
+ @Override
44
+ public void reset()
45
+ {
46
+ throw new UnsupportedOperationException();
47
+ }
48
+
49
+ @Override
50
+ public void close()
51
+ {
52
+ if (pageReader != null) {
53
+ pageReader.close();
54
+ }
55
+ }
56
+ }
@@ -0,0 +1,44 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.adapter.java.JavaTypeFactory;
4
+ import org.apache.calcite.rel.type.RelDataType;
5
+
6
+ import java.util.HashMap;
7
+ import java.util.Map;
8
+
9
+ enum PageFieldType
10
+ {
11
+ STRING(String.class, "string"),
12
+ BOOLEAN(Boolean.class, Boolean.TYPE.getSimpleName()),
13
+ LONG(Long.class, Long.TYPE.getSimpleName()),
14
+ DOUBLE(Double.class, Double.TYPE.getSimpleName()),
15
+ TIMESTAMP(java.sql.Timestamp.class, "timestamp");
16
+
17
+ private static final Map<String, PageFieldType> MAP = new HashMap<>();
18
+
19
+ static
20
+ {
21
+ for (PageFieldType value : values()) {
22
+ MAP.put(value.simpleName, value);
23
+ }
24
+ }
25
+
26
+ private final Class clazz;
27
+ private final String simpleName;
28
+
29
+ private PageFieldType(Class clazz, String simpleName)
30
+ {
31
+ this.clazz = clazz;
32
+ this.simpleName = simpleName;
33
+ }
34
+
35
+ public RelDataType toType(JavaTypeFactory typeFactory)
36
+ {
37
+ return typeFactory.createJavaType(clazz);
38
+ }
39
+
40
+ public static PageFieldType of(String typeString)
41
+ {
42
+ return MAP.get(typeString);
43
+ }
44
+ }
@@ -0,0 +1,26 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.apache.calcite.schema.Table;
5
+ import org.apache.calcite.schema.impl.AbstractSchema;
6
+ import org.embulk.filter.calcite.PageConverter;
7
+ import org.embulk.spi.Schema;
8
+
9
+ import java.util.Map;
10
+
11
+ public class PageSchema
12
+ extends AbstractSchema
13
+ {
14
+ public static Schema schema;
15
+
16
+ public PageSchema()
17
+ {
18
+ super();
19
+ }
20
+
21
+ @Override
22
+ protected Map<String, Table> getTableMap()
23
+ {
24
+ return ImmutableMap.<String, Table>of("$PAGES", new PageTable(schema, null));
25
+ }
26
+ }
@@ -0,0 +1,26 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.schema.SchemaFactory;
4
+ import org.apache.calcite.schema.SchemaPlus;
5
+
6
+ import java.util.Map;
7
+
8
+ /**
9
+ * Factory that creates a {@link PageSchema}.
10
+ * @see https://github.com/apache/calcite/blob/master/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvSchemaFactory.java
11
+ */
12
+ public class PageSchemaFactory
13
+ implements SchemaFactory
14
+ {
15
+ public static final PageSchemaFactory INSTANCE = new PageSchemaFactory();
16
+
17
+ private PageSchemaFactory()
18
+ {
19
+ }
20
+
21
+ @Override
22
+ public org.apache.calcite.schema.Schema create(SchemaPlus parentSchema, String name, Map<String, Object> operand)
23
+ {
24
+ return new PageSchema();
25
+ }
26
+ }
@@ -0,0 +1,69 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.DataContext;
4
+ import org.apache.calcite.adapter.java.JavaTypeFactory;
5
+ import org.apache.calcite.linq4j.AbstractEnumerable;
6
+ import org.apache.calcite.linq4j.Enumerable;
7
+ import org.apache.calcite.linq4j.Enumerator;
8
+ import org.apache.calcite.rel.type.RelDataType;
9
+ import org.apache.calcite.rel.type.RelDataTypeFactory;
10
+ import org.apache.calcite.rel.type.RelProtoDataType;
11
+ import org.apache.calcite.schema.ScannableTable;
12
+ import org.apache.calcite.schema.impl.AbstractTable;
13
+ import org.apache.calcite.util.Pair;
14
+ import org.embulk.filter.calcite.PageConverter;
15
+ import org.embulk.spi.Column;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.Schema;
18
+
19
+ import java.util.ArrayList;
20
+ import java.util.List;
21
+
22
+ public class PageTable
23
+ extends AbstractTable
24
+ implements ScannableTable
25
+ {
26
+ public static ThreadLocal<PageConverter> pageConverter = new ThreadLocal<>();
27
+ public static ThreadLocal<Page> page = new ThreadLocal<>();
28
+
29
+ private final Schema schema;
30
+ private final RelProtoDataType protoRowType;
31
+
32
+ PageTable(Schema schema, RelProtoDataType protoRowType)
33
+ {
34
+ this.schema = schema;
35
+ this.protoRowType = protoRowType;
36
+ }
37
+
38
+ public RelDataType getRowType(RelDataTypeFactory typeFactory)
39
+ {
40
+ if (protoRowType != null) {
41
+ return protoRowType.apply(typeFactory);
42
+ }
43
+
44
+ final List<RelDataType> types = new ArrayList<>(schema.getColumnCount());
45
+ final List<String> names = new ArrayList<>(schema.getColumnCount());
46
+
47
+ for (Column column : schema.getColumns()) {
48
+ names.add(column.getName());
49
+ PageFieldType type = PageFieldType.of(column.getType().getName());
50
+ types.add(type.toType((JavaTypeFactory) typeFactory));
51
+ }
52
+
53
+ return typeFactory.createStructType(Pair.zip(names, types));
54
+ }
55
+
56
+ public Enumerable<Object[]> scan(DataContext root)
57
+ {
58
+ return new AbstractEnumerable<Object[]>() {
59
+ public Enumerator<Object[]> enumerator()
60
+ {
61
+ PageEnumerator enumerator = new PageEnumerator(schema, pageConverter.get());
62
+ if (page.get() != null) {
63
+ enumerator.setPage(page.get());
64
+ }
65
+ return enumerator;
66
+ }
67
+ };
68
+ }
69
+ }
@@ -0,0 +1,45 @@
1
+ package org.embulk.filter.calcite.getter;
2
+
3
+ import org.embulk.input.jdbc.AbstractJdbcInputPlugin;
4
+ import org.embulk.input.jdbc.JdbcColumn;
5
+ import org.embulk.input.jdbc.JdbcColumnOption;
6
+ import org.embulk.input.jdbc.JdbcInputConnection;
7
+ import org.embulk.input.jdbc.getter.ColumnGetter;
8
+ import org.embulk.input.jdbc.getter.ColumnGetterFactory;
9
+ import org.embulk.spi.PageBuilder;
10
+ import org.embulk.spi.time.TimestampFormatter;
11
+ import org.embulk.spi.type.Type;
12
+ import org.joda.time.DateTimeZone;
13
+
14
+ public class FilterColumnGetterFactory
15
+ extends ColumnGetterFactory
16
+ {
17
+ private final DateTimeZone defaultTimeZone;
18
+
19
+ public FilterColumnGetterFactory(PageBuilder to, DateTimeZone defaultTimeZone)
20
+ {
21
+ super(to, defaultTimeZone);
22
+ this.defaultTimeZone = defaultTimeZone; // TODO make change super.defaultTimeZone field protected
23
+ }
24
+
25
+ @Override
26
+ public ColumnGetter newColumnGetter(JdbcInputConnection con, AbstractJdbcInputPlugin.PluginTask task, JdbcColumn column, JdbcColumnOption option)
27
+ {
28
+ String valueType = option.getValueType();
29
+ Type toType = getToType(option);
30
+ if (valueType.equals("coalesce") && sqlTypeToValueType(column, column.getSqlType()).equals("timestamp")) {
31
+ return new UTCTimestampColumnGetter(to, toType, newTimestampFormatter(option, "%Y-%m-%d"));
32
+ }
33
+ else {
34
+ return super.newColumnGetter(con, task, column, option);
35
+ }
36
+ }
37
+
38
+ private TimestampFormatter newTimestampFormatter(JdbcColumnOption option, String defaultTimestampFormat)
39
+ {
40
+ return new TimestampFormatter(
41
+ option.getJRuby(),
42
+ option.getTimestampFormat().isPresent() ? option.getTimestampFormat().get().getFormat() : defaultTimestampFormat,
43
+ option.getTimeZone().or(defaultTimeZone));
44
+ }
45
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.filter.calcite.getter;
2
+
3
+ import org.embulk.input.jdbc.getter.TimestampColumnGetter;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampFormatter;
7
+ import org.embulk.spi.type.Type;
8
+
9
+ import java.sql.ResultSet;
10
+ import java.sql.SQLException;
11
+ import java.util.Calendar;
12
+
13
+ import static java.util.Calendar.getInstance;
14
+ import static java.util.TimeZone.getTimeZone;
15
+
16
+ public class UTCTimestampColumnGetter
17
+ extends TimestampColumnGetter
18
+ {
19
+ private static ThreadLocal<Calendar> calendar = new ThreadLocal<Calendar>() {
20
+ @Override
21
+ protected Calendar initialValue()
22
+ {
23
+ return getInstance(getTimeZone("UTC"));
24
+ }
25
+ };
26
+
27
+ public UTCTimestampColumnGetter(PageBuilder to, Type toType, TimestampFormatter timestampFormatter)
28
+ {
29
+ super(to, toType, timestampFormatter);
30
+ }
31
+
32
+ @Override
33
+ protected void fetch(ResultSet from, int fromIndex)
34
+ throws SQLException
35
+ {
36
+ java.sql.Timestamp timestamp = from.getTimestamp(fromIndex, calendar.get());
37
+ if (timestamp != null) {
38
+ value = Timestamp.ofEpochSecond(timestamp.getTime() / 1000, timestamp.getNanos());
39
+ }
40
+ }
41
+ }
@@ -0,0 +1,96 @@
1
+ package org.embulk.filter.calcite;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.config.ConfigSource;
5
+ import org.embulk.spi.FilterPlugin;
6
+ import org.embulk.test.TestingEmbulk;
7
+ import org.junit.Before;
8
+ import org.junit.Rule;
9
+ import org.junit.Test;
10
+
11
+ import java.io.IOException;
12
+ import java.nio.file.Path;
13
+
14
+ import static org.embulk.test.EmbulkTests.copyResource;
15
+ import static org.embulk.test.EmbulkTests.readResource;
16
+ import static org.embulk.test.EmbulkTests.readSortedFile;
17
+ import static org.hamcrest.Matchers.is;
18
+ import static org.junit.Assert.assertThat;
19
+
20
+ public class TestCalciteFilterPlugin
21
+ {
22
+ private static final String RESOURCE_NAME_PREFIX = "org/embulk/filter/calcite/test/";
23
+
24
+ @Rule
25
+ public TestingEmbulk embulk = TestingEmbulk.builder()
26
+ .registerPlugin(FilterPlugin.class, "calcite", CalciteFilterPlugin.class)
27
+ .build();
28
+
29
+ private ConfigSource baseConfig;
30
+
31
+ @Before
32
+ public void setup()
33
+ {
34
+ baseConfig = embulk.newConfig();
35
+ }
36
+
37
+ @Test
38
+ public void testSimple() throws Exception
39
+ {
40
+ assertRecordsByResource(embulk, "test_simple_in.yml", "test_simple_filter.yml",
41
+ "test_simple_source.csv", "test_simple_expected.csv");
42
+ }
43
+
44
+ @Test
45
+ public void testIntOperators() throws Exception
46
+ {
47
+ assertRecordsByResource(embulk, "test_int_ops_in.yml", "test_int_ops_filter.yml",
48
+ "test_int_ops_source.csv", "test_int_ops_expected.csv");
49
+ }
50
+
51
+ @Test
52
+ public void testWhereIntCondition() throws Exception
53
+ {
54
+ assertRecordsByResource(embulk, "test_where_int_cond_in.yml", "test_where_int_cond_filter.yml",
55
+ "test_where_int_cond_source.csv", "test_where_int_cond_expected.csv");
56
+ }
57
+
58
+ @Test
59
+ public void testStringOperators() throws Exception
60
+ {
61
+ assertRecordsByResource(embulk, "test_string_ops_in.yml", "test_string_ops_filter.yml",
62
+ "test_string_ops_source.csv", "test_string_ops_expected.csv");
63
+ }
64
+
65
+ @Test
66
+ public void testWhereStringCondition() throws Exception
67
+ {
68
+ assertRecordsByResource(embulk, "test_where_string_cond_in.yml", "test_where_string_cond_filter.yml",
69
+ "test_where_string_cond_source.csv", "test_where_string_cond_expected.csv");
70
+ }
71
+
72
+ static void assertRecordsByResource(TestingEmbulk embulk,
73
+ String inConfigYamlResourceName, String filterConfigYamlResourceName,
74
+ String sourceCsvResourceName, String resultCsvResourceName)
75
+ throws IOException
76
+ {
77
+ Path inputPath = embulk.createTempFile("csv");
78
+ Path outputPath = embulk.createTempFile("csv");
79
+
80
+ // in: config
81
+ copyResource(RESOURCE_NAME_PREFIX + sourceCsvResourceName, inputPath);
82
+ ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
83
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
84
+
85
+ // remove_columns filter config
86
+ ConfigSource filterConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + filterConfigYamlResourceName);
87
+
88
+ TestingEmbulk.RunResult result = embulk.inputBuilder()
89
+ .in(inConfig)
90
+ .filters(ImmutableList.of(filterConfig))
91
+ .outputPath(outputPath)
92
+ .run();
93
+
94
+ assertThat(readSortedFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
95
+ }
96
+ }