embulk-filter-calcite 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +4 -0
  4. data/CHANGELOG.md +3 -0
  5. data/README.md +61 -0
  6. data/build.gradle +112 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  10. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  11. data/gradlew +160 -0
  12. data/gradlew.bat +90 -0
  13. data/lib/embulk/filter/calcite.rb +3 -0
  14. data/src/main/java/org/embulk/filter/calcite/CalciteFilterPlugin.java +309 -0
  15. data/src/main/java/org/embulk/filter/calcite/PageConverter.java +119 -0
  16. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageEnumerator.java +56 -0
  17. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageFieldType.java +44 -0
  18. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageSchema.java +26 -0
  19. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageSchemaFactory.java +26 -0
  20. data/src/main/java/org/embulk/filter/calcite/adapter/page/PageTable.java +69 -0
  21. data/src/main/java/org/embulk/filter/calcite/getter/FilterColumnGetterFactory.java +45 -0
  22. data/src/main/java/org/embulk/filter/calcite/getter/UTCTimestampColumnGetter.java +41 -0
  23. data/src/test/java/org/embulk/filter/calcite/TestCalciteFilterPlugin.java +96 -0
  24. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_expected.csv +4 -0
  25. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_filter.yml +2 -0
  26. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_in.yml +18 -0
  27. data/src/test/resources/org/embulk/filter/calcite/test/test_int_ops_source.csv +5 -0
  28. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_expected.csv +4 -0
  29. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_filter.yml +2 -0
  30. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_in.yml +18 -0
  31. data/src/test/resources/org/embulk/filter/calcite/test/test_simple_source.csv +5 -0
  32. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_expected.csv +4 -0
  33. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_filter.yml +2 -0
  34. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_in.yml +18 -0
  35. data/src/test/resources/org/embulk/filter/calcite/test/test_string_ops_source.csv +5 -0
  36. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_expected.csv +2 -0
  37. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_filter.yml +2 -0
  38. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_in.yml +18 -0
  39. data/src/test/resources/org/embulk/filter/calcite/test/test_where_int_cond_source.csv +5 -0
  40. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_expected.csv +2 -0
  41. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_filter.yml +2 -0
  42. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_in.yml +18 -0
  43. data/src/test/resources/org/embulk/filter/calcite/test/test_where_string_cond_source.csv +5 -0
  44. metadata +137 -0
@@ -0,0 +1,119 @@
1
+ package org.embulk.filter.calcite;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.ColumnVisitor;
5
+ import org.embulk.spi.PageReader;
6
+ import org.embulk.spi.Schema;
7
+
8
+ import java.math.BigDecimal;
9
+ import java.util.TimeZone;
10
+
11
+ /**
12
+ * This class converts Embulk's Page values into Calcite's row types. It refers to
13
+ * org.apache.calcite.adapter.csv.CsvEnumerator.
14
+ */
15
+ public class PageConverter
16
+ implements ColumnVisitor
17
+ {
18
+ private final TimeZone defaultTimeZone;
19
+ private final Object[] row;
20
+ private PageReader pageReader;
21
+
22
+ public PageConverter(Schema schema, TimeZone defaultTimeZone)
23
+ {
24
+ this.defaultTimeZone = defaultTimeZone;
25
+ this.row = new Object[schema.getColumnCount()];
26
+ }
27
+
28
+ public Object[] getRow()
29
+ {
30
+ return row;
31
+ }
32
+
33
+ public void setPageReader(PageReader pageReader)
34
+ {
35
+ this.pageReader = pageReader;
36
+ }
37
+
38
+ @Override
39
+ public void booleanColumn(Column column)
40
+ {
41
+ // Embulk's boolean is converted into Java's boolean
42
+ int i = column.getIndex();
43
+ if (pageReader.isNull(i)) {
44
+ row[i] = null;
45
+ }
46
+ else {
47
+ row[i] = pageReader.getBoolean(i);
48
+ }
49
+ }
50
+
51
+ @Override
52
+ public void longColumn(Column column)
53
+ {
54
+ // Embulk's long is converted into long type
55
+ int i = column.getIndex();
56
+ if (pageReader.isNull(i)) {
57
+ row[i] = null;
58
+ }
59
+ else {
60
+ row[i] = pageReader.getLong(i);
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void doubleColumn(Column column)
66
+ {
67
+ // Embulk's double is converted into java.math.BigDecimal
68
+ int i = column.getIndex();
69
+ if (pageReader.isNull(i)) {
70
+ row[i] = null;
71
+ }
72
+ else {
73
+ row[i] = new BigDecimal(pageReader.getDouble(i));
74
+ }
75
+ }
76
+
77
+ @Override
78
+ public void stringColumn(Column column)
79
+ {
80
+ // Embulk's string is converted into java.lang.String
81
+ int i = column.getIndex();
82
+ if (pageReader.isNull(i)) {
83
+ row[i] = null;
84
+ }
85
+ else {
86
+ row[i] = pageReader.getString(i);
87
+ }
88
+ }
89
+
90
+ @Override
91
+ public void timestampColumn(Column column)
92
+ {
93
+ int i = column.getIndex();
94
+ if (pageReader.isNull(i)) {
95
+ row[i] = null;
96
+ }
97
+ else {
98
+ // Embulk's timestamp is converted into java.sql.Timestmap
99
+ org.embulk.spi.time.Timestamp timestamp = pageReader.getTimestamp(i);
100
+ long milliseconds = timestamp.getEpochSecond() * 1000 + timestamp.getNano() / 1000000;
101
+ java.sql.Timestamp ts = new java.sql.Timestamp(milliseconds);
102
+ ts.setNanos(timestamp.getNano());
103
+ row[i] = ts;
104
+ }
105
+ }
106
+
107
+ @Override
108
+ public void jsonColumn(Column column)
109
+ {
110
+ // Embulk's json is converted into Java's string
111
+ int i = column.getIndex();
112
+ if (pageReader.isNull(i)) {
113
+ row[i] = null;
114
+ }
115
+ else {
116
+ row[i] = pageReader.getJson(i).toJson();
117
+ }
118
+ }
119
+ }
@@ -0,0 +1,56 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.linq4j.Enumerator;
4
+ import org.embulk.filter.calcite.PageConverter;
5
+ import org.embulk.spi.Page;
6
+ import org.embulk.spi.PageReader;
7
+ import org.embulk.spi.Schema;
8
+
9
+ public class PageEnumerator
10
+ implements Enumerator<Object[]>
11
+ {
12
+ private final Schema schema;
13
+ private final PageConverter pageConverter;
14
+ private final PageReader pageReader;
15
+
16
+ public PageEnumerator(Schema schema, PageConverter pageConverter)
17
+ {
18
+ this.schema = schema;
19
+ this.pageReader = new PageReader(schema);
20
+ this.pageConverter = pageConverter;
21
+ }
22
+
23
+ public void setPage(Page page)
24
+ {
25
+ this.pageReader.setPage(page);
26
+ this.pageConverter.setPageReader(pageReader);
27
+ }
28
+
29
+ @Override
30
+ public Object[] current()
31
+ {
32
+ // this is called from org.apache.calcite.linq4j.EnumerableDefaults
33
+ schema.visitColumns(pageConverter);
34
+ return pageConverter.getRow();
35
+ }
36
+
37
+ @Override
38
+ public boolean moveNext()
39
+ {
40
+ return pageReader.nextRecord();
41
+ }
42
+
43
+ @Override
44
+ public void reset()
45
+ {
46
+ throw new UnsupportedOperationException();
47
+ }
48
+
49
+ @Override
50
+ public void close()
51
+ {
52
+ if (pageReader != null) {
53
+ pageReader.close();
54
+ }
55
+ }
56
+ }
@@ -0,0 +1,44 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.adapter.java.JavaTypeFactory;
4
+ import org.apache.calcite.rel.type.RelDataType;
5
+
6
+ import java.util.HashMap;
7
+ import java.util.Map;
8
+
9
+ enum PageFieldType
10
+ {
11
+ STRING(String.class, "string"),
12
+ BOOLEAN(Boolean.class, Boolean.TYPE.getSimpleName()),
13
+ LONG(Long.class, Long.TYPE.getSimpleName()),
14
+ DOUBLE(Double.class, Double.TYPE.getSimpleName()),
15
+ TIMESTAMP(java.sql.Timestamp.class, "timestamp");
16
+
17
+ private static final Map<String, PageFieldType> MAP = new HashMap<>();
18
+
19
+ static
20
+ {
21
+ for (PageFieldType value : values()) {
22
+ MAP.put(value.simpleName, value);
23
+ }
24
+ }
25
+
26
+ private final Class clazz;
27
+ private final String simpleName;
28
+
29
+ private PageFieldType(Class clazz, String simpleName)
30
+ {
31
+ this.clazz = clazz;
32
+ this.simpleName = simpleName;
33
+ }
34
+
35
+ public RelDataType toType(JavaTypeFactory typeFactory)
36
+ {
37
+ return typeFactory.createJavaType(clazz);
38
+ }
39
+
40
+ public static PageFieldType of(String typeString)
41
+ {
42
+ return MAP.get(typeString);
43
+ }
44
+ }
@@ -0,0 +1,26 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.apache.calcite.schema.Table;
5
+ import org.apache.calcite.schema.impl.AbstractSchema;
6
+ import org.embulk.filter.calcite.PageConverter;
7
+ import org.embulk.spi.Schema;
8
+
9
+ import java.util.Map;
10
+
11
+ public class PageSchema
12
+ extends AbstractSchema
13
+ {
14
+ public static Schema schema;
15
+
16
+ public PageSchema()
17
+ {
18
+ super();
19
+ }
20
+
21
+ @Override
22
+ protected Map<String, Table> getTableMap()
23
+ {
24
+ return ImmutableMap.<String, Table>of("$PAGES", new PageTable(schema, null));
25
+ }
26
+ }
@@ -0,0 +1,26 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.schema.SchemaFactory;
4
+ import org.apache.calcite.schema.SchemaPlus;
5
+
6
+ import java.util.Map;
7
+
8
+ /**
9
+ * Factory that creates a {@link PageSchema}.
10
+ * @see https://github.com/apache/calcite/blob/master/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvSchemaFactory.java
11
+ */
12
+ public class PageSchemaFactory
13
+ implements SchemaFactory
14
+ {
15
+ public static final PageSchemaFactory INSTANCE = new PageSchemaFactory();
16
+
17
+ private PageSchemaFactory()
18
+ {
19
+ }
20
+
21
+ @Override
22
+ public org.apache.calcite.schema.Schema create(SchemaPlus parentSchema, String name, Map<String, Object> operand)
23
+ {
24
+ return new PageSchema();
25
+ }
26
+ }
@@ -0,0 +1,69 @@
1
+ package org.embulk.filter.calcite.adapter.page;
2
+
3
+ import org.apache.calcite.DataContext;
4
+ import org.apache.calcite.adapter.java.JavaTypeFactory;
5
+ import org.apache.calcite.linq4j.AbstractEnumerable;
6
+ import org.apache.calcite.linq4j.Enumerable;
7
+ import org.apache.calcite.linq4j.Enumerator;
8
+ import org.apache.calcite.rel.type.RelDataType;
9
+ import org.apache.calcite.rel.type.RelDataTypeFactory;
10
+ import org.apache.calcite.rel.type.RelProtoDataType;
11
+ import org.apache.calcite.schema.ScannableTable;
12
+ import org.apache.calcite.schema.impl.AbstractTable;
13
+ import org.apache.calcite.util.Pair;
14
+ import org.embulk.filter.calcite.PageConverter;
15
+ import org.embulk.spi.Column;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.Schema;
18
+
19
+ import java.util.ArrayList;
20
+ import java.util.List;
21
+
22
+ public class PageTable
23
+ extends AbstractTable
24
+ implements ScannableTable
25
+ {
26
+ public static ThreadLocal<PageConverter> pageConverter = new ThreadLocal<>();
27
+ public static ThreadLocal<Page> page = new ThreadLocal<>();
28
+
29
+ private final Schema schema;
30
+ private final RelProtoDataType protoRowType;
31
+
32
+ PageTable(Schema schema, RelProtoDataType protoRowType)
33
+ {
34
+ this.schema = schema;
35
+ this.protoRowType = protoRowType;
36
+ }
37
+
38
+ public RelDataType getRowType(RelDataTypeFactory typeFactory)
39
+ {
40
+ if (protoRowType != null) {
41
+ return protoRowType.apply(typeFactory);
42
+ }
43
+
44
+ final List<RelDataType> types = new ArrayList<>(schema.getColumnCount());
45
+ final List<String> names = new ArrayList<>(schema.getColumnCount());
46
+
47
+ for (Column column : schema.getColumns()) {
48
+ names.add(column.getName());
49
+ PageFieldType type = PageFieldType.of(column.getType().getName());
50
+ types.add(type.toType((JavaTypeFactory) typeFactory));
51
+ }
52
+
53
+ return typeFactory.createStructType(Pair.zip(names, types));
54
+ }
55
+
56
+ public Enumerable<Object[]> scan(DataContext root)
57
+ {
58
+ return new AbstractEnumerable<Object[]>() {
59
+ public Enumerator<Object[]> enumerator()
60
+ {
61
+ PageEnumerator enumerator = new PageEnumerator(schema, pageConverter.get());
62
+ if (page.get() != null) {
63
+ enumerator.setPage(page.get());
64
+ }
65
+ return enumerator;
66
+ }
67
+ };
68
+ }
69
+ }
@@ -0,0 +1,45 @@
1
+ package org.embulk.filter.calcite.getter;
2
+
3
+ import org.embulk.input.jdbc.AbstractJdbcInputPlugin;
4
+ import org.embulk.input.jdbc.JdbcColumn;
5
+ import org.embulk.input.jdbc.JdbcColumnOption;
6
+ import org.embulk.input.jdbc.JdbcInputConnection;
7
+ import org.embulk.input.jdbc.getter.ColumnGetter;
8
+ import org.embulk.input.jdbc.getter.ColumnGetterFactory;
9
+ import org.embulk.spi.PageBuilder;
10
+ import org.embulk.spi.time.TimestampFormatter;
11
+ import org.embulk.spi.type.Type;
12
+ import org.joda.time.DateTimeZone;
13
+
14
+ public class FilterColumnGetterFactory
15
+ extends ColumnGetterFactory
16
+ {
17
+ private final DateTimeZone defaultTimeZone;
18
+
19
+ public FilterColumnGetterFactory(PageBuilder to, DateTimeZone defaultTimeZone)
20
+ {
21
+ super(to, defaultTimeZone);
22
+ this.defaultTimeZone = defaultTimeZone; // TODO make change super.defaultTimeZone field protected
23
+ }
24
+
25
+ @Override
26
+ public ColumnGetter newColumnGetter(JdbcInputConnection con, AbstractJdbcInputPlugin.PluginTask task, JdbcColumn column, JdbcColumnOption option)
27
+ {
28
+ String valueType = option.getValueType();
29
+ Type toType = getToType(option);
30
+ if (valueType.equals("coalesce") && sqlTypeToValueType(column, column.getSqlType()).equals("timestamp")) {
31
+ return new UTCTimestampColumnGetter(to, toType, newTimestampFormatter(option, "%Y-%m-%d"));
32
+ }
33
+ else {
34
+ return super.newColumnGetter(con, task, column, option);
35
+ }
36
+ }
37
+
38
+ private TimestampFormatter newTimestampFormatter(JdbcColumnOption option, String defaultTimestampFormat)
39
+ {
40
+ return new TimestampFormatter(
41
+ option.getJRuby(),
42
+ option.getTimestampFormat().isPresent() ? option.getTimestampFormat().get().getFormat() : defaultTimestampFormat,
43
+ option.getTimeZone().or(defaultTimeZone));
44
+ }
45
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.filter.calcite.getter;
2
+
3
+ import org.embulk.input.jdbc.getter.TimestampColumnGetter;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampFormatter;
7
+ import org.embulk.spi.type.Type;
8
+
9
+ import java.sql.ResultSet;
10
+ import java.sql.SQLException;
11
+ import java.util.Calendar;
12
+
13
+ import static java.util.Calendar.getInstance;
14
+ import static java.util.TimeZone.getTimeZone;
15
+
16
+ public class UTCTimestampColumnGetter
17
+ extends TimestampColumnGetter
18
+ {
19
+ private static ThreadLocal<Calendar> calendar = new ThreadLocal<Calendar>() {
20
+ @Override
21
+ protected Calendar initialValue()
22
+ {
23
+ return getInstance(getTimeZone("UTC"));
24
+ }
25
+ };
26
+
27
+ public UTCTimestampColumnGetter(PageBuilder to, Type toType, TimestampFormatter timestampFormatter)
28
+ {
29
+ super(to, toType, timestampFormatter);
30
+ }
31
+
32
+ @Override
33
+ protected void fetch(ResultSet from, int fromIndex)
34
+ throws SQLException
35
+ {
36
+ java.sql.Timestamp timestamp = from.getTimestamp(fromIndex, calendar.get());
37
+ if (timestamp != null) {
38
+ value = Timestamp.ofEpochSecond(timestamp.getTime() / 1000, timestamp.getNanos());
39
+ }
40
+ }
41
+ }
@@ -0,0 +1,96 @@
1
+ package org.embulk.filter.calcite;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.config.ConfigSource;
5
+ import org.embulk.spi.FilterPlugin;
6
+ import org.embulk.test.TestingEmbulk;
7
+ import org.junit.Before;
8
+ import org.junit.Rule;
9
+ import org.junit.Test;
10
+
11
+ import java.io.IOException;
12
+ import java.nio.file.Path;
13
+
14
+ import static org.embulk.test.EmbulkTests.copyResource;
15
+ import static org.embulk.test.EmbulkTests.readResource;
16
+ import static org.embulk.test.EmbulkTests.readSortedFile;
17
+ import static org.hamcrest.Matchers.is;
18
+ import static org.junit.Assert.assertThat;
19
+
20
+ public class TestCalciteFilterPlugin
21
+ {
22
+ private static final String RESOURCE_NAME_PREFIX = "org/embulk/filter/calcite/test/";
23
+
24
+ @Rule
25
+ public TestingEmbulk embulk = TestingEmbulk.builder()
26
+ .registerPlugin(FilterPlugin.class, "calcite", CalciteFilterPlugin.class)
27
+ .build();
28
+
29
+ private ConfigSource baseConfig;
30
+
31
+ @Before
32
+ public void setup()
33
+ {
34
+ baseConfig = embulk.newConfig();
35
+ }
36
+
37
+ @Test
38
+ public void testSimple() throws Exception
39
+ {
40
+ assertRecordsByResource(embulk, "test_simple_in.yml", "test_simple_filter.yml",
41
+ "test_simple_source.csv", "test_simple_expected.csv");
42
+ }
43
+
44
+ @Test
45
+ public void testIntOperators() throws Exception
46
+ {
47
+ assertRecordsByResource(embulk, "test_int_ops_in.yml", "test_int_ops_filter.yml",
48
+ "test_int_ops_source.csv", "test_int_ops_expected.csv");
49
+ }
50
+
51
+ @Test
52
+ public void testWhereIntCondition() throws Exception
53
+ {
54
+ assertRecordsByResource(embulk, "test_where_int_cond_in.yml", "test_where_int_cond_filter.yml",
55
+ "test_where_int_cond_source.csv", "test_where_int_cond_expected.csv");
56
+ }
57
+
58
+ @Test
59
+ public void testStringOperators() throws Exception
60
+ {
61
+ assertRecordsByResource(embulk, "test_string_ops_in.yml", "test_string_ops_filter.yml",
62
+ "test_string_ops_source.csv", "test_string_ops_expected.csv");
63
+ }
64
+
65
+ @Test
66
+ public void testWhereStringCondition() throws Exception
67
+ {
68
+ assertRecordsByResource(embulk, "test_where_string_cond_in.yml", "test_where_string_cond_filter.yml",
69
+ "test_where_string_cond_source.csv", "test_where_string_cond_expected.csv");
70
+ }
71
+
72
+ static void assertRecordsByResource(TestingEmbulk embulk,
73
+ String inConfigYamlResourceName, String filterConfigYamlResourceName,
74
+ String sourceCsvResourceName, String resultCsvResourceName)
75
+ throws IOException
76
+ {
77
+ Path inputPath = embulk.createTempFile("csv");
78
+ Path outputPath = embulk.createTempFile("csv");
79
+
80
+ // in: config
81
+ copyResource(RESOURCE_NAME_PREFIX + sourceCsvResourceName, inputPath);
82
+ ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
83
+ .set("path_prefix", inputPath.toAbsolutePath().toString());
84
+
85
+ // remove_columns filter config
86
+ ConfigSource filterConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + filterConfigYamlResourceName);
87
+
88
+ TestingEmbulk.RunResult result = embulk.inputBuilder()
89
+ .in(inConfig)
90
+ .filters(ImmutableList.of(filterConfig))
91
+ .outputPath(outputPath)
92
+ .run();
93
+
94
+ assertThat(readSortedFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
95
+ }
96
+ }