embulk-parser-avro 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +88 -0
  5. data/build.gradle +96 -0
  6. data/config/checkstyle/checkstyle.xml +128 -0
  7. data/config/checkstyle/default.xml +108 -0
  8. data/example/.gitignore +3 -0
  9. data/example/example.yml +22 -0
  10. data/example/generate.rb +94 -0
  11. data/example/item.avsc +27 -0
  12. data/example/items.avro +0 -0
  13. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  14. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  15. data/gradlew +160 -0
  16. data/gradlew.bat +90 -0
  17. data/lib/embulk/guess/avro.rb +61 -0
  18. data/lib/embulk/parser/avro.rb +3 -0
  19. data/src/main/java/org/embulk/parser/avro/AvroColumnOption.java +16 -0
  20. data/src/main/java/org/embulk/parser/avro/AvroParserPlugin.java +93 -0
  21. data/src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java +72 -0
  22. data/src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java +83 -0
  23. data/src/main/java/org/embulk/parser/avro/getter/BooleanColumnGetter.java +37 -0
  24. data/src/main/java/org/embulk/parser/avro/getter/ColumnGetterFactory.java +63 -0
  25. data/src/main/java/org/embulk/parser/avro/getter/DoubleColumnGetter.java +58 -0
  26. data/src/main/java/org/embulk/parser/avro/getter/FloatColumnGetter.java +58 -0
  27. data/src/main/java/org/embulk/parser/avro/getter/GenericDataColumnGetter.java +34 -0
  28. data/src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java +57 -0
  29. data/src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java +58 -0
  30. data/src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java +85 -0
  31. data/src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java +145 -0
  32. data/src/test/resources/org/embulk/parser/avro/item.avsc +27 -0
  33. data/src/test/resources/org/embulk/parser/avro/items.avro +0 -0
  34. metadata +112 -0
@@ -0,0 +1,3 @@
1
# Register the Java class org.embulk.parser.avro.AvroParserPlugin as the
# "avro" parser, passing the classpath directory resolved relative to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)
Embulk::JavaPlugin.register_parser("avro", "org.embulk.parser.avro.AvroParserPlugin", classpath_dir)
@@ -0,0 +1,16 @@
1
+ package org.embulk.parser.avro;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.Task;
6
+ import org.embulk.spi.type.Type;
7
+
8
+ import com.google.common.base.Optional;
9
+
10
+ public interface AvroColumnOption
11
+ extends Task
12
+ {
13
+ @Config("type")
14
+ @ConfigDefault("null")
15
+ Optional<Type> getType();
16
+ }
@@ -0,0 +1,93 @@
1
+ package org.embulk.parser.avro;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.apache.avro.file.DataFileStream;
5
+ import org.apache.avro.generic.GenericDatumReader;
6
+ import org.apache.avro.generic.GenericRecord;
7
+ import org.apache.avro.io.DatumReader;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.Task;
11
+ import org.embulk.config.TaskSource;
12
+ import org.embulk.parser.avro.getter.BaseColumnGetter;
13
+ import org.embulk.parser.avro.getter.ColumnGetterFactory;
14
+ import org.embulk.spi.Column;
15
+ import org.embulk.spi.Exec;
16
+ import org.embulk.spi.FileInput;
17
+ import org.embulk.spi.PageBuilder;
18
+ import org.embulk.spi.PageOutput;
19
+ import org.embulk.spi.ParserPlugin;
20
+ import org.embulk.spi.Schema;
21
+ import org.embulk.spi.SchemaConfig;
22
+ import org.embulk.spi.time.TimestampParser;
23
+ import org.embulk.spi.unit.LocalFile;
24
+ import org.embulk.spi.util.FileInputInputStream;
25
+ import org.embulk.spi.util.Timestamps;
26
+
27
+ import java.io.File;
28
+ import java.io.IOException;
29
+ import java.util.List;
30
+
31
+ public class AvroParserPlugin
32
+ implements ParserPlugin
33
+ {
34
+ public interface PluginTask
35
+ extends Task, TimestampParser.Task
36
+ {
37
+ @Config("columns")
38
+ public SchemaConfig getColumns();
39
+
40
+ @Config("avsc")
41
+ LocalFile getAvsc();
42
+ }
43
+
44
+ @Override
45
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
46
+ {
47
+ PluginTask task = config.loadConfig(PluginTask.class);
48
+
49
+ Schema schema = task.getColumns().toSchema();
50
+
51
+ control.run(task.dump(), schema);
52
+ }
53
+
54
+ @Override
55
+ public void run(TaskSource taskSource, Schema schema,
56
+ FileInput input, PageOutput output)
57
+ {
58
+ PluginTask task = taskSource.loadTask(PluginTask.class);
59
+ File avsc = task.getAvsc().getFile();
60
+ List<Column> columns = schema.getColumns();
61
+ final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
62
+
63
+ try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
64
+ org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
65
+ ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
66
+ ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
67
+ for (Column column : columns) {
68
+ BaseColumnGetter columnGetter = factory.newColumnGetter(column);
69
+ columnGettersBuilder.put(column.getName(), columnGetter);
70
+ }
71
+ ImmutableMap<String, BaseColumnGetter> columnGetters = columnGettersBuilder.build();
72
+ DatumReader<GenericRecord> reader = new GenericDatumReader<>(avroSchema);
73
+ GenericRecord record = null;
74
+ while (is.nextFile()) {
75
+ DataFileStream<GenericRecord> ds = new DataFileStream<>(is, reader);
76
+ while (ds.hasNext()) {
77
+ record = ds.next(record);
78
+ for (Column column : columns) {
79
+ BaseColumnGetter columnGetter = columnGetters.get(column.getName());
80
+ columnGetter.setValue(record.get(column.getName()));
81
+ column.visit(columnGetter);
82
+ }
83
+ pageBuilder.addRecord();
84
+ }
85
+ }
86
+
87
+ pageBuilder.finish();
88
+ }
89
+ catch (IOException e) {
90
+ throw new RuntimeException(e);
91
+ }
92
+ }
93
+ }
@@ -0,0 +1,72 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.apache.avro.Schema;
4
+ import org.apache.avro.generic.GenericData;
5
+ import org.apache.avro.util.Utf8;
6
+ import org.msgpack.value.Value;
7
+ import org.msgpack.value.ValueFactory;
8
+
9
+ import java.util.ArrayList;
10
+ import java.util.HashMap;
11
+ import java.util.List;
12
+ import java.util.Map;
13
+ import java.util.Set;
14
+
15
+ public class AvroGenericDataConverter {
16
+ public static Value convert(Object genericData)
17
+ {
18
+ return toValue(genericData);
19
+ }
20
+
21
+ private static Value toValue(Object rawValue)
22
+ {
23
+ if (rawValue instanceof Utf8) {
24
+ return ValueFactory.newString(rawValue.toString());
25
+ } else if (rawValue instanceof Integer) {
26
+ return ValueFactory.newInteger((Integer)rawValue);
27
+ } else if (rawValue instanceof Long) {
28
+ return ValueFactory.newInteger((Long)rawValue);
29
+ } else if (rawValue instanceof Float) {
30
+ return ValueFactory.newFloat((Float)rawValue);
31
+ } else if (rawValue instanceof Double) {
32
+ return ValueFactory.newFloat((Double)rawValue);
33
+ } else if (rawValue instanceof Boolean) {
34
+ return ValueFactory.newBoolean((Boolean)rawValue);
35
+ } else if (rawValue instanceof GenericData.EnumSymbol) {
36
+ return ValueFactory.newString(rawValue.toString());
37
+ } else if (rawValue instanceof GenericData.Array) {
38
+ List<Value> list = new ArrayList<>();
39
+ for (Object item : (GenericData.Array)rawValue) {
40
+ list.add(toValue(item));
41
+ }
42
+ return ValueFactory.newArray(list);
43
+ } else if (rawValue instanceof GenericData.Record) {
44
+ Map<Value, Value> map = new HashMap<>();
45
+ GenericData.Record casted = (GenericData.Record) rawValue;
46
+ for (Schema.Field field : casted.getSchema().getFields()) {
47
+ Object val = casted.get(field.name());
48
+ Value keyValue = ValueFactory.newString(field.name());
49
+ Value valValue = toValue(val);
50
+ map.put(keyValue, valValue);
51
+ }
52
+ return ValueFactory.newMap(map);
53
+ } else if (rawValue instanceof HashMap) {
54
+ Map<Value, Value> map = new HashMap<>();
55
+ HashMap casted = (HashMap) rawValue;
56
+ Set entries = casted.entrySet();
57
+ for (Object entry : entries) {
58
+ Map.Entry et = (Map.Entry) entry;
59
+ Utf8 key = (Utf8) (et.getKey());
60
+ Object val = et.getValue();
61
+ Value keyValue = toValue(key);
62
+ Value valValue = toValue(val);
63
+ map.put(keyValue, valValue);
64
+ }
65
+ return ValueFactory.newMap(map);
66
+ } else if (rawValue == null) {
67
+ return ValueFactory.newNil();
68
+ } else {
69
+ throw new RuntimeException("Unknown type");
70
+ }
71
+ }
72
+ }
@@ -0,0 +1,83 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.ColumnVisitor;
5
+ import org.embulk.spi.DataException;
6
+ import org.embulk.spi.PageBuilder;
7
+ import org.embulk.spi.time.TimestampParser;
8
+
9
+ public class BaseColumnGetter implements ColumnVisitor {
10
+ protected final PageBuilder pageBuilder;
11
+ protected final TimestampParser[] timestampParsers;
12
+ protected Object value;
13
+
14
+ public BaseColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
15
+ this.pageBuilder = pageBuilder;
16
+ this.timestampParsers = timestampParsers;
17
+ }
18
+
19
+ public void setValue(Object value)
20
+ {
21
+ this.value = value;
22
+ }
23
+
24
+ @Override
25
+ public void booleanColumn(Column column)
26
+ {
27
+ if (value == null) {
28
+ pageBuilder.setNull(column);
29
+ } else {
30
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
31
+ }
32
+ }
33
+
34
+ @Override
35
+ public void longColumn(Column column)
36
+ {
37
+ if (value == null) {
38
+ pageBuilder.setNull(column);
39
+ } else {
40
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
41
+ }
42
+ }
43
+
44
+ @Override
45
+ public void doubleColumn(Column column)
46
+ {
47
+ if (value == null) {
48
+ pageBuilder.setNull(column);
49
+ } else {
50
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
51
+ }
52
+ }
53
+
54
+ @Override
55
+ public void stringColumn(Column column)
56
+ {
57
+ if (value == null) {
58
+ pageBuilder.setNull(column);
59
+ } else {
60
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void timestampColumn(Column column)
66
+ {
67
+ if (value == null) {
68
+ pageBuilder.setNull(column);
69
+ } else {
70
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
71
+ }
72
+ }
73
+
74
+ @Override
75
+ public void jsonColumn(Column column)
76
+ {
77
+ if (value == null) {
78
+ pageBuilder.setNull(column);
79
+ } else {
80
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
81
+ }
82
+ }
83
+ }
@@ -0,0 +1,37 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.TimestampParser;
6
+
7
+ public class BooleanColumnGetter extends BaseColumnGetter {
8
+ protected Boolean value;
9
+
10
+ public BooleanColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
11
+ super(pageBuilder, timestampParsers);
12
+ }
13
+
14
+ @Override
15
+ public void setValue(Object value)
16
+ {
17
+ this.value = (Boolean) value;
18
+ }
19
+
20
+ @Override
21
+ public void booleanColumn(Column column) {
22
+ if (value == null) {
23
+ pageBuilder.setNull(column);
24
+ } else {
25
+ pageBuilder.setBoolean(column, value);
26
+ }
27
+ }
28
+
29
+ @Override
30
+ public void stringColumn(Column column) {
31
+ if (value == null) {
32
+ pageBuilder.setNull(column);
33
+ } else {
34
+ pageBuilder.setString(column, value.toString());
35
+ }
36
+ }
37
+ }
@@ -0,0 +1,63 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.PageBuilder;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class ColumnGetterFactory {
9
+ private org.apache.avro.Schema avroSchema;
10
+ private PageBuilder pageBuilder;
11
+ private TimestampParser[] timestampParsers;
12
+
13
+ public ColumnGetterFactory(org.apache.avro.Schema avroSchema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
14
+ {
15
+ this.avroSchema = avroSchema;
16
+ this.pageBuilder = pageBuilder;
17
+ this.timestampParsers = timestampParsers;
18
+ }
19
+
20
+ public BaseColumnGetter newColumnGetter(Column column)
21
+ {
22
+ org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
23
+ switch (fieldSchema.getType().getName()) {
24
+ case "union" :
25
+ String typeName = "";
26
+ for (org.apache.avro.Schema type : fieldSchema.getTypes()) {
27
+ if (!type.getName().equals("null")) {
28
+ typeName = type.getName();
29
+ break;
30
+ }
31
+ }
32
+ return getColumnGetterFromTypeName(typeName);
33
+ default :
34
+ return getColumnGetterFromTypeName(fieldSchema.getType().getName());
35
+ }
36
+ }
37
+
38
+ private BaseColumnGetter getColumnGetterFromTypeName(String typeName)
39
+ {
40
+ switch (typeName) {
41
+ case "string":
42
+ case "enum":
43
+ return new StringColumnGetter(pageBuilder, timestampParsers);
44
+ case "int":
45
+ return new IntegerColumnGetter(pageBuilder, timestampParsers);
46
+ case "long":
47
+ return new LongColumnGetter(pageBuilder, timestampParsers);
48
+ case "float":
49
+ return new FloatColumnGetter(pageBuilder, timestampParsers);
50
+ case "double":
51
+ return new DoubleColumnGetter(pageBuilder, timestampParsers);
52
+ case "boolean":
53
+ return new BooleanColumnGetter(pageBuilder, timestampParsers);
54
+ case "array":
55
+ case "map":
56
+ case "record":
57
+ return new GenericDataColumnGetter(pageBuilder, timestampParsers);
58
+ case "byte":
59
+ default:
60
+ throw new DataException(String.format("%s is not supported", typeName));
61
+ }
62
+ }
63
+ }
@@ -0,0 +1,58 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class DoubleColumnGetter extends BaseColumnGetter {
9
+ protected Double value;
10
+
11
+ public DoubleColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
12
+ super(pageBuilder, timestampParsers);
13
+ }
14
+
15
+ @Override
16
+ public void setValue(Object value)
17
+ {
18
+ this.value = (Double) value;
19
+ }
20
+
21
+ @Override
22
+ public void longColumn(Column column) {
23
+ if (value == null) {
24
+ pageBuilder.setNull(column);
25
+ } else {
26
+ pageBuilder.setLong(column, value.longValue());
27
+ }
28
+ }
29
+
30
+ @Override
31
+ public void doubleColumn(Column column) {
32
+ if (value == null) {
33
+ pageBuilder.setNull(column);
34
+ } else {
35
+ pageBuilder.setDouble(column, value);
36
+ }
37
+ }
38
+
39
+ @Override
40
+ public void stringColumn(Column column) {
41
+ if (value == null) {
42
+ pageBuilder.setNull(column);
43
+ } else {
44
+ pageBuilder.setString(column, value.toString());
45
+ }
46
+ }
47
+
48
+ @Override
49
+ public void timestampColumn(Column column) {
50
+ if (this.value == null) {
51
+ pageBuilder.setNull(column);
52
+ }
53
+ else {
54
+ long milliSec = (long) (value * 1000);
55
+ pageBuilder.setTimestamp(column, Timestamp.ofEpochMilli(milliSec));
56
+ }
57
+ }
58
+ }
@@ -0,0 +1,58 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class FloatColumnGetter extends BaseColumnGetter {
9
+ protected Float value;
10
+
11
+ public FloatColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
12
+ super(pageBuilder, timestampParsers);
13
+ }
14
+
15
+ @Override
16
+ public void setValue(Object value)
17
+ {
18
+ this.value = (Float) value;
19
+ }
20
+
21
+ @Override
22
+ public void longColumn(Column column) {
23
+ if (value == null) {
24
+ pageBuilder.setNull(column);
25
+ } else {
26
+ pageBuilder.setLong(column, value.longValue());
27
+ }
28
+ }
29
+
30
+ @Override
31
+ public void doubleColumn(Column column) {
32
+ if (value == null) {
33
+ pageBuilder.setNull(column);
34
+ } else {
35
+ pageBuilder.setDouble(column, value.doubleValue());
36
+ }
37
+ }
38
+
39
+ @Override
40
+ public void stringColumn(Column column) {
41
+ if (value == null) {
42
+ pageBuilder.setNull(column);
43
+ } else {
44
+ pageBuilder.setString(column, value.toString());
45
+ }
46
+ }
47
+
48
+ @Override
49
+ public void timestampColumn(Column column) {
50
+ if (this.value == null) {
51
+ pageBuilder.setNull(column);
52
+ }
53
+ else {
54
+ long milliSec = (long) (value * 1000);
55
+ pageBuilder.setTimestamp(column, Timestamp.ofEpochMilli(milliSec));
56
+ }
57
+ }
58
+ }