embulk-parser-avro 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +88 -0
  5. data/build.gradle +96 -0
  6. data/config/checkstyle/checkstyle.xml +128 -0
  7. data/config/checkstyle/default.xml +108 -0
  8. data/example/.gitignore +3 -0
  9. data/example/example.yml +22 -0
  10. data/example/generate.rb +94 -0
  11. data/example/item.avsc +27 -0
  12. data/example/items.avro +0 -0
  13. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  14. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  15. data/gradlew +160 -0
  16. data/gradlew.bat +90 -0
  17. data/lib/embulk/guess/avro.rb +61 -0
  18. data/lib/embulk/parser/avro.rb +3 -0
  19. data/src/main/java/org/embulk/parser/avro/AvroColumnOption.java +16 -0
  20. data/src/main/java/org/embulk/parser/avro/AvroParserPlugin.java +93 -0
  21. data/src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java +72 -0
  22. data/src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java +83 -0
  23. data/src/main/java/org/embulk/parser/avro/getter/BooleanColumnGetter.java +37 -0
  24. data/src/main/java/org/embulk/parser/avro/getter/ColumnGetterFactory.java +63 -0
  25. data/src/main/java/org/embulk/parser/avro/getter/DoubleColumnGetter.java +58 -0
  26. data/src/main/java/org/embulk/parser/avro/getter/FloatColumnGetter.java +58 -0
  27. data/src/main/java/org/embulk/parser/avro/getter/GenericDataColumnGetter.java +34 -0
  28. data/src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java +57 -0
  29. data/src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java +58 -0
  30. data/src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java +85 -0
  31. data/src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java +145 -0
  32. data/src/test/resources/org/embulk/parser/avro/item.avsc +27 -0
  33. data/src/test/resources/org/embulk/parser/avro/items.avro +0 -0
  34. metadata +112 -0
@@ -0,0 +1,3 @@
1
# Registers the Java class org.embulk.parser.avro.AvroParserPlugin as the
# Embulk parser named "avro". The last argument is the 'classpath' path
# resolved relative to this file's location.
Embulk::JavaPlugin.register_parser(
  "avro",
  "org.embulk.parser.avro.AvroParserPlugin",
  File.expand_path('../../../../classpath', __FILE__)
)
@@ -0,0 +1,16 @@
1
+ package org.embulk.parser.avro;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.Task;
6
+ import org.embulk.spi.type.Type;
7
+
8
+ import com.google.common.base.Optional;
9
+
10
+ public interface AvroColumnOption
11
+ extends Task
12
+ {
13
+ @Config("type")
14
+ @ConfigDefault("null")
15
+ Optional<Type> getType();
16
+ }
@@ -0,0 +1,93 @@
1
+ package org.embulk.parser.avro;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.apache.avro.file.DataFileStream;
5
+ import org.apache.avro.generic.GenericDatumReader;
6
+ import org.apache.avro.generic.GenericRecord;
7
+ import org.apache.avro.io.DatumReader;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.Task;
11
+ import org.embulk.config.TaskSource;
12
+ import org.embulk.parser.avro.getter.BaseColumnGetter;
13
+ import org.embulk.parser.avro.getter.ColumnGetterFactory;
14
+ import org.embulk.spi.Column;
15
+ import org.embulk.spi.Exec;
16
+ import org.embulk.spi.FileInput;
17
+ import org.embulk.spi.PageBuilder;
18
+ import org.embulk.spi.PageOutput;
19
+ import org.embulk.spi.ParserPlugin;
20
+ import org.embulk.spi.Schema;
21
+ import org.embulk.spi.SchemaConfig;
22
+ import org.embulk.spi.time.TimestampParser;
23
+ import org.embulk.spi.unit.LocalFile;
24
+ import org.embulk.spi.util.FileInputInputStream;
25
+ import org.embulk.spi.util.Timestamps;
26
+
27
+ import java.io.File;
28
+ import java.io.IOException;
29
+ import java.util.List;
30
+
31
+ public class AvroParserPlugin
32
+ implements ParserPlugin
33
+ {
34
+ public interface PluginTask
35
+ extends Task, TimestampParser.Task
36
+ {
37
+ @Config("columns")
38
+ public SchemaConfig getColumns();
39
+
40
+ @Config("avsc")
41
+ LocalFile getAvsc();
42
+ }
43
+
44
+ @Override
45
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
46
+ {
47
+ PluginTask task = config.loadConfig(PluginTask.class);
48
+
49
+ Schema schema = task.getColumns().toSchema();
50
+
51
+ control.run(task.dump(), schema);
52
+ }
53
+
54
+ @Override
55
+ public void run(TaskSource taskSource, Schema schema,
56
+ FileInput input, PageOutput output)
57
+ {
58
+ PluginTask task = taskSource.loadTask(PluginTask.class);
59
+ File avsc = task.getAvsc().getFile();
60
+ List<Column> columns = schema.getColumns();
61
+ final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
62
+
63
+ try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
64
+ org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
65
+ ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
66
+ ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
67
+ for (Column column : columns) {
68
+ BaseColumnGetter columnGetter = factory.newColumnGetter(column);
69
+ columnGettersBuilder.put(column.getName(), columnGetter);
70
+ }
71
+ ImmutableMap<String, BaseColumnGetter> columnGetters = columnGettersBuilder.build();
72
+ DatumReader<GenericRecord> reader = new GenericDatumReader<>(avroSchema);
73
+ GenericRecord record = null;
74
+ while (is.nextFile()) {
75
+ DataFileStream<GenericRecord> ds = new DataFileStream<>(is, reader);
76
+ while (ds.hasNext()) {
77
+ record = ds.next(record);
78
+ for (Column column : columns) {
79
+ BaseColumnGetter columnGetter = columnGetters.get(column.getName());
80
+ columnGetter.setValue(record.get(column.getName()));
81
+ column.visit(columnGetter);
82
+ }
83
+ pageBuilder.addRecord();
84
+ }
85
+ }
86
+
87
+ pageBuilder.finish();
88
+ }
89
+ catch (IOException e) {
90
+ throw new RuntimeException(e);
91
+ }
92
+ }
93
+ }
@@ -0,0 +1,72 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.apache.avro.Schema;
4
+ import org.apache.avro.generic.GenericData;
5
+ import org.apache.avro.util.Utf8;
6
+ import org.msgpack.value.Value;
7
+ import org.msgpack.value.ValueFactory;
8
+
9
+ import java.util.ArrayList;
10
+ import java.util.HashMap;
11
+ import java.util.List;
12
+ import java.util.Map;
13
+ import java.util.Set;
14
+
15
+ public class AvroGenericDataConverter {
16
+ public static Value convert(Object genericData)
17
+ {
18
+ return toValue(genericData);
19
+ }
20
+
21
+ private static Value toValue(Object rawValue)
22
+ {
23
+ if (rawValue instanceof Utf8) {
24
+ return ValueFactory.newString(rawValue.toString());
25
+ } else if (rawValue instanceof Integer) {
26
+ return ValueFactory.newInteger((Integer)rawValue);
27
+ } else if (rawValue instanceof Long) {
28
+ return ValueFactory.newInteger((Long)rawValue);
29
+ } else if (rawValue instanceof Float) {
30
+ return ValueFactory.newFloat((Float)rawValue);
31
+ } else if (rawValue instanceof Double) {
32
+ return ValueFactory.newFloat((Double)rawValue);
33
+ } else if (rawValue instanceof Boolean) {
34
+ return ValueFactory.newBoolean((Boolean)rawValue);
35
+ } else if (rawValue instanceof GenericData.EnumSymbol) {
36
+ return ValueFactory.newString(rawValue.toString());
37
+ } else if (rawValue instanceof GenericData.Array) {
38
+ List<Value> list = new ArrayList<>();
39
+ for (Object item : (GenericData.Array)rawValue) {
40
+ list.add(toValue(item));
41
+ }
42
+ return ValueFactory.newArray(list);
43
+ } else if (rawValue instanceof GenericData.Record) {
44
+ Map<Value, Value> map = new HashMap<>();
45
+ GenericData.Record casted = (GenericData.Record) rawValue;
46
+ for (Schema.Field field : casted.getSchema().getFields()) {
47
+ Object val = casted.get(field.name());
48
+ Value keyValue = ValueFactory.newString(field.name());
49
+ Value valValue = toValue(val);
50
+ map.put(keyValue, valValue);
51
+ }
52
+ return ValueFactory.newMap(map);
53
+ } else if (rawValue instanceof HashMap) {
54
+ Map<Value, Value> map = new HashMap<>();
55
+ HashMap casted = (HashMap) rawValue;
56
+ Set entries = casted.entrySet();
57
+ for (Object entry : entries) {
58
+ Map.Entry et = (Map.Entry) entry;
59
+ Utf8 key = (Utf8) (et.getKey());
60
+ Object val = et.getValue();
61
+ Value keyValue = toValue(key);
62
+ Value valValue = toValue(val);
63
+ map.put(keyValue, valValue);
64
+ }
65
+ return ValueFactory.newMap(map);
66
+ } else if (rawValue == null) {
67
+ return ValueFactory.newNil();
68
+ } else {
69
+ throw new RuntimeException("Unknown type");
70
+ }
71
+ }
72
+ }
@@ -0,0 +1,83 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.ColumnVisitor;
5
+ import org.embulk.spi.DataException;
6
+ import org.embulk.spi.PageBuilder;
7
+ import org.embulk.spi.time.TimestampParser;
8
+
9
+ public class BaseColumnGetter implements ColumnVisitor {
10
+ protected final PageBuilder pageBuilder;
11
+ protected final TimestampParser[] timestampParsers;
12
+ protected Object value;
13
+
14
+ public BaseColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
15
+ this.pageBuilder = pageBuilder;
16
+ this.timestampParsers = timestampParsers;
17
+ }
18
+
19
+ public void setValue(Object value)
20
+ {
21
+ this.value = value;
22
+ }
23
+
24
+ @Override
25
+ public void booleanColumn(Column column)
26
+ {
27
+ if (value == null) {
28
+ pageBuilder.setNull(column);
29
+ } else {
30
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
31
+ }
32
+ }
33
+
34
+ @Override
35
+ public void longColumn(Column column)
36
+ {
37
+ if (value == null) {
38
+ pageBuilder.setNull(column);
39
+ } else {
40
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
41
+ }
42
+ }
43
+
44
+ @Override
45
+ public void doubleColumn(Column column)
46
+ {
47
+ if (value == null) {
48
+ pageBuilder.setNull(column);
49
+ } else {
50
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
51
+ }
52
+ }
53
+
54
+ @Override
55
+ public void stringColumn(Column column)
56
+ {
57
+ if (value == null) {
58
+ pageBuilder.setNull(column);
59
+ } else {
60
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void timestampColumn(Column column)
66
+ {
67
+ if (value == null) {
68
+ pageBuilder.setNull(column);
69
+ } else {
70
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
71
+ }
72
+ }
73
+
74
+ @Override
75
+ public void jsonColumn(Column column)
76
+ {
77
+ if (value == null) {
78
+ pageBuilder.setNull(column);
79
+ } else {
80
+ throw new DataException(String.format("cannot convert value from %s", column.getType()));
81
+ }
82
+ }
83
+ }
@@ -0,0 +1,37 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.TimestampParser;
6
+
7
+ public class BooleanColumnGetter extends BaseColumnGetter {
8
+ protected Boolean value;
9
+
10
+ public BooleanColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
11
+ super(pageBuilder, timestampParsers);
12
+ }
13
+
14
+ @Override
15
+ public void setValue(Object value)
16
+ {
17
+ this.value = (Boolean) value;
18
+ }
19
+
20
+ @Override
21
+ public void booleanColumn(Column column) {
22
+ if (value == null) {
23
+ pageBuilder.setNull(column);
24
+ } else {
25
+ pageBuilder.setBoolean(column, value);
26
+ }
27
+ }
28
+
29
+ @Override
30
+ public void stringColumn(Column column) {
31
+ if (value == null) {
32
+ pageBuilder.setNull(column);
33
+ } else {
34
+ pageBuilder.setString(column, value.toString());
35
+ }
36
+ }
37
+ }
@@ -0,0 +1,63 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.PageBuilder;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class ColumnGetterFactory {
9
+ private org.apache.avro.Schema avroSchema;
10
+ private PageBuilder pageBuilder;
11
+ private TimestampParser[] timestampParsers;
12
+
13
+ public ColumnGetterFactory(org.apache.avro.Schema avroSchema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
14
+ {
15
+ this.avroSchema = avroSchema;
16
+ this.pageBuilder = pageBuilder;
17
+ this.timestampParsers = timestampParsers;
18
+ }
19
+
20
+ public BaseColumnGetter newColumnGetter(Column column)
21
+ {
22
+ org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
23
+ switch (fieldSchema.getType().getName()) {
24
+ case "union" :
25
+ String typeName = "";
26
+ for (org.apache.avro.Schema type : fieldSchema.getTypes()) {
27
+ if (!type.getName().equals("null")) {
28
+ typeName = type.getName();
29
+ break;
30
+ }
31
+ }
32
+ return getColumnGetterFromTypeName(typeName);
33
+ default :
34
+ return getColumnGetterFromTypeName(fieldSchema.getType().getName());
35
+ }
36
+ }
37
+
38
+ private BaseColumnGetter getColumnGetterFromTypeName(String typeName)
39
+ {
40
+ switch (typeName) {
41
+ case "string":
42
+ case "enum":
43
+ return new StringColumnGetter(pageBuilder, timestampParsers);
44
+ case "int":
45
+ return new IntegerColumnGetter(pageBuilder, timestampParsers);
46
+ case "long":
47
+ return new LongColumnGetter(pageBuilder, timestampParsers);
48
+ case "float":
49
+ return new FloatColumnGetter(pageBuilder, timestampParsers);
50
+ case "double":
51
+ return new DoubleColumnGetter(pageBuilder, timestampParsers);
52
+ case "boolean":
53
+ return new BooleanColumnGetter(pageBuilder, timestampParsers);
54
+ case "array":
55
+ case "map":
56
+ case "record":
57
+ return new GenericDataColumnGetter(pageBuilder, timestampParsers);
58
+ case "byte":
59
+ default:
60
+ throw new DataException(String.format("%s is not supported", typeName));
61
+ }
62
+ }
63
+ }
@@ -0,0 +1,58 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class DoubleColumnGetter extends BaseColumnGetter {
9
+ protected Double value;
10
+
11
+ public DoubleColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
12
+ super(pageBuilder, timestampParsers);
13
+ }
14
+
15
+ @Override
16
+ public void setValue(Object value)
17
+ {
18
+ this.value = (Double) value;
19
+ }
20
+
21
+ @Override
22
+ public void longColumn(Column column) {
23
+ if (value == null) {
24
+ pageBuilder.setNull(column);
25
+ } else {
26
+ pageBuilder.setLong(column, value.longValue());
27
+ }
28
+ }
29
+
30
+ @Override
31
+ public void doubleColumn(Column column) {
32
+ if (value == null) {
33
+ pageBuilder.setNull(column);
34
+ } else {
35
+ pageBuilder.setDouble(column, value);
36
+ }
37
+ }
38
+
39
+ @Override
40
+ public void stringColumn(Column column) {
41
+ if (value == null) {
42
+ pageBuilder.setNull(column);
43
+ } else {
44
+ pageBuilder.setString(column, value.toString());
45
+ }
46
+ }
47
+
48
+ @Override
49
+ public void timestampColumn(Column column) {
50
+ if (this.value == null) {
51
+ pageBuilder.setNull(column);
52
+ }
53
+ else {
54
+ long milliSec = (long) (value * 1000);
55
+ pageBuilder.setTimestamp(column, Timestamp.ofEpochMilli(milliSec));
56
+ }
57
+ }
58
+ }
@@ -0,0 +1,58 @@
1
+ package org.embulk.parser.avro.getter;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+
8
+ public class FloatColumnGetter extends BaseColumnGetter {
9
+ protected Float value;
10
+
11
+ public FloatColumnGetter(PageBuilder pageBuilder, TimestampParser[] timestampParsers) {
12
+ super(pageBuilder, timestampParsers);
13
+ }
14
+
15
+ @Override
16
+ public void setValue(Object value)
17
+ {
18
+ this.value = (Float) value;
19
+ }
20
+
21
+ @Override
22
+ public void longColumn(Column column) {
23
+ if (value == null) {
24
+ pageBuilder.setNull(column);
25
+ } else {
26
+ pageBuilder.setLong(column, value.longValue());
27
+ }
28
+ }
29
+
30
+ @Override
31
+ public void doubleColumn(Column column) {
32
+ if (value == null) {
33
+ pageBuilder.setNull(column);
34
+ } else {
35
+ pageBuilder.setDouble(column, value.doubleValue());
36
+ }
37
+ }
38
+
39
+ @Override
40
+ public void stringColumn(Column column) {
41
+ if (value == null) {
42
+ pageBuilder.setNull(column);
43
+ } else {
44
+ pageBuilder.setString(column, value.toString());
45
+ }
46
+ }
47
+
48
+ @Override
49
+ public void timestampColumn(Column column) {
50
+ if (this.value == null) {
51
+ pageBuilder.setNull(column);
52
+ }
53
+ else {
54
+ long milliSec = (long) (value * 1000);
55
+ pageBuilder.setTimestamp(column, Timestamp.ofEpochMilli(milliSec));
56
+ }
57
+ }
58
+ }