embulk-parser-avro 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -1
- data/build.gradle +5 -5
- data/example/example_nocolumns.yml +9 -0
- data/src/main/java/org/embulk/parser/avro/AvroParserPlugin.java +78 -3
- data/src/main/java/org/embulk/parser/avro/getter/ColumnGetterFactory.java +25 -22
- data/src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java +40 -0
- metadata +4 -4
- data/src/main/java/org/embulk/parser/avro/AvroColumnOption.java +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62671c1f3feefa2bb7feecccf4362b04f0ad7d3c
|
4
|
+
data.tar.gz: 68e76d9d06bcc2f30c3d6e376e1fe867027b6e09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79e628c578df06d5aa54e17a0038eeb65fa667318be469cdceab9161a4b40716634d1c9cbdd88d17ef47a8d85cf9d4d63939c6340b3efde74995070e3ad37758
|
7
|
+
data.tar.gz: d1120457b865feed3eab7560f3dff20ab0b767fe17653d6970626836460426135334f0f681f901405ce8107d4a0f3c3fb5aa14e302023132fbad48a22c538168
|
data/README.md
CHANGED
@@ -11,10 +11,12 @@
|
|
11
11
|
|
12
12
|
- **type**: Specify this parser as avro
|
13
13
|
- **avsc**: Specify avro schema file.
|
14
|
-
- **columns**: Specify column name and type. See below (array, required)
|
14
|
+
- **columns**: Specify column name and type. See below (array, optional)
|
15
15
|
* **default_timezone**: Default timezone of the timestamp (string, default: UTC)
|
16
16
|
* **default_timestamp_format**: Default timestamp format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
|
17
17
|
|
18
|
+
If columns is not set, this plugin detects the schema automatically from the avsc schema.
|
19
|
+
|
18
20
|
## Example
|
19
21
|
|
20
22
|
```yaml
|
data/build.gradle
CHANGED
@@ -13,19 +13,19 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.0"
|
16
|
+
version = "0.2.0"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
20
20
|
|
21
21
|
dependencies {
|
22
|
-
compile "org.embulk:embulk-core:0.8.
|
23
|
-
provided "org.embulk:embulk-core:0.8.
|
22
|
+
compile "org.embulk:embulk-core:0.8.14"
|
23
|
+
provided "org.embulk:embulk-core:0.8.14"
|
24
24
|
compile "org.apache.avro:avro:1.8.0"
|
25
25
|
testCompile "junit:junit:4.+"
|
26
26
|
|
27
|
-
testCompile "org.embulk:embulk-core:0.8.
|
28
|
-
testCompile "org.embulk:embulk-standards:0.8.
|
27
|
+
testCompile "org.embulk:embulk-core:0.8.14:tests"
|
28
|
+
testCompile "org.embulk:embulk-standards:0.8.14"
|
29
29
|
}
|
30
30
|
|
31
31
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
@@ -1,11 +1,14 @@
|
|
1
1
|
package org.embulk.parser.avro;
|
2
2
|
|
3
|
+
import com.google.common.collect.ImmutableList;
|
3
4
|
import com.google.common.collect.ImmutableMap;
|
4
5
|
import org.apache.avro.file.DataFileStream;
|
5
6
|
import org.apache.avro.generic.GenericDatumReader;
|
6
7
|
import org.apache.avro.generic.GenericRecord;
|
7
8
|
import org.apache.avro.io.DatumReader;
|
8
9
|
import org.embulk.config.Config;
|
10
|
+
import org.embulk.config.ConfigDefault;
|
11
|
+
import org.embulk.config.ConfigException;
|
9
12
|
import org.embulk.config.ConfigSource;
|
10
13
|
import org.embulk.config.Task;
|
11
14
|
import org.embulk.config.TaskSource;
|
@@ -20,6 +23,7 @@ import org.embulk.spi.ParserPlugin;
|
|
20
23
|
import org.embulk.spi.Schema;
|
21
24
|
import org.embulk.spi.SchemaConfig;
|
22
25
|
import org.embulk.spi.time.TimestampParser;
|
26
|
+
import org.embulk.spi.type.Types;
|
23
27
|
import org.embulk.spi.unit.LocalFile;
|
24
28
|
import org.embulk.spi.util.FileInputInputStream;
|
25
29
|
import org.embulk.spi.util.Timestamps;
|
@@ -35,6 +39,7 @@ public class AvroParserPlugin
|
|
35
39
|
extends Task, TimestampParser.Task
|
36
40
|
{
|
37
41
|
@Config("columns")
|
42
|
+
@ConfigDefault("[]")
|
38
43
|
public SchemaConfig getColumns();
|
39
44
|
|
40
45
|
@Config("avsc")
|
@@ -46,22 +51,92 @@ public class AvroParserPlugin
|
|
46
51
|
{
|
47
52
|
PluginTask task = config.loadConfig(PluginTask.class);
|
48
53
|
|
49
|
-
|
54
|
+
File avsc = task.getAvsc().getFile();
|
55
|
+
org.apache.avro.Schema avroSchema;
|
56
|
+
try {
|
57
|
+
avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
58
|
+
} catch (IOException e) {
|
59
|
+
throw new ConfigException("avsc file is not found");
|
60
|
+
}
|
61
|
+
|
62
|
+
Schema schema = buildSchema(task.getColumns(), avroSchema);
|
50
63
|
|
51
64
|
control.run(task.dump(), schema);
|
52
65
|
}
|
53
66
|
|
67
|
+
Schema buildSchema(SchemaConfig columns, org.apache.avro.Schema avroSchema) {
|
68
|
+
if (columns.size() > 0) {
|
69
|
+
return columns.toSchema();
|
70
|
+
} else {
|
71
|
+
int index = 0;
|
72
|
+
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
73
|
+
for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
|
74
|
+
String name = field.name();
|
75
|
+
|
76
|
+
org.apache.avro.Schema.Type avroType = null;
|
77
|
+
if (field.schema().getType() == org.apache.avro.Schema.Type.UNION) {
|
78
|
+
for (org.apache.avro.Schema sc : field.schema().getTypes()) {
|
79
|
+
if (sc.getType() != org.apache.avro.Schema.Type.NULL) {
|
80
|
+
avroType = sc.getType();
|
81
|
+
break;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
} else {
|
85
|
+
avroType = field.schema().getType();
|
86
|
+
}
|
87
|
+
switch (avroType) {
|
88
|
+
case STRING:
|
89
|
+
case BYTES:
|
90
|
+
case FIXED:
|
91
|
+
case ENUM:
|
92
|
+
case NULL:
|
93
|
+
builder.add(new Column(index, name, Types.STRING));
|
94
|
+
index++;
|
95
|
+
break;
|
96
|
+
case INT:
|
97
|
+
case LONG:
|
98
|
+
builder.add(new Column(index, name, Types.LONG));
|
99
|
+
index++;
|
100
|
+
break;
|
101
|
+
case FLOAT:
|
102
|
+
case DOUBLE:
|
103
|
+
builder.add(new Column(index, name, Types.DOUBLE));
|
104
|
+
index++;
|
105
|
+
break;
|
106
|
+
case BOOLEAN:
|
107
|
+
builder.add(new Column(index, name, Types.BOOLEAN));
|
108
|
+
index++;
|
109
|
+
break;
|
110
|
+
case MAP:
|
111
|
+
case ARRAY:
|
112
|
+
case RECORD:
|
113
|
+
builder.add(new Column(index, name, Types.JSON));
|
114
|
+
index++;
|
115
|
+
break;
|
116
|
+
default:
|
117
|
+
throw new RuntimeException("Unsupported type");
|
118
|
+
}
|
119
|
+
}
|
120
|
+
return new Schema(builder.build());
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
54
124
|
@Override
|
55
125
|
public void run(TaskSource taskSource, Schema schema,
|
56
126
|
FileInput input, PageOutput output)
|
57
127
|
{
|
58
128
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
59
|
-
File avsc = task.getAvsc().getFile();
|
60
129
|
List<Column> columns = schema.getColumns();
|
61
130
|
final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
|
131
|
+
File avsc = task.getAvsc().getFile();
|
132
|
+
final org.apache.avro.Schema avroSchema;
|
133
|
+
try {
|
134
|
+
avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
135
|
+
} catch (IOException e) {
|
136
|
+
throw new ConfigException("avsc file is not found");
|
137
|
+
}
|
62
138
|
|
63
139
|
try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
64
|
-
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
65
140
|
ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
|
66
141
|
ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
|
67
142
|
for (Column column : columns) {
|
@@ -1,5 +1,6 @@
|
|
1
1
|
package org.embulk.parser.avro.getter;
|
2
2
|
|
3
|
+
import org.apache.avro.Schema;
|
3
4
|
import org.embulk.spi.Column;
|
4
5
|
import org.embulk.spi.DataException;
|
5
6
|
import org.embulk.spi.PageBuilder;
|
@@ -20,44 +21,46 @@ public class ColumnGetterFactory {
|
|
20
21
|
public BaseColumnGetter newColumnGetter(Column column)
|
21
22
|
{
|
22
23
|
org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
|
23
|
-
switch (fieldSchema.getType()
|
24
|
-
case
|
25
|
-
|
26
|
-
for (org.apache.avro.Schema
|
27
|
-
if (
|
28
|
-
|
24
|
+
switch (fieldSchema.getType()) {
|
25
|
+
case UNION:
|
26
|
+
Schema.Type type = null;
|
27
|
+
for (org.apache.avro.Schema sc : fieldSchema.getTypes()) {
|
28
|
+
if (sc.getType() != Schema.Type.NULL) {
|
29
|
+
type = sc.getType();
|
29
30
|
break;
|
30
31
|
}
|
31
32
|
}
|
32
|
-
return getColumnGetterFromTypeName(
|
33
|
+
return getColumnGetterFromTypeName(type);
|
33
34
|
default :
|
34
|
-
return getColumnGetterFromTypeName(fieldSchema.getType()
|
35
|
+
return getColumnGetterFromTypeName(fieldSchema.getType());
|
35
36
|
}
|
36
37
|
}
|
37
38
|
|
38
|
-
private BaseColumnGetter getColumnGetterFromTypeName(
|
39
|
+
private BaseColumnGetter getColumnGetterFromTypeName(Schema.Type type)
|
39
40
|
{
|
40
|
-
switch (
|
41
|
-
case
|
42
|
-
case
|
41
|
+
switch (type) {
|
42
|
+
case STRING:
|
43
|
+
case ENUM:
|
43
44
|
return new StringColumnGetter(pageBuilder, timestampParsers);
|
44
|
-
case
|
45
|
+
case INT:
|
45
46
|
return new IntegerColumnGetter(pageBuilder, timestampParsers);
|
46
|
-
case
|
47
|
+
case LONG:
|
47
48
|
return new LongColumnGetter(pageBuilder, timestampParsers);
|
48
|
-
case
|
49
|
+
case FLOAT:
|
49
50
|
return new FloatColumnGetter(pageBuilder, timestampParsers);
|
50
|
-
case
|
51
|
+
case DOUBLE:
|
51
52
|
return new DoubleColumnGetter(pageBuilder, timestampParsers);
|
52
|
-
case
|
53
|
+
case BOOLEAN:
|
53
54
|
return new BooleanColumnGetter(pageBuilder, timestampParsers);
|
54
|
-
case
|
55
|
-
case
|
56
|
-
case
|
55
|
+
case ARRAY:
|
56
|
+
case MAP:
|
57
|
+
case RECORD:
|
57
58
|
return new GenericDataColumnGetter(pageBuilder, timestampParsers);
|
58
|
-
case
|
59
|
+
case NULL:
|
60
|
+
return new StringColumnGetter(pageBuilder, timestampParsers);
|
61
|
+
case BYTES:
|
59
62
|
default:
|
60
|
-
throw new DataException(String.format("%s is not supported",
|
63
|
+
throw new DataException(String.format("%s is not supported", type.getName()));
|
61
64
|
}
|
62
65
|
}
|
63
66
|
}
|
@@ -92,6 +92,46 @@ public class TestAvroParserPlugin
|
|
92
92
|
assertEquals("2016-05-08 19:35:25.952 UTC", record[11].toString());
|
93
93
|
}
|
94
94
|
|
95
|
+
@Test
|
96
|
+
public void useNoColumnsOption()
|
97
|
+
throws Exception
|
98
|
+
{
|
99
|
+
SchemaConfig schema = schema(
|
100
|
+
column("id", LONG),
|
101
|
+
column("code", LONG),
|
102
|
+
column("name", STRING),
|
103
|
+
column("description", STRING),
|
104
|
+
column("flag", BOOLEAN),
|
105
|
+
column("created_at", STRING),
|
106
|
+
column("created_at_utc", DOUBLE),
|
107
|
+
column("price", DOUBLE),
|
108
|
+
column("spec", JSON),
|
109
|
+
column("tags", JSON),
|
110
|
+
column("options", JSON),
|
111
|
+
column("item_type", STRING),
|
112
|
+
column("dummy", STRING)
|
113
|
+
);
|
114
|
+
|
115
|
+
ConfigSource config = this.config.deepCopy().set("avsc", this.getClass().getResource("item.avsc").getPath());
|
116
|
+
|
117
|
+
transaction(config, fileInput(new File(this.getClass().getResource("items.avro").getPath())));
|
118
|
+
|
119
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
120
|
+
assertEquals(6, records.size());
|
121
|
+
|
122
|
+
Object[] record = records.get(0);
|
123
|
+
assertEquals(1L, record[0]);
|
124
|
+
assertEquals(123456789012345678L, record[1]);
|
125
|
+
assertEquals("Desktop", record[2]);
|
126
|
+
assertEquals(true, record[4]);
|
127
|
+
assertEquals("D", record[11]);
|
128
|
+
assertEquals("[\"tag1\",\"tag2\"]", record[9].toString());
|
129
|
+
assertEquals("bar", ((MapValue)record[10]).map().get(ValueFactory.newString("foo")).toString());
|
130
|
+
assertEquals("opt1", ((MapValue)record[8]).map().get(ValueFactory.newString("key")).toString());
|
131
|
+
assertEquals("2016-05-09T04:35:43+09:00", record[5].toString());
|
132
|
+
assertNull(record[12]);
|
133
|
+
}
|
134
|
+
|
95
135
|
private void recreatePageOutput()
|
96
136
|
{
|
97
137
|
output = new MockPageOutput();
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-avro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- config/checkstyle/default.xml
|
54
54
|
- example/.gitignore
|
55
55
|
- example/example.yml
|
56
|
+
- example/example_nocolumns.yml
|
56
57
|
- example/generate.rb
|
57
58
|
- example/item.avsc
|
58
59
|
- example/items.avro
|
@@ -62,7 +63,6 @@ files:
|
|
62
63
|
- gradlew.bat
|
63
64
|
- lib/embulk/guess/avro.rb
|
64
65
|
- lib/embulk/parser/avro.rb
|
65
|
-
- src/main/java/org/embulk/parser/avro/AvroColumnOption.java
|
66
66
|
- src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
|
67
67
|
- src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java
|
68
68
|
- src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java
|
@@ -79,7 +79,7 @@ files:
|
|
79
79
|
- src/test/resources/org/embulk/parser/avro/items.avro
|
80
80
|
- classpath/avro-1.8.0.jar
|
81
81
|
- classpath/commons-compress-1.8.1.jar
|
82
|
-
- classpath/embulk-parser-avro-0.1.0.jar
|
82
|
+
- classpath/embulk-parser-avro-0.2.0.jar
|
83
83
|
- classpath/jackson-core-asl-1.9.13.jar
|
84
84
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
85
85
|
- classpath/paranamer-2.7.jar
|
@@ -1,16 +0,0 @@
|
|
1
|
-
package org.embulk.parser.avro;
|
2
|
-
|
3
|
-
import org.embulk.config.Config;
|
4
|
-
import org.embulk.config.ConfigDefault;
|
5
|
-
import org.embulk.config.Task;
|
6
|
-
import org.embulk.spi.type.Type;
|
7
|
-
|
8
|
-
import com.google.common.base.Optional;
|
9
|
-
|
10
|
-
public interface AvroColumnOption
|
11
|
-
extends Task
|
12
|
-
{
|
13
|
-
@Config("type")
|
14
|
-
@ConfigDefault("null")
|
15
|
-
Optional<Type> getType();
|
16
|
-
}
|