embulk-parser-avro 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -1
- data/build.gradle +5 -5
- data/example/example_nocolumns.yml +9 -0
- data/src/main/java/org/embulk/parser/avro/AvroParserPlugin.java +78 -3
- data/src/main/java/org/embulk/parser/avro/getter/ColumnGetterFactory.java +25 -22
- data/src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java +40 -0
- metadata +4 -4
- data/src/main/java/org/embulk/parser/avro/AvroColumnOption.java +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62671c1f3feefa2bb7feecccf4362b04f0ad7d3c
|
4
|
+
data.tar.gz: 68e76d9d06bcc2f30c3d6e376e1fe867027b6e09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79e628c578df06d5aa54e17a0038eeb65fa667318be469cdceab9161a4b40716634d1c9cbdd88d17ef47a8d85cf9d4d63939c6340b3efde74995070e3ad37758
|
7
|
+
data.tar.gz: d1120457b865feed3eab7560f3dff20ab0b767fe17653d6970626836460426135334f0f681f901405ce8107d4a0f3c3fb5aa14e302023132fbad48a22c538168
|
data/README.md
CHANGED
@@ -11,10 +11,12 @@
|
|
11
11
|
|
12
12
|
- **type**: Specify this parser as avro
|
13
13
|
- **avsc**: Specify avro schema file.
|
14
|
-
- **columns**: Specify column name and type. See below (array,
|
14
|
+
- **columns**: Specify column name and type. See below (array, optional)
|
15
15
|
* **default_timezone**: Default timezone of the timestamp (string, default: UTC)
|
16
16
|
* **default_timestamp_format**: Default timestamp format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
|
17
17
|
|
18
|
+
If columns is not set, this plugin detects the schema automatically from the avsc schema.
|
19
|
+
|
18
20
|
## Example
|
19
21
|
|
20
22
|
```yaml
|
data/build.gradle
CHANGED
@@ -13,19 +13,19 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.
|
16
|
+
version = "0.2.0"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
20
20
|
|
21
21
|
dependencies {
|
22
|
-
compile "org.embulk:embulk-core:0.8.
|
23
|
-
provided "org.embulk:embulk-core:0.8.
|
22
|
+
compile "org.embulk:embulk-core:0.8.14"
|
23
|
+
provided "org.embulk:embulk-core:0.8.14"
|
24
24
|
compile "org.apache.avro:avro:1.8.0"
|
25
25
|
testCompile "junit:junit:4.+"
|
26
26
|
|
27
|
-
testCompile "org.embulk:embulk-core:0.8.
|
28
|
-
testCompile "org.embulk:embulk-standards:0.8.
|
27
|
+
testCompile "org.embulk:embulk-core:0.8.14:tests"
|
28
|
+
testCompile "org.embulk:embulk-standards:0.8.14"
|
29
29
|
}
|
30
30
|
|
31
31
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
@@ -1,11 +1,14 @@
|
|
1
1
|
package org.embulk.parser.avro;
|
2
2
|
|
3
|
+
import com.google.common.collect.ImmutableList;
|
3
4
|
import com.google.common.collect.ImmutableMap;
|
4
5
|
import org.apache.avro.file.DataFileStream;
|
5
6
|
import org.apache.avro.generic.GenericDatumReader;
|
6
7
|
import org.apache.avro.generic.GenericRecord;
|
7
8
|
import org.apache.avro.io.DatumReader;
|
8
9
|
import org.embulk.config.Config;
|
10
|
+
import org.embulk.config.ConfigDefault;
|
11
|
+
import org.embulk.config.ConfigException;
|
9
12
|
import org.embulk.config.ConfigSource;
|
10
13
|
import org.embulk.config.Task;
|
11
14
|
import org.embulk.config.TaskSource;
|
@@ -20,6 +23,7 @@ import org.embulk.spi.ParserPlugin;
|
|
20
23
|
import org.embulk.spi.Schema;
|
21
24
|
import org.embulk.spi.SchemaConfig;
|
22
25
|
import org.embulk.spi.time.TimestampParser;
|
26
|
+
import org.embulk.spi.type.Types;
|
23
27
|
import org.embulk.spi.unit.LocalFile;
|
24
28
|
import org.embulk.spi.util.FileInputInputStream;
|
25
29
|
import org.embulk.spi.util.Timestamps;
|
@@ -35,6 +39,7 @@ public class AvroParserPlugin
|
|
35
39
|
extends Task, TimestampParser.Task
|
36
40
|
{
|
37
41
|
@Config("columns")
|
42
|
+
@ConfigDefault("[]")
|
38
43
|
public SchemaConfig getColumns();
|
39
44
|
|
40
45
|
@Config("avsc")
|
@@ -46,22 +51,92 @@ public class AvroParserPlugin
|
|
46
51
|
{
|
47
52
|
PluginTask task = config.loadConfig(PluginTask.class);
|
48
53
|
|
49
|
-
|
54
|
+
File avsc = task.getAvsc().getFile();
|
55
|
+
org.apache.avro.Schema avroSchema;
|
56
|
+
try {
|
57
|
+
avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
58
|
+
} catch (IOException e) {
|
59
|
+
throw new ConfigException("avsc file is not found");
|
60
|
+
}
|
61
|
+
|
62
|
+
Schema schema = buildSchema(task.getColumns(), avroSchema);
|
50
63
|
|
51
64
|
control.run(task.dump(), schema);
|
52
65
|
}
|
53
66
|
|
67
|
+
Schema buildSchema(SchemaConfig columns, org.apache.avro.Schema avroSchema) {
|
68
|
+
if (columns.size() > 0) {
|
69
|
+
return columns.toSchema();
|
70
|
+
} else {
|
71
|
+
int index = 0;
|
72
|
+
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
73
|
+
for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
|
74
|
+
String name = field.name();
|
75
|
+
|
76
|
+
org.apache.avro.Schema.Type avroType = null;
|
77
|
+
if (field.schema().getType() == org.apache.avro.Schema.Type.UNION) {
|
78
|
+
for (org.apache.avro.Schema sc : field.schema().getTypes()) {
|
79
|
+
if (sc.getType() != org.apache.avro.Schema.Type.NULL) {
|
80
|
+
avroType = sc.getType();
|
81
|
+
break;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
} else {
|
85
|
+
avroType = field.schema().getType();
|
86
|
+
}
|
87
|
+
switch (avroType) {
|
88
|
+
case STRING:
|
89
|
+
case BYTES:
|
90
|
+
case FIXED:
|
91
|
+
case ENUM:
|
92
|
+
case NULL:
|
93
|
+
builder.add(new Column(index, name, Types.STRING));
|
94
|
+
index++;
|
95
|
+
break;
|
96
|
+
case INT:
|
97
|
+
case LONG:
|
98
|
+
builder.add(new Column(index, name, Types.LONG));
|
99
|
+
index++;
|
100
|
+
break;
|
101
|
+
case FLOAT:
|
102
|
+
case DOUBLE:
|
103
|
+
builder.add(new Column(index, name, Types.DOUBLE));
|
104
|
+
index++;
|
105
|
+
break;
|
106
|
+
case BOOLEAN:
|
107
|
+
builder.add(new Column(index, name, Types.BOOLEAN));
|
108
|
+
index++;
|
109
|
+
break;
|
110
|
+
case MAP:
|
111
|
+
case ARRAY:
|
112
|
+
case RECORD:
|
113
|
+
builder.add(new Column(index, name, Types.JSON));
|
114
|
+
index++;
|
115
|
+
break;
|
116
|
+
default:
|
117
|
+
throw new RuntimeException("Unsupported type");
|
118
|
+
}
|
119
|
+
}
|
120
|
+
return new Schema(builder.build());
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
54
124
|
@Override
|
55
125
|
public void run(TaskSource taskSource, Schema schema,
|
56
126
|
FileInput input, PageOutput output)
|
57
127
|
{
|
58
128
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
59
|
-
File avsc = task.getAvsc().getFile();
|
60
129
|
List<Column> columns = schema.getColumns();
|
61
130
|
final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
|
131
|
+
File avsc = task.getAvsc().getFile();
|
132
|
+
final org.apache.avro.Schema avroSchema;
|
133
|
+
try {
|
134
|
+
avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
135
|
+
} catch (IOException e) {
|
136
|
+
throw new ConfigException("avsc file is not found");
|
137
|
+
}
|
62
138
|
|
63
139
|
try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
64
|
-
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
|
65
140
|
ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
|
66
141
|
ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
|
67
142
|
for (Column column : columns) {
|
@@ -1,5 +1,6 @@
|
|
1
1
|
package org.embulk.parser.avro.getter;
|
2
2
|
|
3
|
+
import org.apache.avro.Schema;
|
3
4
|
import org.embulk.spi.Column;
|
4
5
|
import org.embulk.spi.DataException;
|
5
6
|
import org.embulk.spi.PageBuilder;
|
@@ -20,44 +21,46 @@ public class ColumnGetterFactory {
|
|
20
21
|
public BaseColumnGetter newColumnGetter(Column column)
|
21
22
|
{
|
22
23
|
org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
|
23
|
-
switch (fieldSchema.getType()
|
24
|
-
case
|
25
|
-
|
26
|
-
for (org.apache.avro.Schema
|
27
|
-
if (
|
28
|
-
|
24
|
+
switch (fieldSchema.getType()) {
|
25
|
+
case UNION:
|
26
|
+
Schema.Type type = null;
|
27
|
+
for (org.apache.avro.Schema sc : fieldSchema.getTypes()) {
|
28
|
+
if (sc.getType() != Schema.Type.NULL) {
|
29
|
+
type = sc.getType();
|
29
30
|
break;
|
30
31
|
}
|
31
32
|
}
|
32
|
-
return getColumnGetterFromTypeName(
|
33
|
+
return getColumnGetterFromTypeName(type);
|
33
34
|
default :
|
34
|
-
return getColumnGetterFromTypeName(fieldSchema.getType()
|
35
|
+
return getColumnGetterFromTypeName(fieldSchema.getType());
|
35
36
|
}
|
36
37
|
}
|
37
38
|
|
38
|
-
private BaseColumnGetter getColumnGetterFromTypeName(
|
39
|
+
private BaseColumnGetter getColumnGetterFromTypeName(Schema.Type type)
|
39
40
|
{
|
40
|
-
switch (
|
41
|
-
case
|
42
|
-
case
|
41
|
+
switch (type) {
|
42
|
+
case STRING:
|
43
|
+
case ENUM:
|
43
44
|
return new StringColumnGetter(pageBuilder, timestampParsers);
|
44
|
-
case
|
45
|
+
case INT:
|
45
46
|
return new IntegerColumnGetter(pageBuilder, timestampParsers);
|
46
|
-
case
|
47
|
+
case LONG:
|
47
48
|
return new LongColumnGetter(pageBuilder, timestampParsers);
|
48
|
-
case
|
49
|
+
case FLOAT:
|
49
50
|
return new FloatColumnGetter(pageBuilder, timestampParsers);
|
50
|
-
case
|
51
|
+
case DOUBLE:
|
51
52
|
return new DoubleColumnGetter(pageBuilder, timestampParsers);
|
52
|
-
case
|
53
|
+
case BOOLEAN:
|
53
54
|
return new BooleanColumnGetter(pageBuilder, timestampParsers);
|
54
|
-
case
|
55
|
-
case
|
56
|
-
case
|
55
|
+
case ARRAY:
|
56
|
+
case MAP:
|
57
|
+
case RECORD:
|
57
58
|
return new GenericDataColumnGetter(pageBuilder, timestampParsers);
|
58
|
-
case
|
59
|
+
case NULL:
|
60
|
+
return new StringColumnGetter(pageBuilder, timestampParsers);
|
61
|
+
case BYTES:
|
59
62
|
default:
|
60
|
-
throw new DataException(String.format("%s is not supported",
|
63
|
+
throw new DataException(String.format("%s is not supported", type.getName()));
|
61
64
|
}
|
62
65
|
}
|
63
66
|
}
|
@@ -92,6 +92,46 @@ public class TestAvroParserPlugin
|
|
92
92
|
assertEquals("2016-05-08 19:35:25.952 UTC", record[11].toString());
|
93
93
|
}
|
94
94
|
|
95
|
+
@Test
|
96
|
+
public void useNoColumnsOption()
|
97
|
+
throws Exception
|
98
|
+
{
|
99
|
+
SchemaConfig schema = schema(
|
100
|
+
column("id", LONG),
|
101
|
+
column("code", LONG),
|
102
|
+
column("name", STRING),
|
103
|
+
column("description", STRING),
|
104
|
+
column("flag", BOOLEAN),
|
105
|
+
column("created_at", STRING),
|
106
|
+
column("created_at_utc", DOUBLE),
|
107
|
+
column("price", DOUBLE),
|
108
|
+
column("spec", JSON),
|
109
|
+
column("tags", JSON),
|
110
|
+
column("options", JSON),
|
111
|
+
column("item_type", STRING),
|
112
|
+
column("dummy", STRING)
|
113
|
+
);
|
114
|
+
|
115
|
+
ConfigSource config = this.config.deepCopy().set("avsc", this.getClass().getResource("item.avsc").getPath());
|
116
|
+
|
117
|
+
transaction(config, fileInput(new File(this.getClass().getResource("items.avro").getPath())));
|
118
|
+
|
119
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
120
|
+
assertEquals(6, records.size());
|
121
|
+
|
122
|
+
Object[] record = records.get(0);
|
123
|
+
assertEquals(1L, record[0]);
|
124
|
+
assertEquals(123456789012345678L, record[1]);
|
125
|
+
assertEquals("Desktop", record[2]);
|
126
|
+
assertEquals(true, record[4]);
|
127
|
+
assertEquals("D", record[11]);
|
128
|
+
assertEquals("[\"tag1\",\"tag2\"]", record[9].toString());
|
129
|
+
assertEquals("bar", ((MapValue)record[10]).map().get(ValueFactory.newString("foo")).toString());
|
130
|
+
assertEquals("opt1", ((MapValue)record[8]).map().get(ValueFactory.newString("key")).toString());
|
131
|
+
assertEquals("2016-05-09T04:35:43+09:00", record[5].toString());
|
132
|
+
assertNull(record[12]);
|
133
|
+
}
|
134
|
+
|
95
135
|
private void recreatePageOutput()
|
96
136
|
{
|
97
137
|
output = new MockPageOutput();
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-avro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- config/checkstyle/default.xml
|
54
54
|
- example/.gitignore
|
55
55
|
- example/example.yml
|
56
|
+
- example/example_nocolumns.yml
|
56
57
|
- example/generate.rb
|
57
58
|
- example/item.avsc
|
58
59
|
- example/items.avro
|
@@ -62,7 +63,6 @@ files:
|
|
62
63
|
- gradlew.bat
|
63
64
|
- lib/embulk/guess/avro.rb
|
64
65
|
- lib/embulk/parser/avro.rb
|
65
|
-
- src/main/java/org/embulk/parser/avro/AvroColumnOption.java
|
66
66
|
- src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
|
67
67
|
- src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java
|
68
68
|
- src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java
|
@@ -79,7 +79,7 @@ files:
|
|
79
79
|
- src/test/resources/org/embulk/parser/avro/items.avro
|
80
80
|
- classpath/avro-1.8.0.jar
|
81
81
|
- classpath/commons-compress-1.8.1.jar
|
82
|
-
- classpath/embulk-parser-avro-0.
|
82
|
+
- classpath/embulk-parser-avro-0.2.0.jar
|
83
83
|
- classpath/jackson-core-asl-1.9.13.jar
|
84
84
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
85
85
|
- classpath/paranamer-2.7.jar
|
@@ -1,16 +0,0 @@
|
|
1
|
-
package org.embulk.parser.avro;
|
2
|
-
|
3
|
-
import org.embulk.config.Config;
|
4
|
-
import org.embulk.config.ConfigDefault;
|
5
|
-
import org.embulk.config.Task;
|
6
|
-
import org.embulk.spi.type.Type;
|
7
|
-
|
8
|
-
import com.google.common.base.Optional;
|
9
|
-
|
10
|
-
public interface AvroColumnOption
|
11
|
-
extends Task
|
12
|
-
{
|
13
|
-
@Config("type")
|
14
|
-
@ConfigDefault("null")
|
15
|
-
Optional<Type> getType();
|
16
|
-
}
|