embulk-parser-avro 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 42ab545f7eca444d73944d0e1b86076582e109a3
4
- data.tar.gz: 297e047f1320b1e2d8c1e22b6ab24a2feed51a08
3
+ metadata.gz: 62671c1f3feefa2bb7feecccf4362b04f0ad7d3c
4
+ data.tar.gz: 68e76d9d06bcc2f30c3d6e376e1fe867027b6e09
5
5
  SHA512:
6
- metadata.gz: 73e60189bdb763f31f16330cd6c1cee0e3de1a27df254206d16b2276489263442ea87349a2e240c489a47d6d0d505c18b721e28d305064439ade530dc82e8b48
7
- data.tar.gz: 45b959df1da857a91ed391a8cdfbcc7200d1e55c8f6d3a821ed80a642c153a171babd4adc63cc345fbf64b5a172868b9ee92ff5e5f3d5856ad086c8c1b30c220
6
+ metadata.gz: 79e628c578df06d5aa54e17a0038eeb65fa667318be469cdceab9161a4b40716634d1c9cbdd88d17ef47a8d85cf9d4d63939c6340b3efde74995070e3ad37758
7
+ data.tar.gz: d1120457b865feed3eab7560f3dff20ab0b767fe17653d6970626836460426135334f0f681f901405ce8107d4a0f3c3fb5aa14e302023132fbad48a22c538168
data/README.md CHANGED
@@ -11,10 +11,12 @@
11
11
 
12
12
  - **type**: Specify this parser as avro
13
13
  - **avsc**: Specify avro schema file.
14
- - **columns**: Specify column name and type. See below (array, required)
14
+ - **columns**: Specify column name and type. See below (array, optional)
15
15
  * **default_timezone**: Default timezone of the timestamp (string, default: UTC)
16
16
  * **default_timestamp_format**: Default timestamp format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
17
17
 
18
+ If columns is not set, this plugin detects the schema automatically from the avsc schema.
19
+
18
20
  ## Example
19
21
 
20
22
  ```yaml
data/build.gradle CHANGED
@@ -13,19 +13,19 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.0"
16
+ version = "0.2.0"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
20
20
 
21
21
  dependencies {
22
- compile "org.embulk:embulk-core:0.8.8"
23
- provided "org.embulk:embulk-core:0.8.8"
22
+ compile "org.embulk:embulk-core:0.8.14"
23
+ provided "org.embulk:embulk-core:0.8.14"
24
24
  compile "org.apache.avro:avro:1.8.0"
25
25
  testCompile "junit:junit:4.+"
26
26
 
27
- testCompile "org.embulk:embulk-core:0.8.8:tests"
28
- testCompile "org.embulk:embulk-standards:0.8.8"
27
+ testCompile "org.embulk:embulk-core:0.8.14:tests"
28
+ testCompile "org.embulk:embulk-standards:0.8.14"
29
29
  }
30
30
 
31
31
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -0,0 +1,9 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "items"
4
+ parser:
5
+ type: avro
6
+ avsc : "./item.avsc"
7
+
8
+ out:
9
+ type: stdout
@@ -1,11 +1,14 @@
1
1
  package org.embulk.parser.avro;
2
2
 
3
+ import com.google.common.collect.ImmutableList;
3
4
  import com.google.common.collect.ImmutableMap;
4
5
  import org.apache.avro.file.DataFileStream;
5
6
  import org.apache.avro.generic.GenericDatumReader;
6
7
  import org.apache.avro.generic.GenericRecord;
7
8
  import org.apache.avro.io.DatumReader;
8
9
  import org.embulk.config.Config;
10
+ import org.embulk.config.ConfigDefault;
11
+ import org.embulk.config.ConfigException;
9
12
  import org.embulk.config.ConfigSource;
10
13
  import org.embulk.config.Task;
11
14
  import org.embulk.config.TaskSource;
@@ -20,6 +23,7 @@ import org.embulk.spi.ParserPlugin;
20
23
  import org.embulk.spi.Schema;
21
24
  import org.embulk.spi.SchemaConfig;
22
25
  import org.embulk.spi.time.TimestampParser;
26
+ import org.embulk.spi.type.Types;
23
27
  import org.embulk.spi.unit.LocalFile;
24
28
  import org.embulk.spi.util.FileInputInputStream;
25
29
  import org.embulk.spi.util.Timestamps;
@@ -35,6 +39,7 @@ public class AvroParserPlugin
35
39
  extends Task, TimestampParser.Task
36
40
  {
37
41
  @Config("columns")
42
+ @ConfigDefault("[]")
38
43
  public SchemaConfig getColumns();
39
44
 
40
45
  @Config("avsc")
@@ -46,22 +51,92 @@ public class AvroParserPlugin
46
51
  {
47
52
  PluginTask task = config.loadConfig(PluginTask.class);
48
53
 
49
- Schema schema = task.getColumns().toSchema();
54
+ File avsc = task.getAvsc().getFile();
55
+ org.apache.avro.Schema avroSchema;
56
+ try {
57
+ avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
58
+ } catch (IOException e) {
59
+ throw new ConfigException("avsc file is not found");
60
+ }
61
+
62
+ Schema schema = buildSchema(task.getColumns(), avroSchema);
50
63
 
51
64
  control.run(task.dump(), schema);
52
65
  }
53
66
 
67
+ Schema buildSchema(SchemaConfig columns, org.apache.avro.Schema avroSchema) {
68
+ if (columns.size() > 0) {
69
+ return columns.toSchema();
70
+ } else {
71
+ int index = 0;
72
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
73
+ for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
74
+ String name = field.name();
75
+
76
+ org.apache.avro.Schema.Type avroType = null;
77
+ if (field.schema().getType() == org.apache.avro.Schema.Type.UNION) {
78
+ for (org.apache.avro.Schema sc : field.schema().getTypes()) {
79
+ if (sc.getType() != org.apache.avro.Schema.Type.NULL) {
80
+ avroType = sc.getType();
81
+ break;
82
+ }
83
+ }
84
+ } else {
85
+ avroType = field.schema().getType();
86
+ }
87
+ switch (avroType) {
88
+ case STRING:
89
+ case BYTES:
90
+ case FIXED:
91
+ case ENUM:
92
+ case NULL:
93
+ builder.add(new Column(index, name, Types.STRING));
94
+ index++;
95
+ break;
96
+ case INT:
97
+ case LONG:
98
+ builder.add(new Column(index, name, Types.LONG));
99
+ index++;
100
+ break;
101
+ case FLOAT:
102
+ case DOUBLE:
103
+ builder.add(new Column(index, name, Types.DOUBLE));
104
+ index++;
105
+ break;
106
+ case BOOLEAN:
107
+ builder.add(new Column(index, name, Types.BOOLEAN));
108
+ index++;
109
+ break;
110
+ case MAP:
111
+ case ARRAY:
112
+ case RECORD:
113
+ builder.add(new Column(index, name, Types.JSON));
114
+ index++;
115
+ break;
116
+ default:
117
+ throw new RuntimeException("Unsupported type");
118
+ }
119
+ }
120
+ return new Schema(builder.build());
121
+ }
122
+ }
123
+
54
124
  @Override
55
125
  public void run(TaskSource taskSource, Schema schema,
56
126
  FileInput input, PageOutput output)
57
127
  {
58
128
  PluginTask task = taskSource.loadTask(PluginTask.class);
59
- File avsc = task.getAvsc().getFile();
60
129
  List<Column> columns = schema.getColumns();
61
130
  final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
131
+ File avsc = task.getAvsc().getFile();
132
+ final org.apache.avro.Schema avroSchema;
133
+ try {
134
+ avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
135
+ } catch (IOException e) {
136
+ throw new ConfigException("avsc file is not found");
137
+ }
62
138
 
63
139
  try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
64
- org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
65
140
  ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
66
141
  ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
67
142
  for (Column column : columns) {
@@ -1,5 +1,6 @@
1
1
  package org.embulk.parser.avro.getter;
2
2
 
3
+ import org.apache.avro.Schema;
3
4
  import org.embulk.spi.Column;
4
5
  import org.embulk.spi.DataException;
5
6
  import org.embulk.spi.PageBuilder;
@@ -20,44 +21,46 @@ public class ColumnGetterFactory {
20
21
  public BaseColumnGetter newColumnGetter(Column column)
21
22
  {
22
23
  org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
23
- switch (fieldSchema.getType().getName()) {
24
- case "union" :
25
- String typeName = "";
26
- for (org.apache.avro.Schema type : fieldSchema.getTypes()) {
27
- if (!type.getName().equals("null")) {
28
- typeName = type.getName();
24
+ switch (fieldSchema.getType()) {
25
+ case UNION:
26
+ Schema.Type type = null;
27
+ for (org.apache.avro.Schema sc : fieldSchema.getTypes()) {
28
+ if (sc.getType() != Schema.Type.NULL) {
29
+ type = sc.getType();
29
30
  break;
30
31
  }
31
32
  }
32
- return getColumnGetterFromTypeName(typeName);
33
+ return getColumnGetterFromTypeName(type);
33
34
  default :
34
- return getColumnGetterFromTypeName(fieldSchema.getType().getName());
35
+ return getColumnGetterFromTypeName(fieldSchema.getType());
35
36
  }
36
37
  }
37
38
 
38
- private BaseColumnGetter getColumnGetterFromTypeName(String typeName)
39
+ private BaseColumnGetter getColumnGetterFromTypeName(Schema.Type type)
39
40
  {
40
- switch (typeName) {
41
- case "string":
42
- case "enum":
41
+ switch (type) {
42
+ case STRING:
43
+ case ENUM:
43
44
  return new StringColumnGetter(pageBuilder, timestampParsers);
44
- case "int":
45
+ case INT:
45
46
  return new IntegerColumnGetter(pageBuilder, timestampParsers);
46
- case "long":
47
+ case LONG:
47
48
  return new LongColumnGetter(pageBuilder, timestampParsers);
48
- case "float":
49
+ case FLOAT:
49
50
  return new FloatColumnGetter(pageBuilder, timestampParsers);
50
- case "double":
51
+ case DOUBLE:
51
52
  return new DoubleColumnGetter(pageBuilder, timestampParsers);
52
- case "boolean":
53
+ case BOOLEAN:
53
54
  return new BooleanColumnGetter(pageBuilder, timestampParsers);
54
- case "array":
55
- case "map":
56
- case "record":
55
+ case ARRAY:
56
+ case MAP:
57
+ case RECORD:
57
58
  return new GenericDataColumnGetter(pageBuilder, timestampParsers);
58
- case "byte":
59
+ case NULL:
60
+ return new StringColumnGetter(pageBuilder, timestampParsers);
61
+ case BYTES:
59
62
  default:
60
- throw new DataException(String.format("%s is not supported", typeName));
63
+ throw new DataException(String.format("%s is not supported", type.getName()));
61
64
  }
62
65
  }
63
66
  }
@@ -92,6 +92,46 @@ public class TestAvroParserPlugin
92
92
  assertEquals("2016-05-08 19:35:25.952 UTC", record[11].toString());
93
93
  }
94
94
 
95
+ @Test
96
+ public void useNoColumnsOption()
97
+ throws Exception
98
+ {
99
+ SchemaConfig schema = schema(
100
+ column("id", LONG),
101
+ column("code", LONG),
102
+ column("name", STRING),
103
+ column("description", STRING),
104
+ column("flag", BOOLEAN),
105
+ column("created_at", STRING),
106
+ column("created_at_utc", DOUBLE),
107
+ column("price", DOUBLE),
108
+ column("spec", JSON),
109
+ column("tags", JSON),
110
+ column("options", JSON),
111
+ column("item_type", STRING),
112
+ column("dummy", STRING)
113
+ );
114
+
115
+ ConfigSource config = this.config.deepCopy().set("avsc", this.getClass().getResource("item.avsc").getPath());
116
+
117
+ transaction(config, fileInput(new File(this.getClass().getResource("items.avro").getPath())));
118
+
119
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
120
+ assertEquals(6, records.size());
121
+
122
+ Object[] record = records.get(0);
123
+ assertEquals(1L, record[0]);
124
+ assertEquals(123456789012345678L, record[1]);
125
+ assertEquals("Desktop", record[2]);
126
+ assertEquals(true, record[4]);
127
+ assertEquals("D", record[11]);
128
+ assertEquals("[\"tag1\",\"tag2\"]", record[9].toString());
129
+ assertEquals("bar", ((MapValue)record[10]).map().get(ValueFactory.newString("foo")).toString());
130
+ assertEquals("opt1", ((MapValue)record[8]).map().get(ValueFactory.newString("key")).toString());
131
+ assertEquals("2016-05-09T04:35:43+09:00", record[5].toString());
132
+ assertNull(record[12]);
133
+ }
134
+
95
135
  private void recreatePageOutput()
96
136
  {
97
137
  output = new MockPageOutput();
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-avro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-08 00:00:00.000000000 Z
11
+ date: 2016-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -53,6 +53,7 @@ files:
53
53
  - config/checkstyle/default.xml
54
54
  - example/.gitignore
55
55
  - example/example.yml
56
+ - example/example_nocolumns.yml
56
57
  - example/generate.rb
57
58
  - example/item.avsc
58
59
  - example/items.avro
@@ -62,7 +63,6 @@ files:
62
63
  - gradlew.bat
63
64
  - lib/embulk/guess/avro.rb
64
65
  - lib/embulk/parser/avro.rb
65
- - src/main/java/org/embulk/parser/avro/AvroColumnOption.java
66
66
  - src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
67
67
  - src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java
68
68
  - src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java
@@ -79,7 +79,7 @@ files:
79
79
  - src/test/resources/org/embulk/parser/avro/items.avro
80
80
  - classpath/avro-1.8.0.jar
81
81
  - classpath/commons-compress-1.8.1.jar
82
- - classpath/embulk-parser-avro-0.1.0.jar
82
+ - classpath/embulk-parser-avro-0.2.0.jar
83
83
  - classpath/jackson-core-asl-1.9.13.jar
84
84
  - classpath/jackson-mapper-asl-1.9.13.jar
85
85
  - classpath/paranamer-2.7.jar
@@ -1,16 +0,0 @@
1
- package org.embulk.parser.avro;
2
-
3
- import org.embulk.config.Config;
4
- import org.embulk.config.ConfigDefault;
5
- import org.embulk.config.Task;
6
- import org.embulk.spi.type.Type;
7
-
8
- import com.google.common.base.Optional;
9
-
10
- public interface AvroColumnOption
11
- extends Task
12
- {
13
- @Config("type")
14
- @ConfigDefault("null")
15
- Optional<Type> getType();
16
- }