embulk-parser-avro 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 42ab545f7eca444d73944d0e1b86076582e109a3
4
- data.tar.gz: 297e047f1320b1e2d8c1e22b6ab24a2feed51a08
3
+ metadata.gz: 62671c1f3feefa2bb7feecccf4362b04f0ad7d3c
4
+ data.tar.gz: 68e76d9d06bcc2f30c3d6e376e1fe867027b6e09
5
5
  SHA512:
6
- metadata.gz: 73e60189bdb763f31f16330cd6c1cee0e3de1a27df254206d16b2276489263442ea87349a2e240c489a47d6d0d505c18b721e28d305064439ade530dc82e8b48
7
- data.tar.gz: 45b959df1da857a91ed391a8cdfbcc7200d1e55c8f6d3a821ed80a642c153a171babd4adc63cc345fbf64b5a172868b9ee92ff5e5f3d5856ad086c8c1b30c220
6
+ metadata.gz: 79e628c578df06d5aa54e17a0038eeb65fa667318be469cdceab9161a4b40716634d1c9cbdd88d17ef47a8d85cf9d4d63939c6340b3efde74995070e3ad37758
7
+ data.tar.gz: d1120457b865feed3eab7560f3dff20ab0b767fe17653d6970626836460426135334f0f681f901405ce8107d4a0f3c3fb5aa14e302023132fbad48a22c538168
data/README.md CHANGED
@@ -11,10 +11,12 @@
11
11
 
12
12
  - **type**: Specify this parser as avro
13
13
  - **avsc**: Specify avro schema file.
14
- - **columns**: Specify column name and type. See below (array, required)
14
+ - **columns**: Specify column name and type. See below (array, optional)
15
15
  * **default_timezone**: Default timezone of the timestamp (string, default: UTC)
16
16
  * **default_timestamp_format**: Default timestamp format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
17
17
 
18
+ If columns is not set, this plugin detects the schema automatically from the avsc schema.
19
+
18
20
  ## Example
19
21
 
20
22
  ```yaml
data/build.gradle CHANGED
@@ -13,19 +13,19 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.0"
16
+ version = "0.2.0"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
20
20
 
21
21
  dependencies {
22
- compile "org.embulk:embulk-core:0.8.8"
23
- provided "org.embulk:embulk-core:0.8.8"
22
+ compile "org.embulk:embulk-core:0.8.14"
23
+ provided "org.embulk:embulk-core:0.8.14"
24
24
  compile "org.apache.avro:avro:1.8.0"
25
25
  testCompile "junit:junit:4.+"
26
26
 
27
- testCompile "org.embulk:embulk-core:0.8.8:tests"
28
- testCompile "org.embulk:embulk-standards:0.8.8"
27
+ testCompile "org.embulk:embulk-core:0.8.14:tests"
28
+ testCompile "org.embulk:embulk-standards:0.8.14"
29
29
  }
30
30
 
31
31
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -0,0 +1,9 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "items"
4
+ parser:
5
+ type: avro
6
+ avsc : "./item.avsc"
7
+
8
+ out:
9
+ type: stdout
@@ -1,11 +1,14 @@
1
1
  package org.embulk.parser.avro;
2
2
 
3
+ import com.google.common.collect.ImmutableList;
3
4
  import com.google.common.collect.ImmutableMap;
4
5
  import org.apache.avro.file.DataFileStream;
5
6
  import org.apache.avro.generic.GenericDatumReader;
6
7
  import org.apache.avro.generic.GenericRecord;
7
8
  import org.apache.avro.io.DatumReader;
8
9
  import org.embulk.config.Config;
10
+ import org.embulk.config.ConfigDefault;
11
+ import org.embulk.config.ConfigException;
9
12
  import org.embulk.config.ConfigSource;
10
13
  import org.embulk.config.Task;
11
14
  import org.embulk.config.TaskSource;
@@ -20,6 +23,7 @@ import org.embulk.spi.ParserPlugin;
20
23
  import org.embulk.spi.Schema;
21
24
  import org.embulk.spi.SchemaConfig;
22
25
  import org.embulk.spi.time.TimestampParser;
26
+ import org.embulk.spi.type.Types;
23
27
  import org.embulk.spi.unit.LocalFile;
24
28
  import org.embulk.spi.util.FileInputInputStream;
25
29
  import org.embulk.spi.util.Timestamps;
@@ -35,6 +39,7 @@ public class AvroParserPlugin
35
39
  extends Task, TimestampParser.Task
36
40
  {
37
41
  @Config("columns")
42
+ @ConfigDefault("[]")
38
43
  public SchemaConfig getColumns();
39
44
 
40
45
  @Config("avsc")
@@ -46,22 +51,92 @@ public class AvroParserPlugin
46
51
  {
47
52
  PluginTask task = config.loadConfig(PluginTask.class);
48
53
 
49
- Schema schema = task.getColumns().toSchema();
54
+ File avsc = task.getAvsc().getFile();
55
+ org.apache.avro.Schema avroSchema;
56
+ try {
57
+ avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
58
+ } catch (IOException e) {
59
+ throw new ConfigException("avsc file is not found");
60
+ }
61
+
62
+ Schema schema = buildSchema(task.getColumns(), avroSchema);
50
63
 
51
64
  control.run(task.dump(), schema);
52
65
  }
53
66
 
67
+ Schema buildSchema(SchemaConfig columns, org.apache.avro.Schema avroSchema) {
68
+ if (columns.size() > 0) {
69
+ return columns.toSchema();
70
+ } else {
71
+ int index = 0;
72
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
73
+ for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
74
+ String name = field.name();
75
+
76
+ org.apache.avro.Schema.Type avroType = null;
77
+ if (field.schema().getType() == org.apache.avro.Schema.Type.UNION) {
78
+ for (org.apache.avro.Schema sc : field.schema().getTypes()) {
79
+ if (sc.getType() != org.apache.avro.Schema.Type.NULL) {
80
+ avroType = sc.getType();
81
+ break;
82
+ }
83
+ }
84
+ } else {
85
+ avroType = field.schema().getType();
86
+ }
87
+ switch (avroType) {
88
+ case STRING:
89
+ case BYTES:
90
+ case FIXED:
91
+ case ENUM:
92
+ case NULL:
93
+ builder.add(new Column(index, name, Types.STRING));
94
+ index++;
95
+ break;
96
+ case INT:
97
+ case LONG:
98
+ builder.add(new Column(index, name, Types.LONG));
99
+ index++;
100
+ break;
101
+ case FLOAT:
102
+ case DOUBLE:
103
+ builder.add(new Column(index, name, Types.DOUBLE));
104
+ index++;
105
+ break;
106
+ case BOOLEAN:
107
+ builder.add(new Column(index, name, Types.BOOLEAN));
108
+ index++;
109
+ break;
110
+ case MAP:
111
+ case ARRAY:
112
+ case RECORD:
113
+ builder.add(new Column(index, name, Types.JSON));
114
+ index++;
115
+ break;
116
+ default:
117
+ throw new RuntimeException("Unsupported type");
118
+ }
119
+ }
120
+ return new Schema(builder.build());
121
+ }
122
+ }
123
+
54
124
  @Override
55
125
  public void run(TaskSource taskSource, Schema schema,
56
126
  FileInput input, PageOutput output)
57
127
  {
58
128
  PluginTask task = taskSource.loadTask(PluginTask.class);
59
- File avsc = task.getAvsc().getFile();
60
129
  List<Column> columns = schema.getColumns();
61
130
  final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
131
+ File avsc = task.getAvsc().getFile();
132
+ final org.apache.avro.Schema avroSchema;
133
+ try {
134
+ avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
135
+ } catch (IOException e) {
136
+ throw new ConfigException("avsc file is not found");
137
+ }
62
138
 
63
139
  try (FileInputInputStream is = new FileInputInputStream(input); final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
64
- org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(avsc);
65
140
  ColumnGetterFactory factory = new ColumnGetterFactory(avroSchema, pageBuilder, timestampParsers);
66
141
  ImmutableMap.Builder<String, BaseColumnGetter> columnGettersBuilder = ImmutableMap.builder();
67
142
  for (Column column : columns) {
@@ -1,5 +1,6 @@
1
1
  package org.embulk.parser.avro.getter;
2
2
 
3
+ import org.apache.avro.Schema;
3
4
  import org.embulk.spi.Column;
4
5
  import org.embulk.spi.DataException;
5
6
  import org.embulk.spi.PageBuilder;
@@ -20,44 +21,46 @@ public class ColumnGetterFactory {
20
21
  public BaseColumnGetter newColumnGetter(Column column)
21
22
  {
22
23
  org.apache.avro.Schema fieldSchema = avroSchema.getField(column.getName()).schema();
23
- switch (fieldSchema.getType().getName()) {
24
- case "union" :
25
- String typeName = "";
26
- for (org.apache.avro.Schema type : fieldSchema.getTypes()) {
27
- if (!type.getName().equals("null")) {
28
- typeName = type.getName();
24
+ switch (fieldSchema.getType()) {
25
+ case UNION:
26
+ Schema.Type type = null;
27
+ for (org.apache.avro.Schema sc : fieldSchema.getTypes()) {
28
+ if (sc.getType() != Schema.Type.NULL) {
29
+ type = sc.getType();
29
30
  break;
30
31
  }
31
32
  }
32
- return getColumnGetterFromTypeName(typeName);
33
+ return getColumnGetterFromTypeName(type);
33
34
  default :
34
- return getColumnGetterFromTypeName(fieldSchema.getType().getName());
35
+ return getColumnGetterFromTypeName(fieldSchema.getType());
35
36
  }
36
37
  }
37
38
 
38
- private BaseColumnGetter getColumnGetterFromTypeName(String typeName)
39
+ private BaseColumnGetter getColumnGetterFromTypeName(Schema.Type type)
39
40
  {
40
- switch (typeName) {
41
- case "string":
42
- case "enum":
41
+ switch (type) {
42
+ case STRING:
43
+ case ENUM:
43
44
  return new StringColumnGetter(pageBuilder, timestampParsers);
44
- case "int":
45
+ case INT:
45
46
  return new IntegerColumnGetter(pageBuilder, timestampParsers);
46
- case "long":
47
+ case LONG:
47
48
  return new LongColumnGetter(pageBuilder, timestampParsers);
48
- case "float":
49
+ case FLOAT:
49
50
  return new FloatColumnGetter(pageBuilder, timestampParsers);
50
- case "double":
51
+ case DOUBLE:
51
52
  return new DoubleColumnGetter(pageBuilder, timestampParsers);
52
- case "boolean":
53
+ case BOOLEAN:
53
54
  return new BooleanColumnGetter(pageBuilder, timestampParsers);
54
- case "array":
55
- case "map":
56
- case "record":
55
+ case ARRAY:
56
+ case MAP:
57
+ case RECORD:
57
58
  return new GenericDataColumnGetter(pageBuilder, timestampParsers);
58
- case "byte":
59
+ case NULL:
60
+ return new StringColumnGetter(pageBuilder, timestampParsers);
61
+ case BYTES:
59
62
  default:
60
- throw new DataException(String.format("%s is not supported", typeName));
63
+ throw new DataException(String.format("%s is not supported", type.getName()));
61
64
  }
62
65
  }
63
66
  }
@@ -92,6 +92,46 @@ public class TestAvroParserPlugin
92
92
  assertEquals("2016-05-08 19:35:25.952 UTC", record[11].toString());
93
93
  }
94
94
 
95
+ @Test
96
+ public void useNoColumnsOption()
97
+ throws Exception
98
+ {
99
+ SchemaConfig schema = schema(
100
+ column("id", LONG),
101
+ column("code", LONG),
102
+ column("name", STRING),
103
+ column("description", STRING),
104
+ column("flag", BOOLEAN),
105
+ column("created_at", STRING),
106
+ column("created_at_utc", DOUBLE),
107
+ column("price", DOUBLE),
108
+ column("spec", JSON),
109
+ column("tags", JSON),
110
+ column("options", JSON),
111
+ column("item_type", STRING),
112
+ column("dummy", STRING)
113
+ );
114
+
115
+ ConfigSource config = this.config.deepCopy().set("avsc", this.getClass().getResource("item.avsc").getPath());
116
+
117
+ transaction(config, fileInput(new File(this.getClass().getResource("items.avro").getPath())));
118
+
119
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
120
+ assertEquals(6, records.size());
121
+
122
+ Object[] record = records.get(0);
123
+ assertEquals(1L, record[0]);
124
+ assertEquals(123456789012345678L, record[1]);
125
+ assertEquals("Desktop", record[2]);
126
+ assertEquals(true, record[4]);
127
+ assertEquals("D", record[11]);
128
+ assertEquals("[\"tag1\",\"tag2\"]", record[9].toString());
129
+ assertEquals("bar", ((MapValue)record[10]).map().get(ValueFactory.newString("foo")).toString());
130
+ assertEquals("opt1", ((MapValue)record[8]).map().get(ValueFactory.newString("key")).toString());
131
+ assertEquals("2016-05-09T04:35:43+09:00", record[5].toString());
132
+ assertNull(record[12]);
133
+ }
134
+
95
135
  private void recreatePageOutput()
96
136
  {
97
137
  output = new MockPageOutput();
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-avro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-08 00:00:00.000000000 Z
11
+ date: 2016-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -53,6 +53,7 @@ files:
53
53
  - config/checkstyle/default.xml
54
54
  - example/.gitignore
55
55
  - example/example.yml
56
+ - example/example_nocolumns.yml
56
57
  - example/generate.rb
57
58
  - example/item.avsc
58
59
  - example/items.avro
@@ -62,7 +63,6 @@ files:
62
63
  - gradlew.bat
63
64
  - lib/embulk/guess/avro.rb
64
65
  - lib/embulk/parser/avro.rb
65
- - src/main/java/org/embulk/parser/avro/AvroColumnOption.java
66
66
  - src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
67
67
  - src/main/java/org/embulk/parser/avro/getter/AvroGenericDataConverter.java
68
68
  - src/main/java/org/embulk/parser/avro/getter/BaseColumnGetter.java
@@ -79,7 +79,7 @@ files:
79
79
  - src/test/resources/org/embulk/parser/avro/items.avro
80
80
  - classpath/avro-1.8.0.jar
81
81
  - classpath/commons-compress-1.8.1.jar
82
- - classpath/embulk-parser-avro-0.1.0.jar
82
+ - classpath/embulk-parser-avro-0.2.0.jar
83
83
  - classpath/jackson-core-asl-1.9.13.jar
84
84
  - classpath/jackson-mapper-asl-1.9.13.jar
85
85
  - classpath/paranamer-2.7.jar
@@ -1,16 +0,0 @@
1
- package org.embulk.parser.avro;
2
-
3
- import org.embulk.config.Config;
4
- import org.embulk.config.ConfigDefault;
5
- import org.embulk.config.Task;
6
- import org.embulk.spi.type.Type;
7
-
8
- import com.google.common.base.Optional;
9
-
10
- public interface AvroColumnOption
11
- extends Task
12
- {
13
- @Config("type")
14
- @ConfigDefault("null")
15
- Optional<Type> getType();
16
- }