embulk-parser-avro 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/build.gradle +7 -2
- data/lib/embulk/guess/avro.rb +3 -61
- data/src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java +97 -0
- data/src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java +81 -0
- metadata +12 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c910b362e790acea208b141982b8d9530d3c29c8
|
4
|
+
data.tar.gz: 450bc2c36228d32575e04fdd2bca4664536ca5f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 521f6e18548b1d163c2ecbbc2d2c19be584d3ab72d7530f63302ea16e9b36cbe66297c39ea4ccf08dfe4064708fc7d0a6eddeb2d472341e724687f4cc585f303
|
7
|
+
data.tar.gz: acda88bdc802a4b7c8a64818ab426a48bb39c7bb2d4ee8c1a54d9424fc8e3982ab9f5431b67f9cb90167b5ddb2bead8fa1ec54d2986d09a38e49697be0ef8f19
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
## Overview
|
6
6
|
|
7
7
|
* **Plugin type**: parser
|
8
|
-
* **Guess supported**:
|
8
|
+
* **Guess supported**: yes
|
9
9
|
|
10
10
|
## Configuration
|
11
11
|
|
@@ -102,7 +102,7 @@ out:
|
|
102
102
|
}
|
103
103
|
```
|
104
104
|
|
105
|
-
|
105
|
+
You don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
|
106
106
|
|
107
107
|
```
|
108
108
|
$ embulk gem install embulk-parser-avro
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.3.0"
|
16
|
+
version = "0.4.0"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.8
|
19
19
|
targetCompatibility = 1.8
|
@@ -21,7 +21,12 @@ targetCompatibility = 1.8
|
|
21
21
|
dependencies {
|
22
22
|
compile "org.embulk:embulk-core:0.9.4"
|
23
23
|
provided "org.embulk:embulk-core:0.9.4"
|
24
|
-
compile "org.apache.avro:avro:1.
|
24
|
+
compile ("org.apache.avro:avro:1.10.1") {
|
25
|
+
exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
|
26
|
+
}
|
27
|
+
compile "com.github.luben:zstd-jni:1.4.5-12"
|
28
|
+
compile "org.tukaani:xz:1.8"
|
29
|
+
compile "org.xerial.snappy:snappy-java:1.1.8.1"
|
25
30
|
testCompile "junit:junit:4.+"
|
26
31
|
|
27
32
|
testCompile "org.embulk:embulk-core:0.9.4:tests"
|
data/lib/embulk/guess/avro.rb
CHANGED
@@ -1,61 +1,3 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
# TODO implement guess plugin to make this command work:
|
5
|
-
# $ embulk guess -g "avro" partial-config.yml
|
6
|
-
#
|
7
|
-
# Depending on the file format the plugin uses, you can use choose
|
8
|
-
# one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
|
9
|
-
# or line guess (LineGuessPlugin).
|
10
|
-
|
11
|
-
#class Avro < GuessPlugin
|
12
|
-
# Plugin.register_guess("avro", self)
|
13
|
-
#
|
14
|
-
# def guess(config, sample_buffer)
|
15
|
-
# if sample_buffer[0,2] == GZIP_HEADER
|
16
|
-
# guessed = {}
|
17
|
-
# guessed["type"] = "avro"
|
18
|
-
# guessed["property1"] = "guessed-value"
|
19
|
-
# return {"parser" => guessed}
|
20
|
-
# else
|
21
|
-
# return {}
|
22
|
-
# end
|
23
|
-
# end
|
24
|
-
#end
|
25
|
-
|
26
|
-
#class Avro < TextGuessPlugin
|
27
|
-
# Plugin.register_guess("avro", self)
|
28
|
-
#
|
29
|
-
# def guess_text(config, sample_text)
|
30
|
-
# js = JSON.parse(sample_text) rescue nil
|
31
|
-
# if js && js["mykeyword"] == "keyword"
|
32
|
-
# guessed = {}
|
33
|
-
# guessed["type"] = "avro"
|
34
|
-
# guessed["property1"] = "guessed-value"
|
35
|
-
# return {"parser" => guessed}
|
36
|
-
# else
|
37
|
-
# return {}
|
38
|
-
# end
|
39
|
-
# end
|
40
|
-
#end
|
41
|
-
|
42
|
-
#class Avro < LineGuessPlugin
|
43
|
-
# Plugin.register_guess("avro", self)
|
44
|
-
#
|
45
|
-
# def guess_lines(config, sample_lines)
|
46
|
-
# all_line_matched = sample_lines.all? do |line|
|
47
|
-
# line =~ /mypattern/
|
48
|
-
# end
|
49
|
-
# if all_line_matched
|
50
|
-
# guessed = {}
|
51
|
-
# guessed["type"] = "avro"
|
52
|
-
# guessed["property1"] = "guessed-value"
|
53
|
-
# return {"parser" => guessed}
|
54
|
-
# else
|
55
|
-
# return {}
|
56
|
-
# end
|
57
|
-
# end
|
58
|
-
#end
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
1
|
+
Embulk::JavaPlugin.register_guess(
|
2
|
+
"avro", "org.embulk.guess.avro.AvroGuessPlugin",
|
3
|
+
File.expand_path('../../../../classpath', __FILE__))
|
@@ -0,0 +1,97 @@
|
|
1
|
+
package org.embulk.guess.avro;
|
2
|
+
|
3
|
+
import org.apache.avro.Schema;
|
4
|
+
import org.apache.avro.file.DataFileReader;
|
5
|
+
import org.apache.avro.file.SeekableByteArrayInput;
|
6
|
+
import org.apache.avro.generic.GenericDatumReader;
|
7
|
+
import org.apache.avro.generic.GenericRecord;
|
8
|
+
import org.embulk.config.ConfigDiff;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.spi.Buffer;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.embulk.spi.GuessPlugin;
|
13
|
+
import org.embulk.spi.type.Type;
|
14
|
+
import org.embulk.spi.type.Types;
|
15
|
+
|
16
|
+
import java.io.IOException;
|
17
|
+
import java.util.ArrayList;
|
18
|
+
import java.util.Arrays;
|
19
|
+
import java.util.Collections;
|
20
|
+
import java.util.EnumMap;
|
21
|
+
import java.util.HashMap;
|
22
|
+
import java.util.List;
|
23
|
+
import java.util.Map;
|
24
|
+
|
25
|
+
public class AvroGuessPlugin
|
26
|
+
implements GuessPlugin {
|
27
|
+
|
28
|
+
private static final byte[] AVRO_HEADER = {0x4f, 0x62, 0x6a, 0x01};
|
29
|
+
|
30
|
+
private static final Map<Schema.Type, Type> TYPE_MAP = new EnumMap<>(Schema.Type.class);
|
31
|
+
|
32
|
+
static {
|
33
|
+
TYPE_MAP.put(Schema.Type.STRING, Types.STRING);
|
34
|
+
TYPE_MAP.put(Schema.Type.BYTES, Types.STRING);
|
35
|
+
TYPE_MAP.put(Schema.Type.FIXED, Types.STRING);
|
36
|
+
TYPE_MAP.put(Schema.Type.ENUM, Types.STRING);
|
37
|
+
TYPE_MAP.put(Schema.Type.NULL, Types.STRING);
|
38
|
+
TYPE_MAP.put(Schema.Type.INT, Types.LONG);
|
39
|
+
TYPE_MAP.put(Schema.Type.LONG, Types.LONG);
|
40
|
+
TYPE_MAP.put(Schema.Type.FLOAT, Types.DOUBLE);
|
41
|
+
TYPE_MAP.put(Schema.Type.DOUBLE, Types.DOUBLE);
|
42
|
+
TYPE_MAP.put(Schema.Type.BOOLEAN, Types.BOOLEAN);
|
43
|
+
TYPE_MAP.put(Schema.Type.MAP, Types.JSON);
|
44
|
+
TYPE_MAP.put(Schema.Type.ARRAY, Types.JSON);
|
45
|
+
TYPE_MAP.put(Schema.Type.RECORD, Types.JSON);
|
46
|
+
}
|
47
|
+
|
48
|
+
private Type convertType(Schema.Field field) {
|
49
|
+
Schema.Type type = field.schema().getType();
|
50
|
+
if (type == Schema.Type.UNION) {
|
51
|
+
for (Schema schema : field.schema().getTypes()) {
|
52
|
+
Schema.Type t = schema.getType();
|
53
|
+
if (t != Schema.Type.NULL) {
|
54
|
+
type = t;
|
55
|
+
break;
|
56
|
+
}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
return TYPE_MAP.get(type);
|
60
|
+
}
|
61
|
+
|
62
|
+
private byte[] copyBuffer(Buffer buffer, int size) {
|
63
|
+
byte[] bytes = new byte[size];
|
64
|
+
buffer.getBytes(0, bytes, 0, size);
|
65
|
+
return bytes;
|
66
|
+
}
|
67
|
+
|
68
|
+
@Override
|
69
|
+
public ConfigDiff guess(ConfigSource config, Buffer sample) {
|
70
|
+
ConfigDiff configDiff = Exec.newConfigDiff();
|
71
|
+
|
72
|
+
byte[] bytes = copyBuffer(sample, AVRO_HEADER.length);
|
73
|
+
if (!Arrays.equals(bytes, AVRO_HEADER)) {
|
74
|
+
return configDiff;
|
75
|
+
}
|
76
|
+
ConfigDiff parserConfig = configDiff.set("parser", Collections.emptyMap()).getNested("parser");
|
77
|
+
parserConfig.set("type", "avro");
|
78
|
+
|
79
|
+
bytes = copyBuffer(sample, sample.capacity());
|
80
|
+
DataFileReader<GenericRecord> reader;
|
81
|
+
try {
|
82
|
+
reader = new DataFileReader<>(new SeekableByteArrayInput(bytes), new GenericDatumReader<>());
|
83
|
+
} catch (IOException e) {
|
84
|
+
return configDiff;
|
85
|
+
}
|
86
|
+
List<Map<String, String>> columns = new ArrayList<>();
|
87
|
+
for (Schema.Field field : reader.getSchema().getFields()) {
|
88
|
+
Map<String, String> column = new HashMap<>();
|
89
|
+
column.put("name", field.name());
|
90
|
+
column.put("type", convertType(field).getName());
|
91
|
+
columns.add(column);
|
92
|
+
}
|
93
|
+
parserConfig.set("columns", columns);
|
94
|
+
|
95
|
+
return configDiff;
|
96
|
+
}
|
97
|
+
}
|
@@ -0,0 +1,81 @@
|
|
1
|
+
package org.embulk.guess.avro;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.databind.JsonNode;
|
4
|
+
import com.fasterxml.jackson.databind.node.ObjectNode;
|
5
|
+
import com.google.common.collect.ImmutableMap;
|
6
|
+
import org.apache.commons.compress.utils.IOUtils;
|
7
|
+
import org.embulk.EmbulkTestRuntime;
|
8
|
+
import org.embulk.config.ConfigDiff;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.spi.Buffer;
|
11
|
+
import org.junit.Before;
|
12
|
+
import org.junit.Rule;
|
13
|
+
import org.junit.Test;
|
14
|
+
|
15
|
+
import java.io.IOException;
|
16
|
+
import java.io.InputStream;
|
17
|
+
import java.util.Iterator;
|
18
|
+
import java.util.Map;
|
19
|
+
|
20
|
+
import static org.junit.Assert.assertEquals;
|
21
|
+
import static org.junit.Assert.assertTrue;
|
22
|
+
|
23
|
+
public class TestAvroGuessPlugin {
|
24
|
+
|
25
|
+
@Rule
|
26
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
27
|
+
|
28
|
+
private ConfigSource config;
|
29
|
+
private AvroGuessPlugin plugin;
|
30
|
+
|
31
|
+
@Before
|
32
|
+
public void setUp() {
|
33
|
+
plugin = new AvroGuessPlugin();
|
34
|
+
config = runtime.getExec().newConfigSource();
|
35
|
+
}
|
36
|
+
|
37
|
+
@Test
|
38
|
+
public void testAvroFile() throws IOException {
|
39
|
+
Map<String, String> expectedColumns = ImmutableMap.<String, String>builder()
|
40
|
+
.put("id", "long")
|
41
|
+
.put("code", "long")
|
42
|
+
.put("name", "string")
|
43
|
+
.put("description", "string")
|
44
|
+
.put("flag", "boolean")
|
45
|
+
.put("created_at", "string")
|
46
|
+
.put("created_at_utc", "double")
|
47
|
+
.put("price", "double")
|
48
|
+
.put("spec", "json")
|
49
|
+
.put("tags", "json")
|
50
|
+
.put("options", "json")
|
51
|
+
.put("item_type", "string")
|
52
|
+
.put("dummy", "string")
|
53
|
+
.build();
|
54
|
+
|
55
|
+
ConfigDiff configDiff = guess("items.avro");
|
56
|
+
|
57
|
+
JsonNode parserNode = configDiff.getObjectNode().get("parser");
|
58
|
+
assertEquals("avro", parserNode.get("type").asText());
|
59
|
+
|
60
|
+
Iterator<JsonNode> it = parserNode.get("columns").elements();
|
61
|
+
while (it.hasNext()) {
|
62
|
+
JsonNode node = it.next();
|
63
|
+
String name = node.get("name").asText();
|
64
|
+
assertTrue(expectedColumns.containsKey(name));
|
65
|
+
assertEquals(expectedColumns.get(name), node.get("type").asText());
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
@Test
|
70
|
+
public void testNonAvroFile() throws IOException {
|
71
|
+
ConfigDiff configDiff = guess("data.json");
|
72
|
+
ObjectNode objectNode = configDiff.getObjectNode();
|
73
|
+
assertEquals(0, objectNode.size());
|
74
|
+
}
|
75
|
+
|
76
|
+
private ConfigDiff guess(String resource) throws IOException {
|
77
|
+
InputStream is = this.getClass().getResourceAsStream("/org/embulk/parser/avro/" + resource);
|
78
|
+
Buffer sample = Buffer.wrap(IOUtils.toByteArray(is));
|
79
|
+
return plugin.guess(config, sample);
|
80
|
+
}
|
81
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-avro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +63,7 @@ files:
|
|
63
63
|
- gradlew.bat
|
64
64
|
- lib/embulk/guess/avro.rb
|
65
65
|
- lib/embulk/parser/avro.rb
|
66
|
+
- src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java
|
66
67
|
- src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
|
67
68
|
- src/main/java/org/embulk/parser/avro/TimestampUnit.java
|
68
69
|
- src/main/java/org/embulk/parser/avro/TimestampUnitDeserializer.java
|
@@ -76,6 +77,7 @@ files:
|
|
76
77
|
- src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java
|
77
78
|
- src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java
|
78
79
|
- src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java
|
80
|
+
- src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java
|
79
81
|
- src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java
|
80
82
|
- src/test/resources/org/embulk/parser/avro/.gitignore
|
81
83
|
- src/test/resources/org/embulk/parser/avro/Gemfile
|
@@ -87,14 +89,14 @@ files:
|
|
87
89
|
- src/test/resources/org/embulk/parser/avro/item2.avsc
|
88
90
|
- src/test/resources/org/embulk/parser/avro/items.avro
|
89
91
|
- src/test/resources/org/embulk/parser/avro/items2.avro
|
90
|
-
- classpath/avro-1.
|
91
|
-
- classpath/
|
92
|
-
- classpath/xz-1.
|
93
|
-
- classpath/
|
94
|
-
- classpath/
|
95
|
-
- classpath/jackson-core-
|
96
|
-
- classpath/
|
97
|
-
- classpath/
|
92
|
+
- classpath/avro-1.10.1.jar
|
93
|
+
- classpath/zstd-jni-1.4.5-12.jar
|
94
|
+
- classpath/xz-1.8.jar
|
95
|
+
- classpath/snappy-java-1.1.8.1.jar
|
96
|
+
- classpath/commons-compress-1.20.jar
|
97
|
+
- classpath/jackson-core-2.11.3.jar
|
98
|
+
- classpath/slf4j-api-1.7.30.jar
|
99
|
+
- classpath/embulk-parser-avro-0.4.0.jar
|
98
100
|
homepage: https://github.com/joker1007/embulk-parser-avro
|
99
101
|
licenses:
|
100
102
|
- MIT
|