embulk-parser-avro 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/build.gradle +7 -2
- data/lib/embulk/guess/avro.rb +3 -61
- data/src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java +97 -0
- data/src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java +81 -0
- metadata +12 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c910b362e790acea208b141982b8d9530d3c29c8
|
4
|
+
data.tar.gz: 450bc2c36228d32575e04fdd2bca4664536ca5f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 521f6e18548b1d163c2ecbbc2d2c19be584d3ab72d7530f63302ea16e9b36cbe66297c39ea4ccf08dfe4064708fc7d0a6eddeb2d472341e724687f4cc585f303
|
7
|
+
data.tar.gz: acda88bdc802a4b7c8a64818ab426a48bb39c7bb2d4ee8c1a54d9424fc8e3982ab9f5431b67f9cb90167b5ddb2bead8fa1ec54d2986d09a38e49697be0ef8f19
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
## Overview
|
6
6
|
|
7
7
|
* **Plugin type**: parser
|
8
|
-
* **Guess supported**:
|
8
|
+
* **Guess supported**: yes
|
9
9
|
|
10
10
|
## Configuration
|
11
11
|
|
@@ -102,7 +102,7 @@ out:
|
|
102
102
|
}
|
103
103
|
```
|
104
104
|
|
105
|
-
|
105
|
+
You don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let embulk guess the `parser:` section using this command:
|
106
106
|
|
107
107
|
```
|
108
108
|
$ embulk gem install embulk-parser-avro
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.3.0"
|
16
|
+
version = "0.4.0"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.8
|
19
19
|
targetCompatibility = 1.8
|
@@ -21,7 +21,12 @@ targetCompatibility = 1.8
|
|
21
21
|
dependencies {
|
22
22
|
compile "org.embulk:embulk-core:0.9.4"
|
23
23
|
provided "org.embulk:embulk-core:0.9.4"
|
24
|
-
compile "org.apache.avro:avro:1.
|
24
|
+
compile ("org.apache.avro:avro:1.10.1") {
|
25
|
+
exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
|
26
|
+
}
|
27
|
+
compile "com.github.luben:zstd-jni:1.4.5-12"
|
28
|
+
compile "org.tukaani:xz:1.8"
|
29
|
+
compile "org.xerial.snappy:snappy-java:1.1.8.1"
|
25
30
|
testCompile "junit:junit:4.+"
|
26
31
|
|
27
32
|
testCompile "org.embulk:embulk-core:0.9.4:tests"
|
data/lib/embulk/guess/avro.rb
CHANGED
@@ -1,61 +1,3 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
# TODO implement guess plugin to make this command work:
|
5
|
-
# $ embulk guess -g "avro" partial-config.yml
|
6
|
-
#
|
7
|
-
# Depending on the file format the plugin uses, you can use choose
|
8
|
-
# one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
|
9
|
-
# or line guess (LineGuessPlugin).
|
10
|
-
|
11
|
-
#class Avro < GuessPlugin
|
12
|
-
# Plugin.register_guess("avro", self)
|
13
|
-
#
|
14
|
-
# def guess(config, sample_buffer)
|
15
|
-
# if sample_buffer[0,2] == GZIP_HEADER
|
16
|
-
# guessed = {}
|
17
|
-
# guessed["type"] = "avro"
|
18
|
-
# guessed["property1"] = "guessed-value"
|
19
|
-
# return {"parser" => guessed}
|
20
|
-
# else
|
21
|
-
# return {}
|
22
|
-
# end
|
23
|
-
# end
|
24
|
-
#end
|
25
|
-
|
26
|
-
#class Avro < TextGuessPlugin
|
27
|
-
# Plugin.register_guess("avro", self)
|
28
|
-
#
|
29
|
-
# def guess_text(config, sample_text)
|
30
|
-
# js = JSON.parse(sample_text) rescue nil
|
31
|
-
# if js && js["mykeyword"] == "keyword"
|
32
|
-
# guessed = {}
|
33
|
-
# guessed["type"] = "avro"
|
34
|
-
# guessed["property1"] = "guessed-value"
|
35
|
-
# return {"parser" => guessed}
|
36
|
-
# else
|
37
|
-
# return {}
|
38
|
-
# end
|
39
|
-
# end
|
40
|
-
#end
|
41
|
-
|
42
|
-
#class Avro < LineGuessPlugin
|
43
|
-
# Plugin.register_guess("avro", self)
|
44
|
-
#
|
45
|
-
# def guess_lines(config, sample_lines)
|
46
|
-
# all_line_matched = sample_lines.all? do |line|
|
47
|
-
# line =~ /mypattern/
|
48
|
-
# end
|
49
|
-
# if all_line_matched
|
50
|
-
# guessed = {}
|
51
|
-
# guessed["type"] = "avro"
|
52
|
-
# guessed["property1"] = "guessed-value"
|
53
|
-
# return {"parser" => guessed}
|
54
|
-
# else
|
55
|
-
# return {}
|
56
|
-
# end
|
57
|
-
# end
|
58
|
-
#end
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
1
|
+
# Register the Java-implemented "avro" guess plugin with Embulk.
# The jars it needs are loaded from the gem's bundled `classpath` directory,
# located four levels above this file.
avro_guess_classpath = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_guess(
  "avro",
  "org.embulk.guess.avro.AvroGuessPlugin",
  avro_guess_classpath
)
|
@@ -0,0 +1,97 @@
|
|
1
|
+
package org.embulk.guess.avro;
|
2
|
+
|
3
|
+
import org.apache.avro.Schema;
|
4
|
+
import org.apache.avro.file.DataFileReader;
|
5
|
+
import org.apache.avro.file.SeekableByteArrayInput;
|
6
|
+
import org.apache.avro.generic.GenericDatumReader;
|
7
|
+
import org.apache.avro.generic.GenericRecord;
|
8
|
+
import org.embulk.config.ConfigDiff;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.spi.Buffer;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.embulk.spi.GuessPlugin;
|
13
|
+
import org.embulk.spi.type.Type;
|
14
|
+
import org.embulk.spi.type.Types;
|
15
|
+
|
16
|
+
import java.io.IOException;
|
17
|
+
import java.util.ArrayList;
|
18
|
+
import java.util.Arrays;
|
19
|
+
import java.util.Collections;
|
20
|
+
import java.util.EnumMap;
|
21
|
+
import java.util.HashMap;
|
22
|
+
import java.util.List;
|
23
|
+
import java.util.Map;
|
24
|
+
|
25
|
+
public class AvroGuessPlugin
|
26
|
+
implements GuessPlugin {
|
27
|
+
|
28
|
+
private static final byte[] AVRO_HEADER = {0x4f, 0x62, 0x6a, 0x01};
|
29
|
+
|
30
|
+
private static final Map<Schema.Type, Type> TYPE_MAP = new EnumMap<>(Schema.Type.class);
|
31
|
+
|
32
|
+
static {
|
33
|
+
TYPE_MAP.put(Schema.Type.STRING, Types.STRING);
|
34
|
+
TYPE_MAP.put(Schema.Type.BYTES, Types.STRING);
|
35
|
+
TYPE_MAP.put(Schema.Type.FIXED, Types.STRING);
|
36
|
+
TYPE_MAP.put(Schema.Type.ENUM, Types.STRING);
|
37
|
+
TYPE_MAP.put(Schema.Type.NULL, Types.STRING);
|
38
|
+
TYPE_MAP.put(Schema.Type.INT, Types.LONG);
|
39
|
+
TYPE_MAP.put(Schema.Type.LONG, Types.LONG);
|
40
|
+
TYPE_MAP.put(Schema.Type.FLOAT, Types.DOUBLE);
|
41
|
+
TYPE_MAP.put(Schema.Type.DOUBLE, Types.DOUBLE);
|
42
|
+
TYPE_MAP.put(Schema.Type.BOOLEAN, Types.BOOLEAN);
|
43
|
+
TYPE_MAP.put(Schema.Type.MAP, Types.JSON);
|
44
|
+
TYPE_MAP.put(Schema.Type.ARRAY, Types.JSON);
|
45
|
+
TYPE_MAP.put(Schema.Type.RECORD, Types.JSON);
|
46
|
+
}
|
47
|
+
|
48
|
+
private Type convertType(Schema.Field field) {
|
49
|
+
Schema.Type type = field.schema().getType();
|
50
|
+
if (type == Schema.Type.UNION) {
|
51
|
+
for (Schema schema : field.schema().getTypes()) {
|
52
|
+
Schema.Type t = schema.getType();
|
53
|
+
if (t != Schema.Type.NULL) {
|
54
|
+
type = t;
|
55
|
+
break;
|
56
|
+
}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
return TYPE_MAP.get(type);
|
60
|
+
}
|
61
|
+
|
62
|
+
private byte[] copyBuffer(Buffer buffer, int size) {
|
63
|
+
byte[] bytes = new byte[size];
|
64
|
+
buffer.getBytes(0, bytes, 0, size);
|
65
|
+
return bytes;
|
66
|
+
}
|
67
|
+
|
68
|
+
@Override
|
69
|
+
public ConfigDiff guess(ConfigSource config, Buffer sample) {
|
70
|
+
ConfigDiff configDiff = Exec.newConfigDiff();
|
71
|
+
|
72
|
+
byte[] bytes = copyBuffer(sample, AVRO_HEADER.length);
|
73
|
+
if (!Arrays.equals(bytes, AVRO_HEADER)) {
|
74
|
+
return configDiff;
|
75
|
+
}
|
76
|
+
ConfigDiff parserConfig = configDiff.set("parser", Collections.emptyMap()).getNested("parser");
|
77
|
+
parserConfig.set("type", "avro");
|
78
|
+
|
79
|
+
bytes = copyBuffer(sample, sample.capacity());
|
80
|
+
DataFileReader<GenericRecord> reader;
|
81
|
+
try {
|
82
|
+
reader = new DataFileReader<>(new SeekableByteArrayInput(bytes), new GenericDatumReader<>());
|
83
|
+
} catch (IOException e) {
|
84
|
+
return configDiff;
|
85
|
+
}
|
86
|
+
List<Map<String, String>> columns = new ArrayList<>();
|
87
|
+
for (Schema.Field field : reader.getSchema().getFields()) {
|
88
|
+
Map<String, String> column = new HashMap<>();
|
89
|
+
column.put("name", field.name());
|
90
|
+
column.put("type", convertType(field).getName());
|
91
|
+
columns.add(column);
|
92
|
+
}
|
93
|
+
parserConfig.set("columns", columns);
|
94
|
+
|
95
|
+
return configDiff;
|
96
|
+
}
|
97
|
+
}
|
@@ -0,0 +1,81 @@
|
|
1
|
+
package org.embulk.guess.avro;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.databind.JsonNode;
|
4
|
+
import com.fasterxml.jackson.databind.node.ObjectNode;
|
5
|
+
import com.google.common.collect.ImmutableMap;
|
6
|
+
import org.apache.commons.compress.utils.IOUtils;
|
7
|
+
import org.embulk.EmbulkTestRuntime;
|
8
|
+
import org.embulk.config.ConfigDiff;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.spi.Buffer;
|
11
|
+
import org.junit.Before;
|
12
|
+
import org.junit.Rule;
|
13
|
+
import org.junit.Test;
|
14
|
+
|
15
|
+
import java.io.IOException;
|
16
|
+
import java.io.InputStream;
|
17
|
+
import java.util.Iterator;
|
18
|
+
import java.util.Map;
|
19
|
+
|
20
|
+
import static org.junit.Assert.assertEquals;
|
21
|
+
import static org.junit.Assert.assertTrue;
|
22
|
+
|
23
|
+
public class TestAvroGuessPlugin {
|
24
|
+
|
25
|
+
@Rule
|
26
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
27
|
+
|
28
|
+
private ConfigSource config;
|
29
|
+
private AvroGuessPlugin plugin;
|
30
|
+
|
31
|
+
@Before
|
32
|
+
public void setUp() {
|
33
|
+
plugin = new AvroGuessPlugin();
|
34
|
+
config = runtime.getExec().newConfigSource();
|
35
|
+
}
|
36
|
+
|
37
|
+
@Test
|
38
|
+
public void testAvroFile() throws IOException {
|
39
|
+
Map<String, String> expectedColumns = ImmutableMap.<String, String>builder()
|
40
|
+
.put("id", "long")
|
41
|
+
.put("code", "long")
|
42
|
+
.put("name", "string")
|
43
|
+
.put("description", "string")
|
44
|
+
.put("flag", "boolean")
|
45
|
+
.put("created_at", "string")
|
46
|
+
.put("created_at_utc", "double")
|
47
|
+
.put("price", "double")
|
48
|
+
.put("spec", "json")
|
49
|
+
.put("tags", "json")
|
50
|
+
.put("options", "json")
|
51
|
+
.put("item_type", "string")
|
52
|
+
.put("dummy", "string")
|
53
|
+
.build();
|
54
|
+
|
55
|
+
ConfigDiff configDiff = guess("items.avro");
|
56
|
+
|
57
|
+
JsonNode parserNode = configDiff.getObjectNode().get("parser");
|
58
|
+
assertEquals("avro", parserNode.get("type").asText());
|
59
|
+
|
60
|
+
Iterator<JsonNode> it = parserNode.get("columns").elements();
|
61
|
+
while (it.hasNext()) {
|
62
|
+
JsonNode node = it.next();
|
63
|
+
String name = node.get("name").asText();
|
64
|
+
assertTrue(expectedColumns.containsKey(name));
|
65
|
+
assertEquals(expectedColumns.get(name), node.get("type").asText());
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
@Test
|
70
|
+
public void testNonAvroFile() throws IOException {
|
71
|
+
ConfigDiff configDiff = guess("data.json");
|
72
|
+
ObjectNode objectNode = configDiff.getObjectNode();
|
73
|
+
assertEquals(0, objectNode.size());
|
74
|
+
}
|
75
|
+
|
76
|
+
private ConfigDiff guess(String resource) throws IOException {
|
77
|
+
InputStream is = this.getClass().getResourceAsStream("/org/embulk/parser/avro/" + resource);
|
78
|
+
Buffer sample = Buffer.wrap(IOUtils.toByteArray(is));
|
79
|
+
return plugin.guess(config, sample);
|
80
|
+
}
|
81
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-avro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +63,7 @@ files:
|
|
63
63
|
- gradlew.bat
|
64
64
|
- lib/embulk/guess/avro.rb
|
65
65
|
- lib/embulk/parser/avro.rb
|
66
|
+
- src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java
|
66
67
|
- src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
|
67
68
|
- src/main/java/org/embulk/parser/avro/TimestampUnit.java
|
68
69
|
- src/main/java/org/embulk/parser/avro/TimestampUnitDeserializer.java
|
@@ -76,6 +77,7 @@ files:
|
|
76
77
|
- src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java
|
77
78
|
- src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java
|
78
79
|
- src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java
|
80
|
+
- src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java
|
79
81
|
- src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java
|
80
82
|
- src/test/resources/org/embulk/parser/avro/.gitignore
|
81
83
|
- src/test/resources/org/embulk/parser/avro/Gemfile
|
@@ -87,14 +89,14 @@ files:
|
|
87
89
|
- src/test/resources/org/embulk/parser/avro/item2.avsc
|
88
90
|
- src/test/resources/org/embulk/parser/avro/items.avro
|
89
91
|
- src/test/resources/org/embulk/parser/avro/items2.avro
|
90
|
-
- classpath/avro-1.
|
91
|
-
- classpath/
|
92
|
-
- classpath/xz-1.
|
93
|
-
- classpath/
|
94
|
-
- classpath/
|
95
|
-
- classpath/jackson-core-
|
96
|
-
- classpath/
|
97
|
-
- classpath/
|
92
|
+
- classpath/avro-1.10.1.jar
|
93
|
+
- classpath/zstd-jni-1.4.5-12.jar
|
94
|
+
- classpath/xz-1.8.jar
|
95
|
+
- classpath/snappy-java-1.1.8.1.jar
|
96
|
+
- classpath/commons-compress-1.20.jar
|
97
|
+
- classpath/jackson-core-2.11.3.jar
|
98
|
+
- classpath/slf4j-api-1.7.30.jar
|
99
|
+
- classpath/embulk-parser-avro-0.4.0.jar
|
98
100
|
homepage: https://github.com/joker1007/embulk-parser-avro
|
99
101
|
licenses:
|
100
102
|
- MIT
|