embulk-parser-avro 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ee00ffbaf0cf78dfc2c7bd0861e68603b528565
4
- data.tar.gz: 08786ae000735a7f8d2aad1514a02de9c5095ec4
3
+ metadata.gz: c910b362e790acea208b141982b8d9530d3c29c8
4
+ data.tar.gz: 450bc2c36228d32575e04fdd2bca4664536ca5f7
5
5
  SHA512:
6
- metadata.gz: 2697fb6b9cd4fb2cf6a72194450031f11ade7780d2afb050d40d65de39a10a5b56da507125b5580de820ba78e399ff39348726bb0eb82edb0c7aae13de27a555
7
- data.tar.gz: 839d69c7a19dfc6eeddff3b4fb9f2bee3172b1d4ab05793753bae1fb466ef1bfa91c5e86e099f3fe44bc5c0e0047ed3a34eb789ae26c7e0507b29c25a58ce4ca
6
+ metadata.gz: 521f6e18548b1d163c2ecbbc2d2c19be584d3ab72d7530f63302ea16e9b36cbe66297c39ea4ccf08dfe4064708fc7d0a6eddeb2d472341e724687f4cc585f303
7
+ data.tar.gz: acda88bdc802a4b7c8a64818ab426a48bb39c7bb2d4ee8c1a54d9424fc8e3982ab9f5431b67f9cb90167b5ddb2bead8fa1ec54d2986d09a38e49697be0ef8f19
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  ## Overview
6
6
 
7
7
  * **Plugin type**: parser
8
- * **Guess supported**: no
8
+ * **Guess supported**: yes
9
9
 
10
10
  ## Configuration
11
11
 
@@ -102,7 +102,7 @@ out:
102
102
  }
103
103
  ```
104
104
 
105
- (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
105
+ You don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let embulk guess the `parser:` section using this command:
106
106
 
107
107
  ```
108
108
  $ embulk gem install embulk-parser-avro
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.3.0"
16
+ version = "0.4.0"
17
17
 
18
18
  sourceCompatibility = 1.8
19
19
  targetCompatibility = 1.8
@@ -21,7 +21,12 @@ targetCompatibility = 1.8
21
21
  dependencies {
22
22
  compile "org.embulk:embulk-core:0.9.4"
23
23
  provided "org.embulk:embulk-core:0.9.4"
24
- compile "org.apache.avro:avro:1.8.2"
24
+ compile ("org.apache.avro:avro:1.10.1") {
25
+ exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
26
+ }
27
+ compile "com.github.luben:zstd-jni:1.4.5-12"
28
+ compile "org.tukaani:xz:1.8"
29
+ compile "org.xerial.snappy:snappy-java:1.1.8.1"
25
30
  testCompile "junit:junit:4.+"
26
31
 
27
32
  testCompile "org.embulk:embulk-core:0.9.4:tests"
@@ -1,61 +1,3 @@
1
- module Embulk
2
- module Guess
3
-
4
- # TODO implement guess plugin to make this command work:
5
- # $ embulk guess -g "avro" partial-config.yml
6
- #
7
- # Depending on the file format the plugin uses, you can use choose
8
- # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
- # or line guess (LineGuessPlugin).
10
-
11
- #class Avro < GuessPlugin
12
- # Plugin.register_guess("avro", self)
13
- #
14
- # def guess(config, sample_buffer)
15
- # if sample_buffer[0,2] == GZIP_HEADER
16
- # guessed = {}
17
- # guessed["type"] = "avro"
18
- # guessed["property1"] = "guessed-value"
19
- # return {"parser" => guessed}
20
- # else
21
- # return {}
22
- # end
23
- # end
24
- #end
25
-
26
- #class Avro < TextGuessPlugin
27
- # Plugin.register_guess("avro", self)
28
- #
29
- # def guess_text(config, sample_text)
30
- # js = JSON.parse(sample_text) rescue nil
31
- # if js && js["mykeyword"] == "keyword"
32
- # guessed = {}
33
- # guessed["type"] = "avro"
34
- # guessed["property1"] = "guessed-value"
35
- # return {"parser" => guessed}
36
- # else
37
- # return {}
38
- # end
39
- # end
40
- #end
41
-
42
- #class Avro < LineGuessPlugin
43
- # Plugin.register_guess("avro", self)
44
- #
45
- # def guess_lines(config, sample_lines)
46
- # all_line_matched = sample_lines.all? do |line|
47
- # line =~ /mypattern/
48
- # end
49
- # if all_line_matched
50
- # guessed = {}
51
- # guessed["type"] = "avro"
52
- # guessed["property1"] = "guessed-value"
53
- # return {"parser" => guessed}
54
- # else
55
- # return {}
56
- # end
57
- # end
58
- #end
59
-
60
- end
61
- end
1
# Register the Java-implemented guess plugin under the name "avro".
# The plugin classes are loaded from the "classpath" directory resolved
# relative to this file.
avro_guess_classpath = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_guess(
  "avro",
  "org.embulk.guess.avro.AvroGuessPlugin",
  avro_guess_classpath
)
@@ -0,0 +1,97 @@
1
+ package org.embulk.guess.avro;
2
+
3
+ import org.apache.avro.Schema;
4
+ import org.apache.avro.file.DataFileReader;
5
+ import org.apache.avro.file.SeekableByteArrayInput;
6
+ import org.apache.avro.generic.GenericDatumReader;
7
+ import org.apache.avro.generic.GenericRecord;
8
+ import org.embulk.config.ConfigDiff;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.Buffer;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.GuessPlugin;
13
+ import org.embulk.spi.type.Type;
14
+ import org.embulk.spi.type.Types;
15
+
16
+ import java.io.IOException;
17
+ import java.util.ArrayList;
18
+ import java.util.Arrays;
19
+ import java.util.Collections;
20
+ import java.util.EnumMap;
21
+ import java.util.HashMap;
22
+ import java.util.List;
23
+ import java.util.Map;
24
+
25
+ public class AvroGuessPlugin
26
+ implements GuessPlugin {
27
+
28
+ private static final byte[] AVRO_HEADER = {0x4f, 0x62, 0x6a, 0x01};
29
+
30
+ private static final Map<Schema.Type, Type> TYPE_MAP = new EnumMap<>(Schema.Type.class);
31
+
32
+ static {
33
+ TYPE_MAP.put(Schema.Type.STRING, Types.STRING);
34
+ TYPE_MAP.put(Schema.Type.BYTES, Types.STRING);
35
+ TYPE_MAP.put(Schema.Type.FIXED, Types.STRING);
36
+ TYPE_MAP.put(Schema.Type.ENUM, Types.STRING);
37
+ TYPE_MAP.put(Schema.Type.NULL, Types.STRING);
38
+ TYPE_MAP.put(Schema.Type.INT, Types.LONG);
39
+ TYPE_MAP.put(Schema.Type.LONG, Types.LONG);
40
+ TYPE_MAP.put(Schema.Type.FLOAT, Types.DOUBLE);
41
+ TYPE_MAP.put(Schema.Type.DOUBLE, Types.DOUBLE);
42
+ TYPE_MAP.put(Schema.Type.BOOLEAN, Types.BOOLEAN);
43
+ TYPE_MAP.put(Schema.Type.MAP, Types.JSON);
44
+ TYPE_MAP.put(Schema.Type.ARRAY, Types.JSON);
45
+ TYPE_MAP.put(Schema.Type.RECORD, Types.JSON);
46
+ }
47
+
48
+ private Type convertType(Schema.Field field) {
49
+ Schema.Type type = field.schema().getType();
50
+ if (type == Schema.Type.UNION) {
51
+ for (Schema schema : field.schema().getTypes()) {
52
+ Schema.Type t = schema.getType();
53
+ if (t != Schema.Type.NULL) {
54
+ type = t;
55
+ break;
56
+ }
57
+ }
58
+ }
59
+ return TYPE_MAP.get(type);
60
+ }
61
+
62
+ private byte[] copyBuffer(Buffer buffer, int size) {
63
+ byte[] bytes = new byte[size];
64
+ buffer.getBytes(0, bytes, 0, size);
65
+ return bytes;
66
+ }
67
+
68
+ @Override
69
+ public ConfigDiff guess(ConfigSource config, Buffer sample) {
70
+ ConfigDiff configDiff = Exec.newConfigDiff();
71
+
72
+ byte[] bytes = copyBuffer(sample, AVRO_HEADER.length);
73
+ if (!Arrays.equals(bytes, AVRO_HEADER)) {
74
+ return configDiff;
75
+ }
76
+ ConfigDiff parserConfig = configDiff.set("parser", Collections.emptyMap()).getNested("parser");
77
+ parserConfig.set("type", "avro");
78
+
79
+ bytes = copyBuffer(sample, sample.capacity());
80
+ DataFileReader<GenericRecord> reader;
81
+ try {
82
+ reader = new DataFileReader<>(new SeekableByteArrayInput(bytes), new GenericDatumReader<>());
83
+ } catch (IOException e) {
84
+ return configDiff;
85
+ }
86
+ List<Map<String, String>> columns = new ArrayList<>();
87
+ for (Schema.Field field : reader.getSchema().getFields()) {
88
+ Map<String, String> column = new HashMap<>();
89
+ column.put("name", field.name());
90
+ column.put("type", convertType(field).getName());
91
+ columns.add(column);
92
+ }
93
+ parserConfig.set("columns", columns);
94
+
95
+ return configDiff;
96
+ }
97
+ }
@@ -0,0 +1,81 @@
1
+ package org.embulk.guess.avro;
2
+
3
+ import com.fasterxml.jackson.databind.JsonNode;
4
+ import com.fasterxml.jackson.databind.node.ObjectNode;
5
+ import com.google.common.collect.ImmutableMap;
6
+ import org.apache.commons.compress.utils.IOUtils;
7
+ import org.embulk.EmbulkTestRuntime;
8
+ import org.embulk.config.ConfigDiff;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.Buffer;
11
+ import org.junit.Before;
12
+ import org.junit.Rule;
13
+ import org.junit.Test;
14
+
15
+ import java.io.IOException;
16
+ import java.io.InputStream;
17
+ import java.util.Iterator;
18
+ import java.util.Map;
19
+
20
+ import static org.junit.Assert.assertEquals;
21
+ import static org.junit.Assert.assertTrue;
22
+
23
+ public class TestAvroGuessPlugin {
24
+
25
+ @Rule
26
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
27
+
28
+ private ConfigSource config;
29
+ private AvroGuessPlugin plugin;
30
+
31
+ @Before
32
+ public void setUp() {
33
+ plugin = new AvroGuessPlugin();
34
+ config = runtime.getExec().newConfigSource();
35
+ }
36
+
37
+ @Test
38
+ public void testAvroFile() throws IOException {
39
+ Map<String, String> expectedColumns = ImmutableMap.<String, String>builder()
40
+ .put("id", "long")
41
+ .put("code", "long")
42
+ .put("name", "string")
43
+ .put("description", "string")
44
+ .put("flag", "boolean")
45
+ .put("created_at", "string")
46
+ .put("created_at_utc", "double")
47
+ .put("price", "double")
48
+ .put("spec", "json")
49
+ .put("tags", "json")
50
+ .put("options", "json")
51
+ .put("item_type", "string")
52
+ .put("dummy", "string")
53
+ .build();
54
+
55
+ ConfigDiff configDiff = guess("items.avro");
56
+
57
+ JsonNode parserNode = configDiff.getObjectNode().get("parser");
58
+ assertEquals("avro", parserNode.get("type").asText());
59
+
60
+ Iterator<JsonNode> it = parserNode.get("columns").elements();
61
+ while (it.hasNext()) {
62
+ JsonNode node = it.next();
63
+ String name = node.get("name").asText();
64
+ assertTrue(expectedColumns.containsKey(name));
65
+ assertEquals(expectedColumns.get(name), node.get("type").asText());
66
+ }
67
+ }
68
+
69
+ @Test
70
+ public void testNonAvroFile() throws IOException {
71
+ ConfigDiff configDiff = guess("data.json");
72
+ ObjectNode objectNode = configDiff.getObjectNode();
73
+ assertEquals(0, objectNode.size());
74
+ }
75
+
76
+ private ConfigDiff guess(String resource) throws IOException {
77
+ InputStream is = this.getClass().getResourceAsStream("/org/embulk/parser/avro/" + resource);
78
+ Buffer sample = Buffer.wrap(IOUtils.toByteArray(is));
79
+ return plugin.guess(config, sample);
80
+ }
81
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-avro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-23 00:00:00.000000000 Z
11
+ date: 2021-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +63,7 @@ files:
63
63
  - gradlew.bat
64
64
  - lib/embulk/guess/avro.rb
65
65
  - lib/embulk/parser/avro.rb
66
+ - src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java
66
67
  - src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
67
68
  - src/main/java/org/embulk/parser/avro/TimestampUnit.java
68
69
  - src/main/java/org/embulk/parser/avro/TimestampUnitDeserializer.java
@@ -76,6 +77,7 @@ files:
76
77
  - src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java
77
78
  - src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java
78
79
  - src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java
80
+ - src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java
79
81
  - src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java
80
82
  - src/test/resources/org/embulk/parser/avro/.gitignore
81
83
  - src/test/resources/org/embulk/parser/avro/Gemfile
@@ -87,14 +89,14 @@ files:
87
89
  - src/test/resources/org/embulk/parser/avro/item2.avsc
88
90
  - src/test/resources/org/embulk/parser/avro/items.avro
89
91
  - src/test/resources/org/embulk/parser/avro/items2.avro
90
- - classpath/avro-1.8.2.jar
91
- - classpath/paranamer-2.7.jar
92
- - classpath/xz-1.5.jar
93
- - classpath/commons-compress-1.8.1.jar
94
- - classpath/embulk-parser-avro-0.3.0.jar
95
- - classpath/jackson-core-asl-1.9.13.jar
96
- - classpath/snappy-java-1.1.1.3.jar
97
- - classpath/jackson-mapper-asl-1.9.13.jar
92
+ - classpath/avro-1.10.1.jar
93
+ - classpath/zstd-jni-1.4.5-12.jar
94
+ - classpath/xz-1.8.jar
95
+ - classpath/snappy-java-1.1.8.1.jar
96
+ - classpath/commons-compress-1.20.jar
97
+ - classpath/jackson-core-2.11.3.jar
98
+ - classpath/slf4j-api-1.7.30.jar
99
+ - classpath/embulk-parser-avro-0.4.0.jar
98
100
  homepage: https://github.com/joker1007/embulk-parser-avro
99
101
  licenses:
100
102
  - MIT