embulk-parser-avro 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ee00ffbaf0cf78dfc2c7bd0861e68603b528565
4
- data.tar.gz: 08786ae000735a7f8d2aad1514a02de9c5095ec4
3
+ metadata.gz: c910b362e790acea208b141982b8d9530d3c29c8
4
+ data.tar.gz: 450bc2c36228d32575e04fdd2bca4664536ca5f7
5
5
  SHA512:
6
- metadata.gz: 2697fb6b9cd4fb2cf6a72194450031f11ade7780d2afb050d40d65de39a10a5b56da507125b5580de820ba78e399ff39348726bb0eb82edb0c7aae13de27a555
7
- data.tar.gz: 839d69c7a19dfc6eeddff3b4fb9f2bee3172b1d4ab05793753bae1fb466ef1bfa91c5e86e099f3fe44bc5c0e0047ed3a34eb789ae26c7e0507b29c25a58ce4ca
6
+ metadata.gz: 521f6e18548b1d163c2ecbbc2d2c19be584d3ab72d7530f63302ea16e9b36cbe66297c39ea4ccf08dfe4064708fc7d0a6eddeb2d472341e724687f4cc585f303
7
+ data.tar.gz: acda88bdc802a4b7c8a64818ab426a48bb39c7bb2d4ee8c1a54d9424fc8e3982ab9f5431b67f9cb90167b5ddb2bead8fa1ec54d2986d09a38e49697be0ef8f19
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  ## Overview
6
6
 
7
7
  * **Plugin type**: parser
8
- * **Guess supported**: no
8
+ * **Guess supported**: yes
9
9
 
10
10
  ## Configuration
11
11
 
@@ -102,7 +102,7 @@ out:
102
102
  }
103
103
  ```
104
104
 
105
- (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
105
+ You don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
106
106
 
107
107
  ```
108
108
  $ embulk gem install embulk-parser-avro
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.3.0"
16
+ version = "0.4.0"
17
17
 
18
18
  sourceCompatibility = 1.8
19
19
  targetCompatibility = 1.8
@@ -21,7 +21,12 @@ targetCompatibility = 1.8
21
21
  dependencies {
22
22
  compile "org.embulk:embulk-core:0.9.4"
23
23
  provided "org.embulk:embulk-core:0.9.4"
24
- compile "org.apache.avro:avro:1.8.2"
24
+ compile ("org.apache.avro:avro:1.10.1") {
25
+ exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
26
+ }
27
+ compile "com.github.luben:zstd-jni:1.4.5-12"
28
+ compile "org.tukaani:xz:1.8"
29
+ compile "org.xerial.snappy:snappy-java:1.1.8.1"
25
30
  testCompile "junit:junit:4.+"
26
31
 
27
32
  testCompile "org.embulk:embulk-core:0.9.4:tests"
data/lib/embulk/guess/avro.rb CHANGED
@@ -1,61 +1,3 @@
1
- module Embulk
2
- module Guess
3
-
4
- # TODO implement guess plugin to make this command work:
5
- # $ embulk guess -g "avro" partial-config.yml
6
- #
7
- # Depending on the file format the plugin uses, you can use choose
8
- # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
- # or line guess (LineGuessPlugin).
10
-
11
- #class Avro < GuessPlugin
12
- # Plugin.register_guess("avro", self)
13
- #
14
- # def guess(config, sample_buffer)
15
- # if sample_buffer[0,2] == GZIP_HEADER
16
- # guessed = {}
17
- # guessed["type"] = "avro"
18
- # guessed["property1"] = "guessed-value"
19
- # return {"parser" => guessed}
20
- # else
21
- # return {}
22
- # end
23
- # end
24
- #end
25
-
26
- #class Avro < TextGuessPlugin
27
- # Plugin.register_guess("avro", self)
28
- #
29
- # def guess_text(config, sample_text)
30
- # js = JSON.parse(sample_text) rescue nil
31
- # if js && js["mykeyword"] == "keyword"
32
- # guessed = {}
33
- # guessed["type"] = "avro"
34
- # guessed["property1"] = "guessed-value"
35
- # return {"parser" => guessed}
36
- # else
37
- # return {}
38
- # end
39
- # end
40
- #end
41
-
42
- #class Avro < LineGuessPlugin
43
- # Plugin.register_guess("avro", self)
44
- #
45
- # def guess_lines(config, sample_lines)
46
- # all_line_matched = sample_lines.all? do |line|
47
- # line =~ /mypattern/
48
- # end
49
- # if all_line_matched
50
- # guessed = {}
51
- # guessed["type"] = "avro"
52
- # guessed["property1"] = "guessed-value"
53
- # return {"parser" => guessed}
54
- # else
55
- # return {}
56
- # end
57
- # end
58
- #end
59
-
60
- end
61
- end
1
+ Embulk::JavaPlugin.register_guess(
2
+ "avro", "org.embulk.guess.avro.AvroGuessPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java ADDED
@@ -0,0 +1,97 @@
1
+ package org.embulk.guess.avro;
2
+
3
+ import org.apache.avro.Schema;
4
+ import org.apache.avro.file.DataFileReader;
5
+ import org.apache.avro.file.SeekableByteArrayInput;
6
+ import org.apache.avro.generic.GenericDatumReader;
7
+ import org.apache.avro.generic.GenericRecord;
8
+ import org.embulk.config.ConfigDiff;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.Buffer;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.GuessPlugin;
13
+ import org.embulk.spi.type.Type;
14
+ import org.embulk.spi.type.Types;
15
+
16
+ import java.io.IOException;
17
+ import java.util.ArrayList;
18
+ import java.util.Arrays;
19
+ import java.util.Collections;
20
+ import java.util.EnumMap;
21
+ import java.util.HashMap;
22
+ import java.util.List;
23
+ import java.util.Map;
24
+
25
+ public class AvroGuessPlugin
26
+ implements GuessPlugin {
27
+
28
+ private static final byte[] AVRO_HEADER = {0x4f, 0x62, 0x6a, 0x01};
29
+
30
+ private static final Map<Schema.Type, Type> TYPE_MAP = new EnumMap<>(Schema.Type.class);
31
+
32
+ static {
33
+ TYPE_MAP.put(Schema.Type.STRING, Types.STRING);
34
+ TYPE_MAP.put(Schema.Type.BYTES, Types.STRING);
35
+ TYPE_MAP.put(Schema.Type.FIXED, Types.STRING);
36
+ TYPE_MAP.put(Schema.Type.ENUM, Types.STRING);
37
+ TYPE_MAP.put(Schema.Type.NULL, Types.STRING);
38
+ TYPE_MAP.put(Schema.Type.INT, Types.LONG);
39
+ TYPE_MAP.put(Schema.Type.LONG, Types.LONG);
40
+ TYPE_MAP.put(Schema.Type.FLOAT, Types.DOUBLE);
41
+ TYPE_MAP.put(Schema.Type.DOUBLE, Types.DOUBLE);
42
+ TYPE_MAP.put(Schema.Type.BOOLEAN, Types.BOOLEAN);
43
+ TYPE_MAP.put(Schema.Type.MAP, Types.JSON);
44
+ TYPE_MAP.put(Schema.Type.ARRAY, Types.JSON);
45
+ TYPE_MAP.put(Schema.Type.RECORD, Types.JSON);
46
+ }
47
+
48
+ private Type convertType(Schema.Field field) {
49
+ Schema.Type type = field.schema().getType();
50
+ if (type == Schema.Type.UNION) {
51
+ for (Schema schema : field.schema().getTypes()) {
52
+ Schema.Type t = schema.getType();
53
+ if (t != Schema.Type.NULL) {
54
+ type = t;
55
+ break;
56
+ }
57
+ }
58
+ }
59
+ return TYPE_MAP.get(type);
60
+ }
61
+
62
+ private byte[] copyBuffer(Buffer buffer, int size) {
63
+ byte[] bytes = new byte[size];
64
+ buffer.getBytes(0, bytes, 0, size);
65
+ return bytes;
66
+ }
67
+
68
+ @Override
69
+ public ConfigDiff guess(ConfigSource config, Buffer sample) {
70
+ ConfigDiff configDiff = Exec.newConfigDiff();
71
+
72
+ byte[] bytes = copyBuffer(sample, AVRO_HEADER.length);
73
+ if (!Arrays.equals(bytes, AVRO_HEADER)) {
74
+ return configDiff;
75
+ }
76
+ ConfigDiff parserConfig = configDiff.set("parser", Collections.emptyMap()).getNested("parser");
77
+ parserConfig.set("type", "avro");
78
+
79
+ bytes = copyBuffer(sample, sample.capacity());
80
+ DataFileReader<GenericRecord> reader;
81
+ try {
82
+ reader = new DataFileReader<>(new SeekableByteArrayInput(bytes), new GenericDatumReader<>());
83
+ } catch (IOException e) {
84
+ return configDiff;
85
+ }
86
+ List<Map<String, String>> columns = new ArrayList<>();
87
+ for (Schema.Field field : reader.getSchema().getFields()) {
88
+ Map<String, String> column = new HashMap<>();
89
+ column.put("name", field.name());
90
+ column.put("type", convertType(field).getName());
91
+ columns.add(column);
92
+ }
93
+ parserConfig.set("columns", columns);
94
+
95
+ return configDiff;
96
+ }
97
+ }
data/src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java ADDED
@@ -0,0 +1,81 @@
1
+ package org.embulk.guess.avro;
2
+
3
+ import com.fasterxml.jackson.databind.JsonNode;
4
+ import com.fasterxml.jackson.databind.node.ObjectNode;
5
+ import com.google.common.collect.ImmutableMap;
6
+ import org.apache.commons.compress.utils.IOUtils;
7
+ import org.embulk.EmbulkTestRuntime;
8
+ import org.embulk.config.ConfigDiff;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.Buffer;
11
+ import org.junit.Before;
12
+ import org.junit.Rule;
13
+ import org.junit.Test;
14
+
15
+ import java.io.IOException;
16
+ import java.io.InputStream;
17
+ import java.util.Iterator;
18
+ import java.util.Map;
19
+
20
+ import static org.junit.Assert.assertEquals;
21
+ import static org.junit.Assert.assertTrue;
22
+
23
+ public class TestAvroGuessPlugin {
24
+
25
+ @Rule
26
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
27
+
28
+ private ConfigSource config;
29
+ private AvroGuessPlugin plugin;
30
+
31
+ @Before
32
+ public void setUp() {
33
+ plugin = new AvroGuessPlugin();
34
+ config = runtime.getExec().newConfigSource();
35
+ }
36
+
37
+ @Test
38
+ public void testAvroFile() throws IOException {
39
+ Map<String, String> expectedColumns = ImmutableMap.<String, String>builder()
40
+ .put("id", "long")
41
+ .put("code", "long")
42
+ .put("name", "string")
43
+ .put("description", "string")
44
+ .put("flag", "boolean")
45
+ .put("created_at", "string")
46
+ .put("created_at_utc", "double")
47
+ .put("price", "double")
48
+ .put("spec", "json")
49
+ .put("tags", "json")
50
+ .put("options", "json")
51
+ .put("item_type", "string")
52
+ .put("dummy", "string")
53
+ .build();
54
+
55
+ ConfigDiff configDiff = guess("items.avro");
56
+
57
+ JsonNode parserNode = configDiff.getObjectNode().get("parser");
58
+ assertEquals("avro", parserNode.get("type").asText());
59
+
60
+ Iterator<JsonNode> it = parserNode.get("columns").elements();
61
+ while (it.hasNext()) {
62
+ JsonNode node = it.next();
63
+ String name = node.get("name").asText();
64
+ assertTrue(expectedColumns.containsKey(name));
65
+ assertEquals(expectedColumns.get(name), node.get("type").asText());
66
+ }
67
+ }
68
+
69
+ @Test
70
+ public void testNonAvroFile() throws IOException {
71
+ ConfigDiff configDiff = guess("data.json");
72
+ ObjectNode objectNode = configDiff.getObjectNode();
73
+ assertEquals(0, objectNode.size());
74
+ }
75
+
76
+ private ConfigDiff guess(String resource) throws IOException {
77
+ InputStream is = this.getClass().getResourceAsStream("/org/embulk/parser/avro/" + resource);
78
+ Buffer sample = Buffer.wrap(IOUtils.toByteArray(is));
79
+ return plugin.guess(config, sample);
80
+ }
81
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-avro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-23 00:00:00.000000000 Z
11
+ date: 2021-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +63,7 @@ files:
63
63
  - gradlew.bat
64
64
  - lib/embulk/guess/avro.rb
65
65
  - lib/embulk/parser/avro.rb
66
+ - src/main/java/org/embulk/guess/avro/AvroGuessPlugin.java
66
67
  - src/main/java/org/embulk/parser/avro/AvroParserPlugin.java
67
68
  - src/main/java/org/embulk/parser/avro/TimestampUnit.java
68
69
  - src/main/java/org/embulk/parser/avro/TimestampUnitDeserializer.java
@@ -76,6 +77,7 @@ files:
76
77
  - src/main/java/org/embulk/parser/avro/getter/IntegerColumnGetter.java
77
78
  - src/main/java/org/embulk/parser/avro/getter/LongColumnGetter.java
78
79
  - src/main/java/org/embulk/parser/avro/getter/StringColumnGetter.java
80
+ - src/test/java/org/embulk/guess/avro/TestAvroGuessPlugin.java
79
81
  - src/test/java/org/embulk/parser/avro/TestAvroParserPlugin.java
80
82
  - src/test/resources/org/embulk/parser/avro/.gitignore
81
83
  - src/test/resources/org/embulk/parser/avro/Gemfile
@@ -87,14 +89,14 @@ files:
87
89
  - src/test/resources/org/embulk/parser/avro/item2.avsc
88
90
  - src/test/resources/org/embulk/parser/avro/items.avro
89
91
  - src/test/resources/org/embulk/parser/avro/items2.avro
90
- - classpath/avro-1.8.2.jar
91
- - classpath/paranamer-2.7.jar
92
- - classpath/xz-1.5.jar
93
- - classpath/commons-compress-1.8.1.jar
94
- - classpath/embulk-parser-avro-0.3.0.jar
95
- - classpath/jackson-core-asl-1.9.13.jar
96
- - classpath/snappy-java-1.1.1.3.jar
97
- - classpath/jackson-mapper-asl-1.9.13.jar
92
+ - classpath/avro-1.10.1.jar
93
+ - classpath/zstd-jni-1.4.5-12.jar
94
+ - classpath/xz-1.8.jar
95
+ - classpath/snappy-java-1.1.8.1.jar
96
+ - classpath/commons-compress-1.20.jar
97
+ - classpath/jackson-core-2.11.3.jar
98
+ - classpath/slf4j-api-1.7.30.jar
99
+ - classpath/embulk-parser-avro-0.4.0.jar
98
100
  homepage: https://github.com/joker1007/embulk-parser-avro
99
101
  licenses:
100
102
  - MIT