embulk-output-parquet 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/README.md +4 -2
- data/build.gradle +7 -3
- data/src/main/java/org/embulk/output/EmbulkWriteSupport.java +3 -3
- data/src/main/java/org/embulk/output/ParquetOutputPlugin.java +22 -30
- data/src/test/java/org/embulk/output/ParquetOutputPluginTest.java +37 -0
- metadata +16 -5
- data/src/test/java/org/embulk/output/TestParquetOutputPlugin.java +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e731ad9c445bd5adef66ce8c994fa73dca87252e
|
4
|
+
data.tar.gz: 392706f907d2dd684f2d2778863ae7dd28e4dfca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed522bf62b23bd0d236c945f4bbffcd3991d86ed93a835bae4739a3be96717e83ed0f584bdd968330ee426a8fd81e3d2cde11090605050bf2d9a3d70d700c603
|
7
|
+
data.tar.gz: a3e6429370cfaefc7e777f8ba27912049d9649e51b8574330b957e2495a29640466e68f8173f052f6c881c33564efbc49fb4b301d59ae9f4b21ce7b58039b684
|
data/.travis.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
language: java
|
data/README.md
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# Parquet output plugin for Embulk
|
2
2
|
|
3
|
-
|
4
3
|
## Overview
|
5
4
|
|
6
5
|
* **Plugin type**: output
|
@@ -16,7 +15,10 @@
|
|
16
15
|
- **block_size**: A block size of parquet file. (int, default: 134217728(128M))
|
17
16
|
- **page_size**: A page size of parquet file. (int, default: 1048576(1M))
|
18
17
|
- **compression_codec**: A compression codec. available: UNCOMPRESSED, SNAPPY, GZIP (string, default: UNCOMPRESSED)
|
19
|
-
- **
|
18
|
+
- **default_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options
|
19
|
+
- **default_timestamp_format**: Format of timestamp columns. This can be overwritten for each column using column_options
|
20
|
+
- **column_options**: Specify timezone and timestamp format for each column. Format of this option is the same as the official csv formatter. See [document](
|
21
|
+
http://www.embulk.org/docs/built-in.html#csv-formatter-plugin).
|
20
22
|
|
21
23
|
## Example
|
22
24
|
|
data/build.gradle
CHANGED
@@ -13,19 +13,23 @@ repositories {
|
|
13
13
|
}
|
14
14
|
configurations {
|
15
15
|
provided
|
16
|
+
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
16
17
|
}
|
17
18
|
|
18
|
-
version = "0.2.0"
|
19
|
+
version = "0.3.0"
|
19
20
|
|
20
21
|
dependencies {
|
21
|
-
compile "org.embulk:embulk-core:0.7.
|
22
|
-
provided "org.embulk:embulk-core:0.7.
|
22
|
+
compile "org.embulk:embulk-core:0.7.10"
|
23
|
+
provided "org.embulk:embulk-core:0.7.10"
|
23
24
|
|
24
25
|
compile "com.twitter:parquet-hadoop:1.5.0"
|
25
26
|
compile "org.apache.hadoop:hadoop-client:2.6.0"
|
26
27
|
compile "org.xerial.snappy:snappy-java:1.1.1.6"
|
28
|
+
compile "org.apache.hadoop:hadoop-aws:2.6.0"
|
27
29
|
|
28
30
|
testCompile "junit:junit:4.+"
|
31
|
+
testCompile "org.embulk:embulk-core:0.7.7:tests"
|
32
|
+
testCompile "org.embulk:embulk-standards:0.7.7"
|
29
33
|
}
|
30
34
|
|
31
35
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
data/src/main/java/org/embulk/output/EmbulkWriteSupport.java
CHANGED
@@ -24,9 +24,9 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
|
|
24
24
|
final Schema schema;
|
25
25
|
RecordConsumer consumer;
|
26
26
|
WriteContext writeContext;
|
27
|
-
|
27
|
+
TimestampFormatter[] timestampFormatters;
|
28
28
|
|
29
|
-
public EmbulkWriteSupport(Schema schema,
|
29
|
+
public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters) {
|
30
30
|
this.schema = schema;
|
31
31
|
this.timestampFormatters = timestampFormatters;
|
32
32
|
}
|
@@ -112,7 +112,7 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
|
|
112
112
|
public void timestampColumn(Column column) {
|
113
113
|
if (!record.isNull(column)) {
|
114
114
|
Timestamp t = record.getTimestamp(column);
|
115
|
-
String formatted = timestampFormatters
|
115
|
+
String formatted = timestampFormatters[column.getIndex()].format(t);
|
116
116
|
consumer.addBinary(Binary.fromString(formatted));
|
117
117
|
}
|
118
118
|
}
|
data/src/main/java/org/embulk/output/ParquetOutputPlugin.java
CHANGED
@@ -1,24 +1,17 @@
|
|
1
1
|
package org.embulk.output;
|
2
2
|
|
3
|
-
import java.io.IOException;
|
4
|
-
import java.util.List;
|
5
|
-
import java.util.Map;
|
6
|
-
|
7
3
|
import com.google.common.base.Throwables;
|
8
|
-
import com.google.common.collect.ImmutableBiMap;
|
9
|
-
import com.google.common.collect.ImmutableMap;
|
10
4
|
import org.apache.hadoop.conf.Configuration;
|
11
5
|
import org.apache.hadoop.fs.LocalFileSystem;
|
12
6
|
import org.apache.hadoop.fs.Path;
|
13
7
|
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
14
|
-
import org.embulk.config.TaskReport;
|
15
8
|
import org.embulk.config.Config;
|
16
9
|
import org.embulk.config.ConfigDefault;
|
17
10
|
import org.embulk.config.ConfigDiff;
|
18
11
|
import org.embulk.config.ConfigSource;
|
19
12
|
import org.embulk.config.Task;
|
13
|
+
import org.embulk.config.TaskReport;
|
20
14
|
import org.embulk.config.TaskSource;
|
21
|
-
import org.embulk.spi.Column;
|
22
15
|
import org.embulk.spi.Exec;
|
23
16
|
import org.embulk.spi.OutputPlugin;
|
24
17
|
import org.embulk.spi.Page;
|
@@ -26,42 +19,54 @@ import org.embulk.spi.PageReader;
|
|
26
19
|
import org.embulk.spi.Schema;
|
27
20
|
import org.embulk.spi.TransactionalPageOutput;
|
28
21
|
import org.embulk.spi.time.TimestampFormatter;
|
29
|
-
import org.embulk.spi.
|
22
|
+
import org.embulk.spi.util.Timestamps;
|
30
23
|
import parquet.hadoop.ParquetWriter;
|
31
24
|
import parquet.hadoop.api.WriteSupport;
|
32
25
|
import parquet.hadoop.metadata.CompressionCodecName;
|
33
26
|
|
27
|
+
import java.io.IOException;
|
28
|
+
import java.util.List;
|
29
|
+
import java.util.Map;
|
30
|
+
|
34
31
|
@SuppressWarnings("unused")
|
35
32
|
public class ParquetOutputPlugin
|
36
33
|
implements OutputPlugin
|
37
34
|
{
|
38
35
|
public interface PluginTask
|
39
|
-
extends Task, TimestampFormatter.
|
36
|
+
extends Task, TimestampFormatter.Task
|
40
37
|
{
|
41
38
|
@Config("path_prefix")
|
42
|
-
|
39
|
+
String getPathPrefix();
|
43
40
|
|
44
41
|
@Config("file_ext")
|
45
42
|
@ConfigDefault("\".parquet\"")
|
46
|
-
|
43
|
+
String getFileNameExtension();
|
47
44
|
|
48
45
|
@Config("sequence_format")
|
49
46
|
@ConfigDefault("\".%03d\"")
|
50
|
-
|
47
|
+
String getSequenceFormat();
|
51
48
|
|
52
49
|
@Config("block_size")
|
53
50
|
@ConfigDefault("134217728") // 128M
|
54
|
-
|
51
|
+
int getBlockSize();
|
55
52
|
|
56
53
|
@Config("page_size")
|
57
54
|
@ConfigDefault("1048576") // 1M
|
58
|
-
|
55
|
+
int getPageSize();
|
59
56
|
|
60
57
|
@Config("compression_codec")
|
61
58
|
@ConfigDefault("\"UNCOMPRESSED\"")
|
62
|
-
|
59
|
+
String getCompressionCodec();
|
60
|
+
|
61
|
+
@Config("column_options")
|
62
|
+
@ConfigDefault("{}")
|
63
|
+
Map<String, TimestampColumnOption> getColumnOptions();
|
63
64
|
}
|
64
65
|
|
66
|
+
public interface TimestampColumnOption
|
67
|
+
extends Task, TimestampFormatter.TimestampColumnOption
|
68
|
+
{ }
|
69
|
+
|
65
70
|
public ConfigDiff transaction(ConfigSource config,
|
66
71
|
Schema schema, int processorCount,
|
67
72
|
OutputPlugin.Control control)
|
@@ -103,26 +108,13 @@ public class ParquetOutputPlugin
|
|
103
108
|
|
104
109
|
final PageReader reader = new PageReader(schema);
|
105
110
|
|
106
|
-
final
|
111
|
+
final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
|
107
112
|
final EmbulkWriteSupport writeSupport = new EmbulkWriteSupport(schema, timestampFormatters);
|
108
113
|
ParquetWriter<PageReader> writer = createParquetWriter(new Path(path), writeSupport, codec, blockSize, pageSize);
|
109
114
|
|
110
115
|
return new ParquetTransactionalPageOutput(reader, writer);
|
111
116
|
}
|
112
117
|
|
113
|
-
private Map<Integer, TimestampFormatter> newTimestampFormatters(
|
114
|
-
TimestampFormatter.FormatterTask task, Schema schema)
|
115
|
-
{
|
116
|
-
ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
|
117
|
-
for (Column column : schema.getColumns()) {
|
118
|
-
if (column.getType() instanceof TimestampType) {
|
119
|
-
TimestampType tt = (TimestampType) column.getType();
|
120
|
-
builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
|
121
|
-
}
|
122
|
-
}
|
123
|
-
return builder.build();
|
124
|
-
}
|
125
|
-
|
126
118
|
private <T> ParquetWriter<T> createParquetWriter(Path path, WriteSupport<T> writeSupport, CompressionCodecName codec, int blockSize, int pageSize) {
|
127
119
|
ParquetWriter<T> writer = null;
|
128
120
|
|
data/src/test/java/org/embulk/output/ParquetOutputPluginTest.java
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
package org.embulk.output;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigException;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.junit.Rule;
|
8
|
+
import org.junit.Test;
|
9
|
+
|
10
|
+
import static org.junit.Assert.*;
|
11
|
+
|
12
|
+
public class ParquetOutputPluginTest {
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
@Test
|
17
|
+
public void checkDefaultValues() {
|
18
|
+
ConfigSource config = Exec.newConfigSource()
|
19
|
+
.set("path_prefix", "test");
|
20
|
+
|
21
|
+
ParquetOutputPlugin.PluginTask task = config.loadConfig(ParquetOutputPlugin.PluginTask.class);
|
22
|
+
assertEquals(".parquet", task.getFileNameExtension());
|
23
|
+
assertEquals(".%03d", task.getSequenceFormat());
|
24
|
+
assertEquals(134217728, task.getBlockSize());
|
25
|
+
assertEquals(1048576, task.getPageSize());
|
26
|
+
assertEquals("UNCOMPRESSED", task.getCompressionCodec());
|
27
|
+
}
|
28
|
+
|
29
|
+
@Test(expected = ConfigException.class)
|
30
|
+
public void checkColumnsRequired() {
|
31
|
+
ConfigSource config = Exec.newConfigSource();
|
32
|
+
|
33
|
+
config.loadConfig(ParquetOutputPlugin.PluginTask.class);
|
34
|
+
}
|
35
|
+
|
36
|
+
|
37
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OKUNO Akihiro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -46,6 +46,7 @@ extensions: []
|
|
46
46
|
extra_rdoc_files: []
|
47
47
|
files:
|
48
48
|
- .gitignore
|
49
|
+
- .travis.yml
|
49
50
|
- LICENSE.txt
|
50
51
|
- README.md
|
51
52
|
- build.gradle
|
@@ -56,7 +57,7 @@ files:
|
|
56
57
|
- lib/embulk/output/parquet.rb
|
57
58
|
- src/main/java/org/embulk/output/EmbulkWriteSupport.java
|
58
59
|
- src/main/java/org/embulk/output/ParquetOutputPlugin.java
|
59
|
-
- src/test/java/org/embulk/output/TestParquetOutputPlugin.java
|
60
|
+
- src/test/java/org/embulk/output/ParquetOutputPluginTest.java
|
60
61
|
- classpath/activation-1.1.jar
|
61
62
|
- classpath/apacheds-i18n-2.0.0-M15.jar
|
62
63
|
- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
|
@@ -64,6 +65,7 @@ files:
|
|
64
65
|
- classpath/api-util-1.0.0-M20.jar
|
65
66
|
- classpath/asm-3.1.jar
|
66
67
|
- classpath/avro-1.7.4.jar
|
68
|
+
- classpath/aws-java-sdk-1.7.4.jar
|
67
69
|
- classpath/commons-beanutils-1.7.0.jar
|
68
70
|
- classpath/commons-cli-1.2.jar
|
69
71
|
- classpath/commons-codec-1.6.jar
|
@@ -71,6 +73,7 @@ files:
|
|
71
73
|
- classpath/commons-compress-1.4.1.jar
|
72
74
|
- classpath/commons-configuration-1.6.jar
|
73
75
|
- classpath/commons-digester-1.8.jar
|
76
|
+
- classpath/commons-el-1.0.jar
|
74
77
|
- classpath/commons-httpclient-3.1.jar
|
75
78
|
- classpath/commons-io-2.4.jar
|
76
79
|
- classpath/commons-lang-2.6.jar
|
@@ -80,10 +83,11 @@ files:
|
|
80
83
|
- classpath/curator-client-2.6.0.jar
|
81
84
|
- classpath/curator-framework-2.6.0.jar
|
82
85
|
- classpath/curator-recipes-2.6.0.jar
|
83
|
-
- classpath/embulk-output-parquet-0.2.0.jar
|
86
|
+
- classpath/embulk-output-parquet-0.3.0.jar
|
84
87
|
- classpath/gson-2.2.4.jar
|
85
88
|
- classpath/hadoop-annotations-2.6.0.jar
|
86
89
|
- classpath/hadoop-auth-2.6.0.jar
|
90
|
+
- classpath/hadoop-aws-2.6.0.jar
|
87
91
|
- classpath/hadoop-client-2.6.0.jar
|
88
92
|
- classpath/hadoop-common-2.6.0.jar
|
89
93
|
- classpath/hadoop-hdfs-2.6.0.jar
|
@@ -104,6 +108,9 @@ files:
|
|
104
108
|
- classpath/jackson-jaxrs-1.9.13.jar
|
105
109
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
106
110
|
- classpath/jackson-xc-1.9.13.jar
|
111
|
+
- classpath/jasper-compiler-5.5.23.jar
|
112
|
+
- classpath/jasper-runtime-5.5.23.jar
|
113
|
+
- classpath/java-xmlbuilder-0.4.jar
|
107
114
|
- classpath/jaxb-api-2.2.2.jar
|
108
115
|
- classpath/jaxb-impl-2.2.3-1.jar
|
109
116
|
- classpath/jersey-client-1.9.jar
|
@@ -111,9 +118,14 @@ files:
|
|
111
118
|
- classpath/jersey-guice-1.9.jar
|
112
119
|
- classpath/jersey-json-1.9.jar
|
113
120
|
- classpath/jersey-server-1.9.jar
|
121
|
+
- classpath/jets3t-0.9.0.jar
|
114
122
|
- classpath/jettison-1.1.jar
|
123
|
+
- classpath/jetty-6.1.26.jar
|
115
124
|
- classpath/jetty-util-6.1.26.jar
|
116
125
|
- classpath/jline-0.9.94.jar
|
126
|
+
- classpath/joda-time-2.9.1.jar
|
127
|
+
- classpath/jsch-0.1.42.jar
|
128
|
+
- classpath/jsp-api-2.1.jar
|
117
129
|
- classpath/jsr305-1.3.9.jar
|
118
130
|
- classpath/leveldbjni-all-1.8.jar
|
119
131
|
- classpath/log4j-1.2.17.jar
|
@@ -128,7 +140,6 @@ files:
|
|
128
140
|
- classpath/parquet-jackson-1.5.0.jar
|
129
141
|
- classpath/protobuf-java-2.5.0.jar
|
130
142
|
- classpath/servlet-api-2.5.jar
|
131
|
-
- classpath/slf4j-log4j12-1.7.5.jar
|
132
143
|
- classpath/snappy-java-1.1.1.6.jar
|
133
144
|
- classpath/stax-api-1.0-2.jar
|
134
145
|
- classpath/xercesImpl-2.9.1.jar
|