embulk-output-parquet 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/README.md +4 -2
- data/build.gradle +7 -3
- data/src/main/java/org/embulk/output/EmbulkWriteSupport.java +3 -3
- data/src/main/java/org/embulk/output/ParquetOutputPlugin.java +22 -30
- data/src/test/java/org/embulk/output/ParquetOutputPluginTest.java +37 -0
- metadata +16 -5
- data/src/test/java/org/embulk/output/TestParquetOutputPlugin.java +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e731ad9c445bd5adef66ce8c994fa73dca87252e
|
4
|
+
data.tar.gz: 392706f907d2dd684f2d2778863ae7dd28e4dfca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed522bf62b23bd0d236c945f4bbffcd3991d86ed93a835bae4739a3be96717e83ed0f584bdd968330ee426a8fd81e3d2cde11090605050bf2d9a3d70d700c603
|
7
|
+
data.tar.gz: a3e6429370cfaefc7e777f8ba27912049d9649e51b8574330b957e2495a29640466e68f8173f052f6c881c33564efbc49fb4b301d59ae9f4b21ce7b58039b684
|
data/.travis.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
language: java
|
data/README.md
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# Parquet output plugin for Embulk
|
2
2
|
|
3
|
-
|
4
3
|
## Overview
|
5
4
|
|
6
5
|
* **Plugin type**: output
|
@@ -16,7 +15,10 @@
|
|
16
15
|
- **block_size**: A block size of parquet file. (int, default: 134217728(128M))
|
17
16
|
- **page_size**: A page size of parquet file. (int, default: 1048576(1M))
|
18
17
|
- **compression_codec**: A compression codec. available: UNCOMPRESSED, SNAPPY, GZIP (string, default: UNCOMPRESSED)
|
19
|
-
- **
|
18
|
+
- **default_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options
|
19
|
+
- **default_timestamp_format**: Format of timestamp columns. This can be overwritten for each column using column_options
|
20
|
+
- **column_options**: Specify timezone and timestamp format for each column. Format of this option is the same as the official csv formatter. See [document](
|
21
|
+
http://www.embulk.org/docs/built-in.html#csv-formatter-plugin).
|
20
22
|
|
21
23
|
## Example
|
22
24
|
|
data/build.gradle
CHANGED
@@ -13,19 +13,23 @@ repositories {
|
|
13
13
|
}
|
14
14
|
configurations {
|
15
15
|
provided
|
16
|
+
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
16
17
|
}
|
17
18
|
|
18
|
-
version = "0.2.0"
|
19
|
+
version = "0.3.0"
|
19
20
|
|
20
21
|
dependencies {
|
21
|
-
compile "org.embulk:embulk-core:0.7.
|
22
|
-
provided "org.embulk:embulk-core:0.7.
|
22
|
+
compile "org.embulk:embulk-core:0.7.10"
|
23
|
+
provided "org.embulk:embulk-core:0.7.10"
|
23
24
|
|
24
25
|
compile "com.twitter:parquet-hadoop:1.5.0"
|
25
26
|
compile "org.apache.hadoop:hadoop-client:2.6.0"
|
26
27
|
compile "org.xerial.snappy:snappy-java:1.1.1.6"
|
28
|
+
compile "org.apache.hadoop:hadoop-aws:2.6.0"
|
27
29
|
|
28
30
|
testCompile "junit:junit:4.+"
|
31
|
+
testCompile "org.embulk:embulk-core:0.7.7:tests"
|
32
|
+
testCompile "org.embulk:embulk-standards:0.7.7"
|
29
33
|
}
|
30
34
|
|
31
35
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
@@ -24,9 +24,9 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
|
|
24
24
|
final Schema schema;
|
25
25
|
RecordConsumer consumer;
|
26
26
|
WriteContext writeContext;
|
27
|
-
|
27
|
+
TimestampFormatter[] timestampFormatters;
|
28
28
|
|
29
|
-
public EmbulkWriteSupport(Schema schema, Map<Integer, TimestampFormatter> timestampFormatters) {
|
29
|
+
public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters) {
|
30
30
|
this.schema = schema;
|
31
31
|
this.timestampFormatters = timestampFormatters;
|
32
32
|
}
|
@@ -112,7 +112,7 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
|
|
112
112
|
public void timestampColumn(Column column) {
|
113
113
|
if (!record.isNull(column)) {
|
114
114
|
Timestamp t = record.getTimestamp(column);
|
115
|
-
String formatted = timestampFormatters.get(column.getIndex()).format(t);
|
115
|
+
String formatted = timestampFormatters[column.getIndex()].format(t);
|
116
116
|
consumer.addBinary(Binary.fromString(formatted));
|
117
117
|
}
|
118
118
|
}
|
@@ -1,24 +1,17 @@
|
|
1
1
|
package org.embulk.output;
|
2
2
|
|
3
|
-
import java.io.IOException;
|
4
|
-
import java.util.List;
|
5
|
-
import java.util.Map;
|
6
|
-
|
7
3
|
import com.google.common.base.Throwables;
|
8
|
-
import com.google.common.collect.ImmutableBiMap;
|
9
|
-
import com.google.common.collect.ImmutableMap;
|
10
4
|
import org.apache.hadoop.conf.Configuration;
|
11
5
|
import org.apache.hadoop.fs.LocalFileSystem;
|
12
6
|
import org.apache.hadoop.fs.Path;
|
13
7
|
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
14
|
-
import org.embulk.config.TaskReport;
|
15
8
|
import org.embulk.config.Config;
|
16
9
|
import org.embulk.config.ConfigDefault;
|
17
10
|
import org.embulk.config.ConfigDiff;
|
18
11
|
import org.embulk.config.ConfigSource;
|
19
12
|
import org.embulk.config.Task;
|
13
|
+
import org.embulk.config.TaskReport;
|
20
14
|
import org.embulk.config.TaskSource;
|
21
|
-
import org.embulk.spi.Column;
|
22
15
|
import org.embulk.spi.Exec;
|
23
16
|
import org.embulk.spi.OutputPlugin;
|
24
17
|
import org.embulk.spi.Page;
|
@@ -26,42 +19,54 @@ import org.embulk.spi.PageReader;
|
|
26
19
|
import org.embulk.spi.Schema;
|
27
20
|
import org.embulk.spi.TransactionalPageOutput;
|
28
21
|
import org.embulk.spi.time.TimestampFormatter;
|
29
|
-
import org.embulk.spi.type.TimestampType;
|
22
|
+
import org.embulk.spi.util.Timestamps;
|
30
23
|
import parquet.hadoop.ParquetWriter;
|
31
24
|
import parquet.hadoop.api.WriteSupport;
|
32
25
|
import parquet.hadoop.metadata.CompressionCodecName;
|
33
26
|
|
27
|
+
import java.io.IOException;
|
28
|
+
import java.util.List;
|
29
|
+
import java.util.Map;
|
30
|
+
|
34
31
|
@SuppressWarnings("unused")
|
35
32
|
public class ParquetOutputPlugin
|
36
33
|
implements OutputPlugin
|
37
34
|
{
|
38
35
|
public interface PluginTask
|
39
|
-
extends Task, TimestampFormatter.FormatterTask
|
36
|
+
extends Task, TimestampFormatter.Task
|
40
37
|
{
|
41
38
|
@Config("path_prefix")
|
42
|
-
|
39
|
+
String getPathPrefix();
|
43
40
|
|
44
41
|
@Config("file_ext")
|
45
42
|
@ConfigDefault("\".parquet\"")
|
46
|
-
|
43
|
+
String getFileNameExtension();
|
47
44
|
|
48
45
|
@Config("sequence_format")
|
49
46
|
@ConfigDefault("\".%03d\"")
|
50
|
-
|
47
|
+
String getSequenceFormat();
|
51
48
|
|
52
49
|
@Config("block_size")
|
53
50
|
@ConfigDefault("134217728") // 128M
|
54
|
-
|
51
|
+
int getBlockSize();
|
55
52
|
|
56
53
|
@Config("page_size")
|
57
54
|
@ConfigDefault("1048576") // 1M
|
58
|
-
|
55
|
+
int getPageSize();
|
59
56
|
|
60
57
|
@Config("compression_codec")
|
61
58
|
@ConfigDefault("\"UNCOMPRESSED\"")
|
62
|
-
|
59
|
+
String getCompressionCodec();
|
60
|
+
|
61
|
+
@Config("column_options")
|
62
|
+
@ConfigDefault("{}")
|
63
|
+
Map<String, TimestampColumnOption> getColumnOptions();
|
63
64
|
}
|
64
65
|
|
66
|
+
public interface TimestampColumnOption
|
67
|
+
extends Task, TimestampFormatter.TimestampColumnOption
|
68
|
+
{ }
|
69
|
+
|
65
70
|
public ConfigDiff transaction(ConfigSource config,
|
66
71
|
Schema schema, int processorCount,
|
67
72
|
OutputPlugin.Control control)
|
@@ -103,26 +108,13 @@ public class ParquetOutputPlugin
|
|
103
108
|
|
104
109
|
final PageReader reader = new PageReader(schema);
|
105
110
|
|
106
|
-
final Map<Integer, TimestampFormatter> timestampFormatters = newTimestampFormatters(task, schema);
|
111
|
+
final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
|
107
112
|
final EmbulkWriteSupport writeSupport = new EmbulkWriteSupport(schema, timestampFormatters);
|
108
113
|
ParquetWriter<PageReader> writer = createParquetWriter(new Path(path), writeSupport, codec, blockSize, pageSize);
|
109
114
|
|
110
115
|
return new ParquetTransactionalPageOutput(reader, writer);
|
111
116
|
}
|
112
117
|
|
113
|
-
private Map<Integer, TimestampFormatter> newTimestampFormatters(
|
114
|
-
TimestampFormatter.FormatterTask task, Schema schema)
|
115
|
-
{
|
116
|
-
ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
|
117
|
-
for (Column column : schema.getColumns()) {
|
118
|
-
if (column.getType() instanceof TimestampType) {
|
119
|
-
TimestampType tt = (TimestampType) column.getType();
|
120
|
-
builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
|
121
|
-
}
|
122
|
-
}
|
123
|
-
return builder.build();
|
124
|
-
}
|
125
|
-
|
126
118
|
private <T> ParquetWriter<T> createParquetWriter(Path path, WriteSupport<T> writeSupport, CompressionCodecName codec, int blockSize, int pageSize) {
|
127
119
|
ParquetWriter<T> writer = null;
|
128
120
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
package org.embulk.output;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigException;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.junit.Rule;
|
8
|
+
import org.junit.Test;
|
9
|
+
|
10
|
+
import static org.junit.Assert.*;
|
11
|
+
|
12
|
+
public class ParquetOutputPluginTest {
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
@Test
|
17
|
+
public void checkDefaultValues() {
|
18
|
+
ConfigSource config = Exec.newConfigSource()
|
19
|
+
.set("path_prefix", "test");
|
20
|
+
|
21
|
+
ParquetOutputPlugin.PluginTask task = config.loadConfig(ParquetOutputPlugin.PluginTask.class);
|
22
|
+
assertEquals(".parquet", task.getFileNameExtension());
|
23
|
+
assertEquals(".%03d", task.getSequenceFormat());
|
24
|
+
assertEquals(134217728, task.getBlockSize());
|
25
|
+
assertEquals(1048576, task.getPageSize());
|
26
|
+
assertEquals("UNCOMPRESSED", task.getCompressionCodec());
|
27
|
+
}
|
28
|
+
|
29
|
+
@Test(expected = ConfigException.class)
|
30
|
+
public void checkColumnsRequired() {
|
31
|
+
ConfigSource config = Exec.newConfigSource();
|
32
|
+
|
33
|
+
config.loadConfig(ParquetOutputPlugin.PluginTask.class);
|
34
|
+
}
|
35
|
+
|
36
|
+
|
37
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OKUNO Akihiro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -46,6 +46,7 @@ extensions: []
|
|
46
46
|
extra_rdoc_files: []
|
47
47
|
files:
|
48
48
|
- .gitignore
|
49
|
+
- .travis.yml
|
49
50
|
- LICENSE.txt
|
50
51
|
- README.md
|
51
52
|
- build.gradle
|
@@ -56,7 +57,7 @@ files:
|
|
56
57
|
- lib/embulk/output/parquet.rb
|
57
58
|
- src/main/java/org/embulk/output/EmbulkWriteSupport.java
|
58
59
|
- src/main/java/org/embulk/output/ParquetOutputPlugin.java
|
59
|
-
- src/test/java/org/embulk/output/TestParquetOutputPlugin.java
|
60
|
+
- src/test/java/org/embulk/output/ParquetOutputPluginTest.java
|
60
61
|
- classpath/activation-1.1.jar
|
61
62
|
- classpath/apacheds-i18n-2.0.0-M15.jar
|
62
63
|
- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
|
@@ -64,6 +65,7 @@ files:
|
|
64
65
|
- classpath/api-util-1.0.0-M20.jar
|
65
66
|
- classpath/asm-3.1.jar
|
66
67
|
- classpath/avro-1.7.4.jar
|
68
|
+
- classpath/aws-java-sdk-1.7.4.jar
|
67
69
|
- classpath/commons-beanutils-1.7.0.jar
|
68
70
|
- classpath/commons-cli-1.2.jar
|
69
71
|
- classpath/commons-codec-1.6.jar
|
@@ -71,6 +73,7 @@ files:
|
|
71
73
|
- classpath/commons-compress-1.4.1.jar
|
72
74
|
- classpath/commons-configuration-1.6.jar
|
73
75
|
- classpath/commons-digester-1.8.jar
|
76
|
+
- classpath/commons-el-1.0.jar
|
74
77
|
- classpath/commons-httpclient-3.1.jar
|
75
78
|
- classpath/commons-io-2.4.jar
|
76
79
|
- classpath/commons-lang-2.6.jar
|
@@ -80,10 +83,11 @@ files:
|
|
80
83
|
- classpath/curator-client-2.6.0.jar
|
81
84
|
- classpath/curator-framework-2.6.0.jar
|
82
85
|
- classpath/curator-recipes-2.6.0.jar
|
83
|
-
- classpath/embulk-output-parquet-0.2.0.jar
|
86
|
+
- classpath/embulk-output-parquet-0.3.0.jar
|
84
87
|
- classpath/gson-2.2.4.jar
|
85
88
|
- classpath/hadoop-annotations-2.6.0.jar
|
86
89
|
- classpath/hadoop-auth-2.6.0.jar
|
90
|
+
- classpath/hadoop-aws-2.6.0.jar
|
87
91
|
- classpath/hadoop-client-2.6.0.jar
|
88
92
|
- classpath/hadoop-common-2.6.0.jar
|
89
93
|
- classpath/hadoop-hdfs-2.6.0.jar
|
@@ -104,6 +108,9 @@ files:
|
|
104
108
|
- classpath/jackson-jaxrs-1.9.13.jar
|
105
109
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
106
110
|
- classpath/jackson-xc-1.9.13.jar
|
111
|
+
- classpath/jasper-compiler-5.5.23.jar
|
112
|
+
- classpath/jasper-runtime-5.5.23.jar
|
113
|
+
- classpath/java-xmlbuilder-0.4.jar
|
107
114
|
- classpath/jaxb-api-2.2.2.jar
|
108
115
|
- classpath/jaxb-impl-2.2.3-1.jar
|
109
116
|
- classpath/jersey-client-1.9.jar
|
@@ -111,9 +118,14 @@ files:
|
|
111
118
|
- classpath/jersey-guice-1.9.jar
|
112
119
|
- classpath/jersey-json-1.9.jar
|
113
120
|
- classpath/jersey-server-1.9.jar
|
121
|
+
- classpath/jets3t-0.9.0.jar
|
114
122
|
- classpath/jettison-1.1.jar
|
123
|
+
- classpath/jetty-6.1.26.jar
|
115
124
|
- classpath/jetty-util-6.1.26.jar
|
116
125
|
- classpath/jline-0.9.94.jar
|
126
|
+
- classpath/joda-time-2.9.1.jar
|
127
|
+
- classpath/jsch-0.1.42.jar
|
128
|
+
- classpath/jsp-api-2.1.jar
|
117
129
|
- classpath/jsr305-1.3.9.jar
|
118
130
|
- classpath/leveldbjni-all-1.8.jar
|
119
131
|
- classpath/log4j-1.2.17.jar
|
@@ -128,7 +140,6 @@ files:
|
|
128
140
|
- classpath/parquet-jackson-1.5.0.jar
|
129
141
|
- classpath/protobuf-java-2.5.0.jar
|
130
142
|
- classpath/servlet-api-2.5.jar
|
131
|
-
- classpath/slf4j-log4j12-1.7.5.jar
|
132
143
|
- classpath/snappy-java-1.1.1.6.jar
|
133
144
|
- classpath/stax-api-1.0-2.jar
|
134
145
|
- classpath/xercesImpl-2.9.1.jar
|