embulk-output-parquet 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c834b87845a6520887275fe90bf447820eee265b
4
- data.tar.gz: 6515640ef55fd9d02eeda515d8cbe765431a1dab
3
+ metadata.gz: e731ad9c445bd5adef66ce8c994fa73dca87252e
4
+ data.tar.gz: 392706f907d2dd684f2d2778863ae7dd28e4dfca
5
5
  SHA512:
6
- metadata.gz: 3af4575ecf73aa13157e988db3219e3a79774ba4119966107847091e0ecddd6e611d97413a088c1151f5749decd8fcf757bff4951527099926a234e46efd205d
7
- data.tar.gz: 66348d724ad2bb2835a23c7850a831a09f176baa89fb8cfd02fa0fb7a601874664f749d4e848cd89700f0baa2940ef4ac5f2600d0c47a705d24f21f066715955
6
+ metadata.gz: ed522bf62b23bd0d236c945f4bbffcd3991d86ed93a835bae4739a3be96717e83ed0f584bdd968330ee426a8fd81e3d2cde11090605050bf2d9a3d70d700c603
7
+ data.tar.gz: a3e6429370cfaefc7e777f8ba27912049d9649e51b8574330b957e2495a29640466e68f8173f052f6c881c33564efbc49fb4b301d59ae9f4b21ce7b58039b684
@@ -0,0 +1 @@
1
+ language: java
data/README.md CHANGED
@@ -1,6 +1,5 @@
1
1
  # Parquet output plugin for Embulk
2
2
 
3
-
4
3
  ## Overview
5
4
 
6
5
  * **Plugin type**: output
@@ -16,7 +15,10 @@
16
15
  - **block_size**: A block size of parquet file. (int, default: 134217728(128M))
17
16
  - **page_size**: A page size of parquet file. (int, default: 1048576(1M))
18
17
  - **compression_codec**: Compression codec to use. Available values: UNCOMPRESSED, SNAPPY, GZIP (string, default: UNCOMPRESSED)
19
- - **timezone**: A timezone for timestamp format. (string, default: UTC)
18
+ - **default_timezone**: Time zone of timestamp columns. This can be overridden for each column using `column_options`.
19
+ - **default_timestamp_format**: Format of timestamp columns. This can be overridden for each column using `column_options`.
20
+ - **column_options**: Specifies the time zone and timestamp format for each column. The format of this option is the same as that of the official CSV formatter plugin. See the [document](
21
+ http://www.embulk.org/docs/built-in.html#csv-formatter-plugin).
20
22
 
21
23
  ## Example
22
24
 
@@ -13,19 +13,23 @@ repositories {
13
13
  }
14
14
  configurations {
15
15
  provided
16
+ runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
16
17
  }
17
18
 
18
- version = "0.2.0"
19
+ version = "0.3.0"
19
20
 
20
21
  dependencies {
21
- compile "org.embulk:embulk-core:0.7.4"
22
- provided "org.embulk:embulk-core:0.7.4"
22
+ compile "org.embulk:embulk-core:0.7.10"
23
+ provided "org.embulk:embulk-core:0.7.10"
23
24
 
24
25
  compile "com.twitter:parquet-hadoop:1.5.0"
25
26
  compile "org.apache.hadoop:hadoop-client:2.6.0"
26
27
  compile "org.xerial.snappy:snappy-java:1.1.1.6"
28
+ compile "org.apache.hadoop:hadoop-aws:2.6.0"
27
29
 
28
30
  testCompile "junit:junit:4.+"
31
+ testCompile "org.embulk:embulk-core:0.7.7:tests"
32
+ testCompile "org.embulk:embulk-standards:0.7.7"
29
33
  }
30
34
 
31
35
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -24,9 +24,9 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
24
24
  final Schema schema;
25
25
  RecordConsumer consumer;
26
26
  WriteContext writeContext;
27
- Map<Integer, TimestampFormatter> timestampFormatters;
27
+ TimestampFormatter[] timestampFormatters;
28
28
 
29
- public EmbulkWriteSupport(Schema schema, Map<Integer, TimestampFormatter> timestampFormatters) {
29
+ public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters) {
30
30
  this.schema = schema;
31
31
  this.timestampFormatters = timestampFormatters;
32
32
  }
@@ -112,7 +112,7 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
112
112
  public void timestampColumn(Column column) {
113
113
  if (!record.isNull(column)) {
114
114
  Timestamp t = record.getTimestamp(column);
115
- String formatted = timestampFormatters.get(column.getIndex()).format(t);
115
+ String formatted = timestampFormatters[column.getIndex()].format(t);
116
116
  consumer.addBinary(Binary.fromString(formatted));
117
117
  }
118
118
  }
@@ -1,24 +1,17 @@
1
1
  package org.embulk.output;
2
2
 
3
- import java.io.IOException;
4
- import java.util.List;
5
- import java.util.Map;
6
-
7
3
  import com.google.common.base.Throwables;
8
- import com.google.common.collect.ImmutableBiMap;
9
- import com.google.common.collect.ImmutableMap;
10
4
  import org.apache.hadoop.conf.Configuration;
11
5
  import org.apache.hadoop.fs.LocalFileSystem;
12
6
  import org.apache.hadoop.fs.Path;
13
7
  import org.apache.hadoop.hdfs.DistributedFileSystem;
14
- import org.embulk.config.TaskReport;
15
8
  import org.embulk.config.Config;
16
9
  import org.embulk.config.ConfigDefault;
17
10
  import org.embulk.config.ConfigDiff;
18
11
  import org.embulk.config.ConfigSource;
19
12
  import org.embulk.config.Task;
13
+ import org.embulk.config.TaskReport;
20
14
  import org.embulk.config.TaskSource;
21
- import org.embulk.spi.Column;
22
15
  import org.embulk.spi.Exec;
23
16
  import org.embulk.spi.OutputPlugin;
24
17
  import org.embulk.spi.Page;
@@ -26,42 +19,54 @@ import org.embulk.spi.PageReader;
26
19
  import org.embulk.spi.Schema;
27
20
  import org.embulk.spi.TransactionalPageOutput;
28
21
  import org.embulk.spi.time.TimestampFormatter;
29
- import org.embulk.spi.type.TimestampType;
22
+ import org.embulk.spi.util.Timestamps;
30
23
  import parquet.hadoop.ParquetWriter;
31
24
  import parquet.hadoop.api.WriteSupport;
32
25
  import parquet.hadoop.metadata.CompressionCodecName;
33
26
 
27
+ import java.io.IOException;
28
+ import java.util.List;
29
+ import java.util.Map;
30
+
34
31
  @SuppressWarnings("unused")
35
32
  public class ParquetOutputPlugin
36
33
  implements OutputPlugin
37
34
  {
38
35
  public interface PluginTask
39
- extends Task, TimestampFormatter.FormatterTask
36
+ extends Task, TimestampFormatter.Task
40
37
  {
41
38
  @Config("path_prefix")
42
- public String getPathPrefix();
39
+ String getPathPrefix();
43
40
 
44
41
  @Config("file_ext")
45
42
  @ConfigDefault("\".parquet\"")
46
- public String getFileNameExtension();
43
+ String getFileNameExtension();
47
44
 
48
45
  @Config("sequence_format")
49
46
  @ConfigDefault("\".%03d\"")
50
- public String getSequenceFormat();
47
+ String getSequenceFormat();
51
48
 
52
49
  @Config("block_size")
53
50
  @ConfigDefault("134217728") // 128M
54
- public int getBlockSize();
51
+ int getBlockSize();
55
52
 
56
53
  @Config("page_size")
57
54
  @ConfigDefault("1048576") // 1M
58
- public int getPageSize();
55
+ int getPageSize();
59
56
 
60
57
  @Config("compression_codec")
61
58
  @ConfigDefault("\"UNCOMPRESSED\"")
62
- public String getCompressionCodec();
59
+ String getCompressionCodec();
60
+
61
+ @Config("column_options")
62
+ @ConfigDefault("{}")
63
+ Map<String, TimestampColumnOption> getColumnOptions();
63
64
  }
64
65
 
66
+ public interface TimestampColumnOption
67
+ extends Task, TimestampFormatter.TimestampColumnOption
68
+ { }
69
+
65
70
  public ConfigDiff transaction(ConfigSource config,
66
71
  Schema schema, int processorCount,
67
72
  OutputPlugin.Control control)
@@ -103,26 +108,13 @@ public class ParquetOutputPlugin
103
108
 
104
109
  final PageReader reader = new PageReader(schema);
105
110
 
106
- final Map<Integer, TimestampFormatter> timestampFormatters = newTimestampFormatters(task, schema);
111
+ final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
107
112
  final EmbulkWriteSupport writeSupport = new EmbulkWriteSupport(schema, timestampFormatters);
108
113
  ParquetWriter<PageReader> writer = createParquetWriter(new Path(path), writeSupport, codec, blockSize, pageSize);
109
114
 
110
115
  return new ParquetTransactionalPageOutput(reader, writer);
111
116
  }
112
117
 
113
- private Map<Integer, TimestampFormatter> newTimestampFormatters(
114
- TimestampFormatter.FormatterTask task, Schema schema)
115
- {
116
- ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
117
- for (Column column : schema.getColumns()) {
118
- if (column.getType() instanceof TimestampType) {
119
- TimestampType tt = (TimestampType) column.getType();
120
- builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
121
- }
122
- }
123
- return builder.build();
124
- }
125
-
126
118
  private <T> ParquetWriter<T> createParquetWriter(Path path, WriteSupport<T> writeSupport, CompressionCodecName codec, int blockSize, int pageSize) {
127
119
  ParquetWriter<T> writer = null;
128
120
 
@@ -0,0 +1,37 @@
1
+ package org.embulk.output;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.config.ConfigException;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.spi.Exec;
7
+ import org.junit.Rule;
8
+ import org.junit.Test;
9
+
10
+ import static org.junit.Assert.*;
11
+
12
+ public class ParquetOutputPluginTest {
13
+ @Rule
14
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
15
+
16
+ @Test
17
+ public void checkDefaultValues() {
18
+ ConfigSource config = Exec.newConfigSource()
19
+ .set("path_prefix", "test");
20
+
21
+ ParquetOutputPlugin.PluginTask task = config.loadConfig(ParquetOutputPlugin.PluginTask.class);
22
+ assertEquals(".parquet", task.getFileNameExtension());
23
+ assertEquals(".%03d", task.getSequenceFormat());
24
+ assertEquals(134217728, task.getBlockSize());
25
+ assertEquals(1048576, task.getPageSize());
26
+ assertEquals("UNCOMPRESSED", task.getCompressionCodec());
27
+ }
28
+
29
+ @Test(expected = ConfigException.class)
30
+ public void checkColumnsRequired() {
31
+ ConfigSource config = Exec.newConfigSource();
32
+
33
+ config.loadConfig(ParquetOutputPlugin.PluginTask.class);
34
+ }
35
+
36
+
37
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - OKUNO Akihiro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-09 00:00:00.000000000 Z
11
+ date: 2015-12-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -46,6 +46,7 @@ extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
48
  - .gitignore
49
+ - .travis.yml
49
50
  - LICENSE.txt
50
51
  - README.md
51
52
  - build.gradle
@@ -56,7 +57,7 @@ files:
56
57
  - lib/embulk/output/parquet.rb
57
58
  - src/main/java/org/embulk/output/EmbulkWriteSupport.java
58
59
  - src/main/java/org/embulk/output/ParquetOutputPlugin.java
59
- - src/test/java/org/embulk/output/TestParquetOutputPlugin.java
60
+ - src/test/java/org/embulk/output/ParquetOutputPluginTest.java
60
61
  - classpath/activation-1.1.jar
61
62
  - classpath/apacheds-i18n-2.0.0-M15.jar
62
63
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -64,6 +65,7 @@ files:
64
65
  - classpath/api-util-1.0.0-M20.jar
65
66
  - classpath/asm-3.1.jar
66
67
  - classpath/avro-1.7.4.jar
68
+ - classpath/aws-java-sdk-1.7.4.jar
67
69
  - classpath/commons-beanutils-1.7.0.jar
68
70
  - classpath/commons-cli-1.2.jar
69
71
  - classpath/commons-codec-1.6.jar
@@ -71,6 +73,7 @@ files:
71
73
  - classpath/commons-compress-1.4.1.jar
72
74
  - classpath/commons-configuration-1.6.jar
73
75
  - classpath/commons-digester-1.8.jar
76
+ - classpath/commons-el-1.0.jar
74
77
  - classpath/commons-httpclient-3.1.jar
75
78
  - classpath/commons-io-2.4.jar
76
79
  - classpath/commons-lang-2.6.jar
@@ -80,10 +83,11 @@ files:
80
83
  - classpath/curator-client-2.6.0.jar
81
84
  - classpath/curator-framework-2.6.0.jar
82
85
  - classpath/curator-recipes-2.6.0.jar
83
- - classpath/embulk-output-parquet-0.2.0.jar
86
+ - classpath/embulk-output-parquet-0.3.0.jar
84
87
  - classpath/gson-2.2.4.jar
85
88
  - classpath/hadoop-annotations-2.6.0.jar
86
89
  - classpath/hadoop-auth-2.6.0.jar
90
+ - classpath/hadoop-aws-2.6.0.jar
87
91
  - classpath/hadoop-client-2.6.0.jar
88
92
  - classpath/hadoop-common-2.6.0.jar
89
93
  - classpath/hadoop-hdfs-2.6.0.jar
@@ -104,6 +108,9 @@ files:
104
108
  - classpath/jackson-jaxrs-1.9.13.jar
105
109
  - classpath/jackson-mapper-asl-1.9.13.jar
106
110
  - classpath/jackson-xc-1.9.13.jar
111
+ - classpath/jasper-compiler-5.5.23.jar
112
+ - classpath/jasper-runtime-5.5.23.jar
113
+ - classpath/java-xmlbuilder-0.4.jar
107
114
  - classpath/jaxb-api-2.2.2.jar
108
115
  - classpath/jaxb-impl-2.2.3-1.jar
109
116
  - classpath/jersey-client-1.9.jar
@@ -111,9 +118,14 @@ files:
111
118
  - classpath/jersey-guice-1.9.jar
112
119
  - classpath/jersey-json-1.9.jar
113
120
  - classpath/jersey-server-1.9.jar
121
+ - classpath/jets3t-0.9.0.jar
114
122
  - classpath/jettison-1.1.jar
123
+ - classpath/jetty-6.1.26.jar
115
124
  - classpath/jetty-util-6.1.26.jar
116
125
  - classpath/jline-0.9.94.jar
126
+ - classpath/joda-time-2.9.1.jar
127
+ - classpath/jsch-0.1.42.jar
128
+ - classpath/jsp-api-2.1.jar
117
129
  - classpath/jsr305-1.3.9.jar
118
130
  - classpath/leveldbjni-all-1.8.jar
119
131
  - classpath/log4j-1.2.17.jar
@@ -128,7 +140,6 @@ files:
128
140
  - classpath/parquet-jackson-1.5.0.jar
129
141
  - classpath/protobuf-java-2.5.0.jar
130
142
  - classpath/servlet-api-2.5.jar
131
- - classpath/slf4j-log4j12-1.7.5.jar
132
143
  - classpath/snappy-java-1.1.1.6.jar
133
144
  - classpath/stax-api-1.0-2.jar
134
145
  - classpath/xercesImpl-2.9.1.jar
@@ -1,5 +0,0 @@
1
- package org.embulk.output;
2
-
3
- public class TestParquetOutputPlugin
4
- {
5
- }