embulk-output-parquet 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c834b87845a6520887275fe90bf447820eee265b
4
- data.tar.gz: 6515640ef55fd9d02eeda515d8cbe765431a1dab
3
+ metadata.gz: e731ad9c445bd5adef66ce8c994fa73dca87252e
4
+ data.tar.gz: 392706f907d2dd684f2d2778863ae7dd28e4dfca
5
5
  SHA512:
6
- metadata.gz: 3af4575ecf73aa13157e988db3219e3a79774ba4119966107847091e0ecddd6e611d97413a088c1151f5749decd8fcf757bff4951527099926a234e46efd205d
7
- data.tar.gz: 66348d724ad2bb2835a23c7850a831a09f176baa89fb8cfd02fa0fb7a601874664f749d4e848cd89700f0baa2940ef4ac5f2600d0c47a705d24f21f066715955
6
+ metadata.gz: ed522bf62b23bd0d236c945f4bbffcd3991d86ed93a835bae4739a3be96717e83ed0f584bdd968330ee426a8fd81e3d2cde11090605050bf2d9a3d70d700c603
7
+ data.tar.gz: a3e6429370cfaefc7e777f8ba27912049d9649e51b8574330b957e2495a29640466e68f8173f052f6c881c33564efbc49fb4b301d59ae9f4b21ce7b58039b684
@@ -0,0 +1 @@
1
+ language: java
data/README.md CHANGED
@@ -1,6 +1,5 @@
1
1
  # Parquet output plugin for Embulk
2
2
 
3
-
4
3
  ## Overview
5
4
 
6
5
  * **Plugin type**: output
@@ -16,7 +15,10 @@
16
15
  - **block_size**: A block size of parquet file. (int, default: 134217728(128M))
17
16
  - **page_size**: A page size of parquet file. (int, default: 1048576(1M))
18
17
  - **compression_codec**: A compression codec. available: UNCOMPRESSED, SNAPPY, GZIP (string, default: UNCOMPRESSED)
19
- - **timezone**: A timezone for timestamp format. (string, default: UTC)
18
+ - **default_timezone**: Time zone of timestamp columns. This can be overridden for each column using column_options
19
+ - **default_timestamp_format**: Format of timestamp columns. This can be overridden for each column using column_options
20
+ - **column_options**: Specify timezone and timestamp format for each column. Format of this option is the same as the official csv formatter. See [document](
21
+ http://www.embulk.org/docs/built-in.html#csv-formatter-plugin).
20
22
 
21
23
  ## Example
22
24
 
@@ -13,19 +13,23 @@ repositories {
13
13
  }
14
14
  configurations {
15
15
  provided
16
+ runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
16
17
  }
17
18
 
18
- version = "0.2.0"
19
+ version = "0.3.0"
19
20
 
20
21
  dependencies {
21
- compile "org.embulk:embulk-core:0.7.4"
22
- provided "org.embulk:embulk-core:0.7.4"
22
+ compile "org.embulk:embulk-core:0.7.10"
23
+ provided "org.embulk:embulk-core:0.7.10"
23
24
 
24
25
  compile "com.twitter:parquet-hadoop:1.5.0"
25
26
  compile "org.apache.hadoop:hadoop-client:2.6.0"
26
27
  compile "org.xerial.snappy:snappy-java:1.1.1.6"
28
+ compile "org.apache.hadoop:hadoop-aws:2.6.0"
27
29
 
28
30
  testCompile "junit:junit:4.+"
31
+ testCompile "org.embulk:embulk-core:0.7.7:tests"
32
+ testCompile "org.embulk:embulk-standards:0.7.7"
29
33
  }
30
34
 
31
35
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -24,9 +24,9 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
24
24
  final Schema schema;
25
25
  RecordConsumer consumer;
26
26
  WriteContext writeContext;
27
- Map<Integer, TimestampFormatter> timestampFormatters;
27
+ TimestampFormatter[] timestampFormatters;
28
28
 
29
- public EmbulkWriteSupport(Schema schema, Map<Integer, TimestampFormatter> timestampFormatters) {
29
+ public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters) {
30
30
  this.schema = schema;
31
31
  this.timestampFormatters = timestampFormatters;
32
32
  }
@@ -112,7 +112,7 @@ public class EmbulkWriteSupport extends WriteSupport<PageReader> {
112
112
  public void timestampColumn(Column column) {
113
113
  if (!record.isNull(column)) {
114
114
  Timestamp t = record.getTimestamp(column);
115
- String formatted = timestampFormatters.get(column.getIndex()).format(t);
115
+ String formatted = timestampFormatters[column.getIndex()].format(t);
116
116
  consumer.addBinary(Binary.fromString(formatted));
117
117
  }
118
118
  }
@@ -1,24 +1,17 @@
1
1
  package org.embulk.output;
2
2
 
3
- import java.io.IOException;
4
- import java.util.List;
5
- import java.util.Map;
6
-
7
3
  import com.google.common.base.Throwables;
8
- import com.google.common.collect.ImmutableBiMap;
9
- import com.google.common.collect.ImmutableMap;
10
4
  import org.apache.hadoop.conf.Configuration;
11
5
  import org.apache.hadoop.fs.LocalFileSystem;
12
6
  import org.apache.hadoop.fs.Path;
13
7
  import org.apache.hadoop.hdfs.DistributedFileSystem;
14
- import org.embulk.config.TaskReport;
15
8
  import org.embulk.config.Config;
16
9
  import org.embulk.config.ConfigDefault;
17
10
  import org.embulk.config.ConfigDiff;
18
11
  import org.embulk.config.ConfigSource;
19
12
  import org.embulk.config.Task;
13
+ import org.embulk.config.TaskReport;
20
14
  import org.embulk.config.TaskSource;
21
- import org.embulk.spi.Column;
22
15
  import org.embulk.spi.Exec;
23
16
  import org.embulk.spi.OutputPlugin;
24
17
  import org.embulk.spi.Page;
@@ -26,42 +19,54 @@ import org.embulk.spi.PageReader;
26
19
  import org.embulk.spi.Schema;
27
20
  import org.embulk.spi.TransactionalPageOutput;
28
21
  import org.embulk.spi.time.TimestampFormatter;
29
- import org.embulk.spi.type.TimestampType;
22
+ import org.embulk.spi.util.Timestamps;
30
23
  import parquet.hadoop.ParquetWriter;
31
24
  import parquet.hadoop.api.WriteSupport;
32
25
  import parquet.hadoop.metadata.CompressionCodecName;
33
26
 
27
+ import java.io.IOException;
28
+ import java.util.List;
29
+ import java.util.Map;
30
+
34
31
  @SuppressWarnings("unused")
35
32
  public class ParquetOutputPlugin
36
33
  implements OutputPlugin
37
34
  {
38
35
  public interface PluginTask
39
- extends Task, TimestampFormatter.FormatterTask
36
+ extends Task, TimestampFormatter.Task
40
37
  {
41
38
  @Config("path_prefix")
42
- public String getPathPrefix();
39
+ String getPathPrefix();
43
40
 
44
41
  @Config("file_ext")
45
42
  @ConfigDefault("\".parquet\"")
46
- public String getFileNameExtension();
43
+ String getFileNameExtension();
47
44
 
48
45
  @Config("sequence_format")
49
46
  @ConfigDefault("\".%03d\"")
50
- public String getSequenceFormat();
47
+ String getSequenceFormat();
51
48
 
52
49
  @Config("block_size")
53
50
  @ConfigDefault("134217728") // 128M
54
- public int getBlockSize();
51
+ int getBlockSize();
55
52
 
56
53
  @Config("page_size")
57
54
  @ConfigDefault("1048576") // 1M
58
- public int getPageSize();
55
+ int getPageSize();
59
56
 
60
57
  @Config("compression_codec")
61
58
  @ConfigDefault("\"UNCOMPRESSED\"")
62
- public String getCompressionCodec();
59
+ String getCompressionCodec();
60
+
61
+ @Config("column_options")
62
+ @ConfigDefault("{}")
63
+ Map<String, TimestampColumnOption> getColumnOptions();
63
64
  }
64
65
 
66
+ public interface TimestampColumnOption
67
+ extends Task, TimestampFormatter.TimestampColumnOption
68
+ { }
69
+
65
70
  public ConfigDiff transaction(ConfigSource config,
66
71
  Schema schema, int processorCount,
67
72
  OutputPlugin.Control control)
@@ -103,26 +108,13 @@ public class ParquetOutputPlugin
103
108
 
104
109
  final PageReader reader = new PageReader(schema);
105
110
 
106
- final Map<Integer, TimestampFormatter> timestampFormatters = newTimestampFormatters(task, schema);
111
+ final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
107
112
  final EmbulkWriteSupport writeSupport = new EmbulkWriteSupport(schema, timestampFormatters);
108
113
  ParquetWriter<PageReader> writer = createParquetWriter(new Path(path), writeSupport, codec, blockSize, pageSize);
109
114
 
110
115
  return new ParquetTransactionalPageOutput(reader, writer);
111
116
  }
112
117
 
113
- private Map<Integer, TimestampFormatter> newTimestampFormatters(
114
- TimestampFormatter.FormatterTask task, Schema schema)
115
- {
116
- ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
117
- for (Column column : schema.getColumns()) {
118
- if (column.getType() instanceof TimestampType) {
119
- TimestampType tt = (TimestampType) column.getType();
120
- builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
121
- }
122
- }
123
- return builder.build();
124
- }
125
-
126
118
  private <T> ParquetWriter<T> createParquetWriter(Path path, WriteSupport<T> writeSupport, CompressionCodecName codec, int blockSize, int pageSize) {
127
119
  ParquetWriter<T> writer = null;
128
120
 
@@ -0,0 +1,37 @@
1
+ package org.embulk.output;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.config.ConfigException;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.spi.Exec;
7
+ import org.junit.Rule;
8
+ import org.junit.Test;
9
+
10
+ import static org.junit.Assert.*;
11
+
12
+ public class ParquetOutputPluginTest {
13
+ @Rule
14
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
15
+
16
+ @Test
17
+ public void checkDefaultValues() {
18
+ ConfigSource config = Exec.newConfigSource()
19
+ .set("path_prefix", "test");
20
+
21
+ ParquetOutputPlugin.PluginTask task = config.loadConfig(ParquetOutputPlugin.PluginTask.class);
22
+ assertEquals(".parquet", task.getFileNameExtension());
23
+ assertEquals(".%03d", task.getSequenceFormat());
24
+ assertEquals(134217728, task.getBlockSize());
25
+ assertEquals(1048576, task.getPageSize());
26
+ assertEquals("UNCOMPRESSED", task.getCompressionCodec());
27
+ }
28
+
29
+ @Test(expected = ConfigException.class)
30
+ public void checkColumnsRequired() {
31
+ ConfigSource config = Exec.newConfigSource();
32
+
33
+ config.loadConfig(ParquetOutputPlugin.PluginTask.class);
34
+ }
35
+
36
+
37
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - OKUNO Akihiro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-09 00:00:00.000000000 Z
11
+ date: 2015-12-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -46,6 +46,7 @@ extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
48
  - .gitignore
49
+ - .travis.yml
49
50
  - LICENSE.txt
50
51
  - README.md
51
52
  - build.gradle
@@ -56,7 +57,7 @@ files:
56
57
  - lib/embulk/output/parquet.rb
57
58
  - src/main/java/org/embulk/output/EmbulkWriteSupport.java
58
59
  - src/main/java/org/embulk/output/ParquetOutputPlugin.java
59
- - src/test/java/org/embulk/output/TestParquetOutputPlugin.java
60
+ - src/test/java/org/embulk/output/ParquetOutputPluginTest.java
60
61
  - classpath/activation-1.1.jar
61
62
  - classpath/apacheds-i18n-2.0.0-M15.jar
62
63
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -64,6 +65,7 @@ files:
64
65
  - classpath/api-util-1.0.0-M20.jar
65
66
  - classpath/asm-3.1.jar
66
67
  - classpath/avro-1.7.4.jar
68
+ - classpath/aws-java-sdk-1.7.4.jar
67
69
  - classpath/commons-beanutils-1.7.0.jar
68
70
  - classpath/commons-cli-1.2.jar
69
71
  - classpath/commons-codec-1.6.jar
@@ -71,6 +73,7 @@ files:
71
73
  - classpath/commons-compress-1.4.1.jar
72
74
  - classpath/commons-configuration-1.6.jar
73
75
  - classpath/commons-digester-1.8.jar
76
+ - classpath/commons-el-1.0.jar
74
77
  - classpath/commons-httpclient-3.1.jar
75
78
  - classpath/commons-io-2.4.jar
76
79
  - classpath/commons-lang-2.6.jar
@@ -80,10 +83,11 @@ files:
80
83
  - classpath/curator-client-2.6.0.jar
81
84
  - classpath/curator-framework-2.6.0.jar
82
85
  - classpath/curator-recipes-2.6.0.jar
83
- - classpath/embulk-output-parquet-0.2.0.jar
86
+ - classpath/embulk-output-parquet-0.3.0.jar
84
87
  - classpath/gson-2.2.4.jar
85
88
  - classpath/hadoop-annotations-2.6.0.jar
86
89
  - classpath/hadoop-auth-2.6.0.jar
90
+ - classpath/hadoop-aws-2.6.0.jar
87
91
  - classpath/hadoop-client-2.6.0.jar
88
92
  - classpath/hadoop-common-2.6.0.jar
89
93
  - classpath/hadoop-hdfs-2.6.0.jar
@@ -104,6 +108,9 @@ files:
104
108
  - classpath/jackson-jaxrs-1.9.13.jar
105
109
  - classpath/jackson-mapper-asl-1.9.13.jar
106
110
  - classpath/jackson-xc-1.9.13.jar
111
+ - classpath/jasper-compiler-5.5.23.jar
112
+ - classpath/jasper-runtime-5.5.23.jar
113
+ - classpath/java-xmlbuilder-0.4.jar
107
114
  - classpath/jaxb-api-2.2.2.jar
108
115
  - classpath/jaxb-impl-2.2.3-1.jar
109
116
  - classpath/jersey-client-1.9.jar
@@ -111,9 +118,14 @@ files:
111
118
  - classpath/jersey-guice-1.9.jar
112
119
  - classpath/jersey-json-1.9.jar
113
120
  - classpath/jersey-server-1.9.jar
121
+ - classpath/jets3t-0.9.0.jar
114
122
  - classpath/jettison-1.1.jar
123
+ - classpath/jetty-6.1.26.jar
115
124
  - classpath/jetty-util-6.1.26.jar
116
125
  - classpath/jline-0.9.94.jar
126
+ - classpath/joda-time-2.9.1.jar
127
+ - classpath/jsch-0.1.42.jar
128
+ - classpath/jsp-api-2.1.jar
117
129
  - classpath/jsr305-1.3.9.jar
118
130
  - classpath/leveldbjni-all-1.8.jar
119
131
  - classpath/log4j-1.2.17.jar
@@ -128,7 +140,6 @@ files:
128
140
  - classpath/parquet-jackson-1.5.0.jar
129
141
  - classpath/protobuf-java-2.5.0.jar
130
142
  - classpath/servlet-api-2.5.jar
131
- - classpath/slf4j-log4j12-1.7.5.jar
132
143
  - classpath/snappy-java-1.1.1.6.jar
133
144
  - classpath/stax-api-1.0-2.jar
134
145
  - classpath/xercesImpl-2.9.1.jar
@@ -1,5 +0,0 @@
1
- package org.embulk.output;
2
-
3
- public class TestParquetOutputPlugin
4
- {
5
- }