embulk-output-parquet 0.5.0 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d1ef34fa1ab2ea085e926b70700d4bff09e7bb16
-  data.tar.gz: d2d9df28c5ed603995193552104466da98d400eb
+  metadata.gz: 3aea0935831ddd2ff4368ea66315ad14083d502d
+  data.tar.gz: 847aa476f2f85f8ac7ac0978e12adbcd0fdc636c
 SHA512:
-  metadata.gz: 6a48e6ac6438c1cd56bf431b69ea8e980a54bff54290299d2da7a733c8defe4746230ba7afd2c5446b2d2ce8d42aa7cf74b6ca95c6dd8473cfefa278b891813f
-  data.tar.gz: adcfe86af5337ab4f41b2eb78ae84a2e20f71d40ad63652029d83cb50368ba40139f59f9f38a1bd8a7e7425c6838f96e39f733bbda6230ca9e37c1ae17b09f80
+  metadata.gz: 2af5d5cbc8c0376f8c881a072caa8b45ce245506e6c3330a656e254f7f6f44b0853ffef552bbcbe27c5c2796ee0c55d5d748690f876381ebff95b2e3c3213c72
+  data.tar.gz: 763c60722277b4e0e92446caca2d774ffdc4ae14484eb70754c7fe69251e491bcb555ad3746072c856ced6532985b61aeddbeadb9236885d0fa1e69283750346
data/README.md CHANGED
@@ -19,8 +19,11 @@
 - **default_timestamp_format**: Format of timestamp columns. This can be overwritten for each column using column_options
 - **column_options**: Specify timezone and timestamp format for each column. Format of this option is the same as the official csv formatter. See [document](
 http://www.embulk.org/docs/built-in.html#csv-formatter-plugin).
+- **config_files**: List of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **extra_configurations**: Add extra entries to Configuration which will be passed to ParquetWriter
 - **overwrite**: Overwrite if output files already exist. (default: fail if files exist)
+- **enablesigv4**: Enable the Signature Version 4 signing process for the S3 eu-central-1 (Frankfurt) region
+- **addUTF8**: If true, string and timestamp columns are stored with OriginalType.UTF8 (boolean, default: false)
 
 ## Example
 
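For orientation, here is a minimal Embulk config sketch exercising the three options added above. It is illustrative only: the path prefix and Hadoop file locations are hypothetical placeholders, and enablesigv4 is quoted because the task defines it as a String (see getSignature() in ParquetOutputPlugin.java below).

out:
  type: parquet
  path_prefix: s3a://example-bucket/embulk/out_    # hypothetical destination
  config_files:                                    # loaded into Hadoop's Configuration
    - /etc/hadoop/conf/core-site.xml               # hypothetical paths
    - /etc/hadoop/conf/hdfs-site.xml
  enablesigv4: "true"    # forwarded to the AWS SDK's SigV4 system property
  addUTF8: true          # annotate string/timestamp columns with OriginalType.UTF8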
data/build.gradle CHANGED
@@ -14,15 +14,15 @@ configurations {
     runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
 }
 
-version = "0.5.0"
+version = "0.6.0"
 
 sourceCompatibility = 1.7
 
 targetCompatibility = 1.7
 
 dependencies {
-    compile "org.embulk:embulk-core:0.7.10"
-    provided "org.embulk:embulk-core:0.7.10"
+    compile "org.embulk:embulk-core:0.8.28"
+    provided "org.embulk:embulk-core:0.8.28"
 
     compile "org.apache.parquet:parquet-hadoop:1.8.1"
     compile "org.apache.hadoop:hadoop-client:2.7.1"
@@ -30,8 +30,8 @@ dependencies {
     compile "org.xerial.snappy:snappy-java:1.1.1.6"
 
     testCompile "junit:junit:4.+"
-    testCompile "org.embulk:embulk-core:0.7.7:tests"
-    testCompile "org.embulk:embulk-standards:0.7.7"
+    testCompile "org.embulk:embulk-core:0.8.28:tests"
+    testCompile "org.embulk:embulk-standards:0.8.28"
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
data/config/checkstyle/checkstyle.xml CHANGED
@@ -66,9 +66,9 @@
             LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
             LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
     </module>
-    <module name="RightCurly">
-        <property name="option" value="alone"/>
-    </module>
+    <!--<module name="RightCurly">-->
+    <!--<property name="option" value="alone"/>-->
+    <!--</module>-->
     <module name="GenericWhitespace"/>
     <module name="WhitespaceAfter"/>
     <module name="NoWhitespaceBefore"/>
data/config/checkstyle/default.xml CHANGED
@@ -56,9 +56,9 @@
             LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
             LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
     </module>
-    <module name="RightCurly">
-        <property name="option" value="alone"/>
-    </module>
+    <!--<module name="RightCurly">-->
+    <!--<property name="option" value="alone"/>-->
+    <!--</module>-->
     <module name="GenericWhitespace"/>
     <module name="WhitespaceAfter"/>
     <module name="NoWhitespaceBefore"/>
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,6 @@
-#Tue Aug 11 00:26:20 PDT 2015
+#Sun Jan 08 00:35:58 PST 2017
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-2.7-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.8-bin.zip
data/src/main/java/org/embulk/output/EmbulkWriteSupport.java CHANGED
@@ -5,6 +5,7 @@ import org.apache.parquet.hadoop.api.WriteSupport;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.RecordConsumer;
 import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 import org.apache.parquet.schema.Type;
@@ -27,11 +28,13 @@ public class EmbulkWriteSupport
     RecordConsumer consumer;
     WriteContext writeContext;
     TimestampFormatter[] timestampFormatters;
+    boolean addUTF8;
 
-    public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters)
+    public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters, boolean addUTF8)
     {
         this.schema = schema;
         this.timestampFormatters = timestampFormatters;
+        this.addUTF8 = addUTF8;
     }
 
     @Override
@@ -73,7 +76,12 @@ public class EmbulkWriteSupport
 
     private MessageType convertSchema(Schema schema)
     {
-        SchemaConvertColumnVisitor visitor = new SchemaConvertColumnVisitor();
+        SchemaConvertColumnVisitor visitor = null;
+        if (addUTF8) {
+            visitor = new SchemaConvertColumnVisitorWithUTF8();
+        } else {
+            visitor = new SchemaConvertColumnVisitor();
+        }
         schema.visitColumns(visitor);
         String messageName = "embulk";
         return new MessageType(messageName, visitor.getConvertedFields());
@@ -123,6 +131,12 @@ public class EmbulkWriteSupport
         }
     }
 
+    @Override
+    public void jsonColumn(Column column)
+    {
+        throw new UnsupportedOperationException("This plugin doesn't support json type. Please try to upgrade version of the plugin using 'embulk gem update' command. If the latest version still doesn't support json type, please contact plugin developers, or change configuration of input plugin not to use json type.");
+    }
+
     @Override
     public void timestampColumn(Column column)
     {
@@ -163,6 +177,12 @@ public class EmbulkWriteSupport
         fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName()));
     }
 
+    @Override
+    public void jsonColumn(Column column)
+    {
+        throw new UnsupportedOperationException("This plugin doesn't support json type. Please try to upgrade version of the plugin using 'embulk gem update' command. If the latest version still doesn't support json type, please contact plugin developers, or change configuration of input plugin not to use json type.");
+    }
+
     @Override
     public void timestampColumn(Column column)
     {
@@ -175,4 +195,21 @@
             return fields;
         }
     }
+
+    class SchemaConvertColumnVisitorWithUTF8 extends SchemaConvertColumnVisitor
+    {
+        @Override
+        public void stringColumn(Column column)
+        {
+            fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName(), OriginalType.UTF8));
+        }
+
+        @Override
+        public void timestampColumn(Column column)
+        {
+            // formatted as string
+            fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName(), OriginalType.UTF8));
+        }
+    }
+
 }
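The effect of the new addUTF8 flag is easiest to see on the generated Parquet schema. Below is a minimal, self-contained sketch using the same parquet-hadoop 1.8.1 calls as the visitors above; the field name "name" and the class name Utf8SchemaExample are illustrative, not from the gem.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;

public class Utf8SchemaExample
{
    public static void main(String[] args)
    {
        // Default visitor: a plain optional BINARY field; readers see raw bytes.
        PrimitiveType plain = new PrimitiveType(
                Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name");
        // addUTF8 visitor: the same field annotated with OriginalType.UTF8,
        // so engines such as Hive or Presto interpret it as a string.
        PrimitiveType utf8 = new PrimitiveType(
                Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name", OriginalType.UTF8);
        System.out.println(new MessageType("embulk", plain));
        System.out.println(new MessageType("embulk", utf8));
    }
}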
data/src/main/java/org/embulk/output/EmbulkWriterBuilder.java CHANGED
@@ -13,12 +13,14 @@ public class EmbulkWriterBuilder
 {
     final Schema schema;
     final TimestampFormatter[] timestampFormatters;
+    final boolean addUTF8;
 
-    public EmbulkWriterBuilder(Path file, Schema schema, TimestampFormatter[] timestampFormatters)
+    public EmbulkWriterBuilder(Path file, Schema schema, TimestampFormatter[] timestampFormatters, boolean addUTF8)
     {
         super(file);
         this.schema = schema;
         this.timestampFormatters = timestampFormatters;
+        this.addUTF8 = addUTF8;
     }
 
     @Override
@@ -30,6 +32,6 @@ public class EmbulkWriterBuilder
     @Override
     protected WriteSupport<PageReader> getWriteSupport(Configuration conf)
     {
-        return new EmbulkWriteSupport(schema, timestampFormatters);
+        return new EmbulkWriteSupport(schema, timestampFormatters, addUTF8);
     }
 }
data/src/main/java/org/embulk/output/ParquetOutputPlugin.java CHANGED
@@ -1,5 +1,6 @@
 package org.embulk.output;
 
+import com.amazonaws.SDKGlobalConfiguration;
 import com.google.common.base.Throwables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocalFileSystem;
@@ -8,13 +9,7 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.ParquetWriter;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.ConfigDiff;
-import org.embulk.config.ConfigSource;
-import org.embulk.config.Task;
-import org.embulk.config.TaskReport;
-import org.embulk.config.TaskSource;
+import org.embulk.config.*;
 import org.embulk.spi.Exec;
 import org.embulk.spi.OutputPlugin;
 import org.embulk.spi.Page;
@@ -23,8 +18,11 @@ import org.embulk.spi.Schema;
 import org.embulk.spi.TransactionalPageOutput;
 import org.embulk.spi.time.TimestampFormatter;
 import org.embulk.spi.util.Timestamps;
+import org.embulk.output.parquet.ClassLoaderSwap;
 
+import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.List;
 import java.util.Map;
 
@@ -64,6 +62,10 @@ public class ParquetOutputPlugin
         @ConfigDefault("{}")
         Map<String, TimestampColumnOption> getColumnOptions();
 
+        @Config("config_files")
+        @ConfigDefault("[]")
+        List<String> getConfigFiles();
+
         @Config("extra_configurations")
         @ConfigDefault("{}")
         Map<String, String> getExtraConfigurations();
@@ -71,6 +73,14 @@ public class ParquetOutputPlugin
         @Config("overwrite")
         @ConfigDefault("false")
         boolean getOverwrite();
+
+        @Config("enablesigv4")
+        @ConfigDefault("false")
+        String getSignature();
+
+        @Config("addUTF8")
+        @ConfigDefault("false")
+        boolean getAddUTF8();
     }
 
     public interface TimestampColumnOption
@@ -79,27 +89,30 @@ public class ParquetOutputPlugin
     }
 
     public ConfigDiff transaction(ConfigSource config,
-            Schema schema, int processorCount,
-            OutputPlugin.Control control)
+                                  Schema schema, int processorCount,
+                                  OutputPlugin.Control control)
     {
         PluginTask task = config.loadConfig(PluginTask.class);
 
         //TODO
 
-        control.run(task.dump());
+
+        try (@SuppressWarnings("unchecked") ClassLoaderSwap clswp = new ClassLoaderSwap(this.getClass())) {
+            control.run(task.dump());
+        }
        return Exec.newConfigDiff();
     }
 
     public ConfigDiff resume(TaskSource taskSource,
-            Schema schema, int processorCount,
-            OutputPlugin.Control control)
+                             Schema schema, int processorCount,
+                             OutputPlugin.Control control)
     {
         throw new UnsupportedOperationException("parquet output plugin does not support resuming");
     }
 
     public void cleanup(TaskSource taskSource,
-            Schema schema, int processorCount,
-            List<TaskReport> successTaskReports)
+                        Schema schema, int processorCount,
+                        List<TaskReport> successTaskReports)
     {
         //TODO
     }
@@ -124,18 +137,22 @@ public class ParquetOutputPlugin
 
     private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
     {
+        // In case of using Frankfurt (eu-central-1) with the Signature Version 4 signing process
+        System.setProperty(SDKGlobalConfiguration.ENABLE_S3_SIGV4_SYSTEM_PROPERTY, task.getSignature());
+
         final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
+        final boolean addUTF8 = task.getAddUTF8();
 
         final Path path = new Path(buildPath(task, processorIndex));
         final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
         final int blockSize = task.getBlockSize();
         final int pageSize = task.getPageSize();
-        final Configuration conf = createConfiguration(task.getExtraConfigurations());
+        final Configuration conf = createConfiguration(task.getExtraConfigurations(), task.getConfigFiles());
         final boolean overwrite = task.getOverwrite();
 
         ParquetWriter<PageReader> writer = null;
         try {
-            EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters)
+            EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters, addUTF8)
                 .withCompressionCodec(codec)
                 .withRowGroupSize(blockSize)
                 .withPageSize(pageSize)
@@ -147,14 +164,13 @@
             }
 
             writer = builder.build();
-        }
-        catch (IOException e) {
+        } catch (IOException e) {
             Throwables.propagate(e);
         }
         return writer;
     }
 
-    private Configuration createConfiguration(Map<String, String> extra)
+    private Configuration createConfiguration(Map<String, String> extra, List<String> configFiles)
     {
         Configuration conf = new Configuration();
 
@@ -162,6 +178,15 @@
         conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
         conf.set("fs.file.impl", LocalFileSystem.class.getName());
 
+        for (String configFile : configFiles) {
+            File file = new File(configFile);
+            try {
+                conf.addResource(file.toURI().toURL());
+            } catch (MalformedURLException e) {
+                throw new ConfigException(e);
+            }
+        }
+
         // Optional values
         for (Map.Entry<String, String> entry : extra.entrySet()) {
             conf.set(entry.getKey(), entry.getValue());
@@ -192,8 +217,7 @@
             while (reader.nextRecord()) {
                 writer.write(reader);
             }
-        }
-        catch (IOException e) {
+        } catch (IOException e) {
             Throwables.propagate(e);
         }
     }
@@ -204,8 +228,7 @@
         try {
             writer.close();
             writer = null;
-        }
-        catch (IOException e) {
+        } catch (IOException e) {
             Throwables.propagate(e);
         }
     }
data/src/main/java/org/embulk/output/parquet/ClassLoaderSwap.java ADDED
@@ -0,0 +1,25 @@
+package org.embulk.output.parquet;
+
+/**
+ * This class is based on embulk-input-parquet_hadoop PluginClassLoaderScope.java
+ */
+public class ClassLoaderSwap<T> implements AutoCloseable
+{
+    private final ClassLoader pluginClassLoader;
+    private final ClassLoader orgClassLoader;
+    private final Thread curThread;
+
+    public ClassLoaderSwap(Class<T> pluginClass)
+    {
+        this.curThread = Thread.currentThread();
+        this.pluginClassLoader = pluginClass.getClassLoader();
+        this.orgClassLoader = curThread.getContextClassLoader();
+        curThread.setContextClassLoader(pluginClassLoader);
+    }
+
+    @Override
+    public void close()
+    {
+        curThread.setContextClassLoader(orgClassLoader);
+    }
+}
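ClassLoaderSwap is an AutoCloseable, so it is meant for try-with-resources, which is exactly how transaction() in ParquetOutputPlugin.java now wraps control.run(task.dump()). A minimal standalone usage sketch follows; the class name ClassLoaderSwapExample is hypothetical. A plausible motivation, not stated in the diff, is that Hadoop resolves filesystem implementations via the thread context ClassLoader, which under Embulk may not see plugin-bundled classes.

import org.embulk.output.parquet.ClassLoaderSwap;

public class ClassLoaderSwapExample
{
    public static void main(String[] args)
    {
        // Inside the try block the thread's context ClassLoader is the one that
        // loaded ClassLoaderSwapExample; close() restores the original loader
        // even if the block throws.
        try (ClassLoaderSwap<ClassLoaderSwapExample> swap =
                new ClassLoaderSwap<>(ClassLoaderSwapExample.class)) {
            System.out.println(Thread.currentThread().getContextClassLoader());
        }
    }
}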
data/src/test/java/org/embulk/output/ParquetOutputPluginTest.java CHANGED
@@ -10,6 +10,8 @@ import org.junit.Test;
 
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 
 import static org.junit.Assert.assertEquals;
@@ -34,6 +36,7 @@ public class ParquetOutputPluginTest
         assertEquals(1048576, task.getPageSize());
         assertEquals("UNCOMPRESSED", task.getCompressionCodec());
         assertFalse(task.getOverwrite());
+        assertEquals("false", task.getSignature());
     }
 
     @Test(expected = ConfigException.class)
@@ -62,9 +65,9 @@ public class ParquetOutputPluginTest
         assertEquals("bar", extra.get("foo"));
 
         ParquetOutputPlugin plugin = new ParquetOutputPlugin();
-        Method method = ParquetOutputPlugin.class.getDeclaredMethod("createConfiguration", Map.class);
+        Method method = ParquetOutputPlugin.class.getDeclaredMethod("createConfiguration", Map.class, List.class);
         method.setAccessible(true);
-        Configuration conf = (Configuration) method.invoke(plugin, extra);
+        Configuration conf = (Configuration) method.invoke(plugin, extra, new ArrayList<String>());
         assertEquals("bar", conf.get("foo"));
     }
 }
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - OKUNO Akihiro
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-07-11 00:00:00.000000000 Z
+date: 2018-06-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -60,6 +60,7 @@ files:
 - src/main/java/org/embulk/output/EmbulkWriteSupport.java
 - src/main/java/org/embulk/output/EmbulkWriterBuilder.java
 - src/main/java/org/embulk/output/ParquetOutputPlugin.java
+- src/main/java/org/embulk/output/parquet/ClassLoaderSwap.java
 - src/test/java/org/embulk/output/ParquetOutputPluginTest.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
@@ -85,7 +86,7 @@ files:
 - classpath/curator-client-2.7.1.jar
 - classpath/curator-framework-2.7.1.jar
 - classpath/curator-recipes-2.7.1.jar
-- classpath/embulk-output-parquet-0.5.0.jar
+- classpath/embulk-output-parquet-0.6.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.7.1.jar
 - classpath/hadoop-auth-2.7.1.jar
@@ -123,7 +124,6 @@ files:
 - classpath/jetty-6.1.26.jar
 - classpath/jetty-util-6.1.26.jar
 - classpath/jline-0.9.94.jar
-- classpath/joda-time-2.9.9.jar
 - classpath/jsch-0.1.42.jar
 - classpath/jsp-api-2.1.jar
 - classpath/jsr305-3.0.0.jar