embulk-output-orc 0.3.0 → 0.3.5
- checksums.yaml +4 -4
- data/.github/workflows/gradle.yml +25 -0
- data/README.md +30 -9
- data/build.gradle +34 -15
- data/example/example.yml +4 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +17 -1
- data/gradlew.bat +17 -1
- data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala +42 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala +156 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala +57 -0
- data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala +52 -0
- data/src/main/scala/org/embulk/output/orc/PluginTask.scala +56 -0
- data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala +32 -0
- data/src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java +71 -0
- data/src/test/resources/example-null.yml +25 -0
- data/src/test/resources/example.yml +25 -0
- metadata +45 -42
- data/.travis.yml +0 -14
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +0 -82
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +0 -249
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +0 -28
- data/src/main/java/org/embulk/output/orc/PluginTask.java +0 -60
- data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java +0 -22
- data/src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java +0 -5
data/.travis.yml
DELETED
@@ -1,14 +0,0 @@
-language: java
-jdk:
-  - oraclejdk8
-
-cache:
-  directories: # run "travis cache --delete" to delete caches
-    - $HOME/.gradle
-
-sudo: false
-script:
-  - ./gradlew --info checkstyle
-  - ./gradlew --info check
-
-after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
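The Travis config deleted above is replaced in this release by data/.github/workflows/gradle.yml (+25 lines), whose contents this page does not show. Purely as an illustration of the migration — this is a hypothetical sketch, not the actual workflow file — a GitHub Actions job covering the same Gradle steps would look roughly like:

```yaml
# Hypothetical sketch, NOT the real data/.github/workflows/gradle.yml
name: Build
on: [push, pull_request]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-java@v2
        with:
          distribution: temurin
          java-version: 8
      # Travis ran "./gradlew --info checkstyle" and "./gradlew --info check"
      - run: ./gradlew --info check
```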
data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
DELETED
@@ -1,82 +0,0 @@
-package org.embulk.output.orc;
-
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.time.Timestamp;
-
-import java.nio.charset.StandardCharsets;
-
-public class OrcColumnVisitor
-        implements ColumnVisitor
-{
-    private final PageReader reader;
-    private final VectorizedRowBatch batch;
-    private final Integer i;
-
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-    {
-        this.reader = pageReader;
-        this.batch = rowBatch;
-        this.i = i;
-    }
-
-    @Override
-    public void booleanColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-        }
-        else {
-            if (reader.getBoolean(column)) {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-            }
-            else {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-            }
-        }
-    }
-
-    @Override
-    public void longColumn(Column column)
-    {
-        ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-    }
-
-    @Override
-    public void doubleColumn(Column column)
-    {
-        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-    }
-
-    @Override
-    public void stringColumn(Column column)
-    {
-        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
-                reader.getString(column).getBytes(StandardCharsets.UTF_8));
-    }
-
-    @Override
-    public void timestampColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-        }
-        else {
-            Timestamp timestamp = reader.getTimestamp(column);
-            java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-        }
-    }
-
-    @Override
-    public void jsonColumn(Column column)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support json type");
-    }
-}
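The deleted visitor fills one VectorizedRowBatch slot per Embulk row: booleans and longs go into a LongColumnVector, strings into a BytesColumnVector, and timestamps into a TimestampColumnVector. Two quirks are worth noting: a null boolean is silently written as 0 rather than flagged null, and `new java.sql.Timestamp(timestamp.getEpochSecond() * 1000)` truncates sub-second precision. For readers new to the vectorized API, here is a minimal self-contained sketch of the same batch-filling pattern against the plain ORC library; the file path and schema are illustrative, not taken from the plugin:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.nio.charset.StandardCharsets;

public class OrcBatchSketch
{
    public static void main(String[] args) throws Exception
    {
        // A struct schema like the one getSchema() builds from the Embulk schema.
        TypeDescription schema = TypeDescription.fromString("struct<id:bigint,name:string>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/sketch.orc"),
                OrcFile.writerOptions(new Configuration()).setSchema(schema));

        VectorizedRowBatch batch = schema.createRowBatch(); // room for 1024 rows by default
        LongColumnVector id = (LongColumnVector) batch.cols[0];
        BytesColumnVector name = (BytesColumnVector) batch.cols[1];
        for (int row = 0; row < 3; row++) {
            // batch.size is the next free slot, playing the role of the visitor's `i`.
            id.vector[batch.size] = row;
            name.setVal(batch.size, ("row-" + row).getBytes(StandardCharsets.UTF_8));
            batch.size++;
        }
        writer.addRowBatch(batch);
        writer.close();
    }
}
```

Note that OrcFile.createWriter refuses to overwrite an existing file, which is what the plugin's `overwrite` option works around by deleting the target first.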
data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
DELETED
@@ -1,249 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Throwables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.util.VersionInfo;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.embulk.config.ConfigDiff;
-import org.embulk.config.ConfigSource;
-import org.embulk.config.TaskReport;
-import org.embulk.config.TaskSource;
-import org.embulk.spi.Column;
-import org.embulk.spi.Exec;
-import org.embulk.spi.OutputPlugin;
-import org.embulk.spi.Page;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.Schema;
-import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.spi.type.Type;
-import org.embulk.spi.util.Timestamps;
-import org.embulk.util.aws.credentials.AwsCredentials;
-
-import java.io.IOException;
-import java.util.List;
-
-public class OrcOutputPlugin
-        implements OutputPlugin
-{
-    @Override
-    public ConfigDiff transaction(ConfigSource config,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // retryable (idempotent) output:
-        // return resume(task.dump(), schema, taskCount, control);
-
-        // non-retryable (non-idempotent) output:
-        control.run(task.dump());
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support resuming");
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-            Schema schema, int taskCount,
-            List<TaskReport> successTaskReports)
-
-    {
-    }
-
-    @Override
-    public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        if (task.getOverwrite()) {
-            OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex));
-        }
-
-        final PageReader reader = new PageReader(schema);
-        Writer writer = createWriter(task, schema, taskIndex);
-
-        return new OrcTransactionalPageOutput(reader, writer, task);
-    }
-
-    private String buildPath(PluginTask task, int processorIndex)
-    {
-        final String pathPrefix = task.getPathPrefix();
-        final String pathSuffix = task.getFileNameExtension();
-        final String sequenceFormat = task.getSequenceFormat();
-        return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-    }
-
-    private TypeDescription getSchema(Schema schema)
-    {
-        TypeDescription oschema = TypeDescription.createStruct();
-        for (int i = 0; i < schema.size(); i++) {
-            Column column = schema.getColumn(i);
-            Type type = column.getType();
-            switch (type.getName()) {
-                case "long":
-                    oschema.addField(column.getName(), TypeDescription.createLong());
-                    break;
-                case "double":
-                    oschema.addField(column.getName(), TypeDescription.createDouble());
-                    break;
-                case "boolean":
-                    oschema.addField(column.getName(), TypeDescription.createBoolean());
-                    break;
-                case "string":
-                    oschema.addField(column.getName(), TypeDescription.createString());
-                    break;
-                case "timestamp":
-                    oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                    break;
-                default:
-                    System.out.println("Unsupported type");
-                    break;
-            }
-        }
-        return oschema;
-    }
-
-    private Configuration getHadoopConfiguration(PluginTask task)
-    {
-        Configuration conf = new Configuration();
-
-        // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-        conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-        conf.set("fs.file.impl", LocalFileSystem.class.getName());
-        // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-        AwsCredentials.getAWSCredentialsProvider(task);
-        if (task.getAccessKeyId().isPresent()) {
-            conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-            conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-        }
-        if (task.getSecretAccessKey().isPresent()) {
-            conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-            conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-        }
-        if (task.getEndpoint().isPresent()) {
-            conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-        }
-        return conf;
-    }
-
-    private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-    {
-        final TimestampFormatter[] timestampFormatters = Timestamps
-                .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-        Configuration conf = getHadoopConfiguration(task);
-        TypeDescription oschema = getSchema(schema);
-
-        // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-        Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-        Writer writer = null;
-        try {
-            // Make writerOptions
-            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-            // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-            // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-            writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                    writerOptions.setSchema(oschema)
-                            .version(OrcFile.Version.V_0_12));
-        }
-        catch (IOException e) {
-            Throwables.propagate(e);
-        }
-        return writer;
-    }
-
-    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-    {
-        final Integer bufferSize = task.getBufferSize();
-        final Integer stripSize = task.getStripSize();
-        final Integer blockSize = task.getBlockSize();
-        final String kindString = task.getCompressionKind();
-        CompressionKind kind = CompressionKind.valueOf(kindString);
-        return OrcFile.writerOptions(conf)
-                .bufferSize(bufferSize)
-                .blockSize(blockSize)
-                .stripeSize(stripSize)
-                .compress(kind);
-    }
-
-    class OrcTransactionalPageOutput
-            implements TransactionalPageOutput
-    {
-        private final PageReader reader;
-        private final Writer writer;
-
-        public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-        {
-            this.reader = reader;
-            this.writer = writer;
-        }
-
-        @Override
-        public void add(Page page)
-        {
-            int size = page.getStringReferences().size();
-            final TypeDescription schema = getSchema(reader.getSchema());
-            final VectorizedRowBatch batch = schema.createRowBatch();
-            batch.size = size;
-
-            reader.setPage(page);
-            int i = 0;
-            while (reader.nextRecord()) {
-                reader.getSchema().visitColumns(
-                        new OrcColumnVisitor(reader, batch, i)
-                );
-                i++;
-            }
-            try {
-                writer.addRowBatch(batch);
-                batch.reset();
-            }
-            catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-
-        @Override
-        public void finish()
-        {
-            try {
-                writer.close();
-            }
-            catch (IOException e) {
-                Throwables.propagate(e);
-            }
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-}
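Two details of the deleted plugin are easy to miss. First, add() sizes the batch from page.getStringReferences().size(), which counts the page's string references rather than its rows, while schema.createRowBatch() allocates room for 1,024 rows by default, so a sufficiently large page could overrun the column vectors. Second, createWriterOptions() resolves `compression_kind` with CompressionKind.valueOf, which is case-sensitive: only exact enum names such as NONE, ZLIB, SNAPPY or LZO are accepted. A minimal sketch of that option mapping against the ORC API, with the plugin's defaults written out as literals:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;

public class WriterOptionsSketch
{
    public static void main(String[] args)
    {
        // "ZLIB" is the plugin default; valueOf("zlib") would throw IllegalArgumentException.
        CompressionKind kind = CompressionKind.valueOf("ZLIB");
        OrcFile.WriterOptions options = OrcFile.writerOptions(new Configuration())
                .setSchema(TypeDescription.fromString("struct<id:bigint>"))
                .bufferSize(262144)       // buffer_size default (256KB)
                .stripeSize(67108864)     // strip_size default (64MB)
                .blockSize(268435456)     // block_size default (256MB)
                .compress(kind);
        System.out.println(kind + " / " + options.getSchema());
    }
}
```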
data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
DELETED
@@ -1,28 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Throwables;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-class OrcOutputPluginHelper
-{
-    protected OrcOutputPluginHelper()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    static void removeOldFile(String fpath)
-    {
-        Path path = Paths.get(fpath);
-        // TODO: Check local file. not HDFS or S3.
-        try {
-            Files.deleteIfExists(path);
-        }
-        catch (IOException e) {
-            Throwables.propagate(e);
-        }
-    }
-}
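removeOldFile() only ever deletes through java.nio on the local filesystem; as the TODO admits, HDFS and S3 targets are unchecked. On a Unix machine an "s3a://bucket/key" string is simply parsed as a local path, so `overwrite: true` never removes a remote object. A tiny sketch of that behavior, with illustrative paths:

```java
import java.nio.file.Files;
import java.nio.file.Paths;

public class RemoveOldFileSketch
{
    public static void main(String[] args) throws Exception
    {
        // Local file: deleted if present, silently ignored otherwise.
        System.out.println(Files.deleteIfExists(Paths.get("/tmp/out.000.orc")));
        // An S3 URI becomes a plain local path (on Unix this prints
        // "s3a:/bucket/key"), so the remote object is never touched.
        System.out.println(Paths.get("s3a://bucket/key"));
    }
}
```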
data/src/main/java/org/embulk/output/orc/PluginTask.java
DELETED
@@ -1,60 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.util.aws.credentials.AwsCredentialsTask;
-import org.joda.time.DateTimeZone;
-
-import java.util.Map;
-
-public interface PluginTask
-        extends Task, TimestampFormatter.Task, AwsCredentialsTask
-{
-    @Config("path_prefix")
-    String getPathPrefix();
-
-    @Config("file_ext")
-    @ConfigDefault("\".orc\"")
-    String getFileNameExtension();
-
-    @Config("column_options")
-    @ConfigDefault("{}")
-    Map<String, TimestampColumnOption> getColumnOptions();
-
-    @Config("sequence_format")
-    @ConfigDefault("\".%03d\"")
-    String getSequenceFormat();
-
-    // see: https://orc.apache.org/docs/hive-config.html
-    // ORC File options
-    @Config("strip_size")
-    @ConfigDefault("67108864") // 64MB
-    Integer getStripSize();
-
-    @Config("buffer_size")
-    @ConfigDefault("262144") // 256KB
-    Integer getBufferSize();
-
-    @Config("block_size")
-    @ConfigDefault("268435456") // 256MB
-    Integer getBlockSize();
-
-    @Config("compression_kind")
-    @ConfigDefault("ZLIB")
-    public String getCompressionKind();
-
-    @Config("overwrite")
-    @ConfigDefault("false")
-    boolean getOverwrite();
-
-    @Config("default_from_timezone")
-    @ConfigDefault("\"UTC\"")
-    DateTimeZone getDefaultFromTimeZone();
-
-    @Config("endpoint")
-    @ConfigDefault("null")
-    Optional<String> getEndpoint();
-}
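Every @Config key in the deleted PluginTask is a key under the `out` section of an Embulk config. A minimal sketch of a config using them follows; values are illustrative, and the bundled example.yml is not reproduced on this page:

```yaml
out:
  type: orc
  path_prefix: /tmp/output/sample   # required, no default
  file_ext: .orc                    # default ".orc"
  sequence_format: ".%03d"          # default; formatted with the task index
  compression_kind: ZLIB            # default; NONE/ZLIB/SNAPPY/LZO, case-sensitive
  overwrite: true                   # default false; deletes a local target first
  default_from_timezone: UTC        # default "UTC"
```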
data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java
DELETED
@@ -1,22 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.joda.time.DateTimeZone;
-
-import java.util.List;
-
-public interface TimestampColumnOption
-        extends Task, TimestampFormatter.TimestampColumnOption
-{
-    @Config("from_timezone")
-    @ConfigDefault("null")
-    Optional<DateTimeZone> getFromTimeZone();
-
-    @Config("from_format")
-    @ConfigDefault("null")
-    Optional<List<String>> getFromFormat();
-}
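TimestampColumnOption defines the per-column keys accepted inside `column_options`; both default to null and fall back to task-level settings such as default_from_timezone. A sketch of how it would typically be written, with an illustrative column name:

```yaml
out:
  type: orc
  path_prefix: /tmp/output/sample
  column_options:
    created_at: {from_format: ["%Y-%m-%d %H:%M:%S"], from_timezone: "Asia/Tokyo"}
```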