embulk-output-orc 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/gradle.yml +25 -0
- data/README.md +30 -9
- data/build.gradle +34 -15
- data/example/example.yml +4 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +17 -1
- data/gradlew.bat +17 -1
- data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala +42 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala +156 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala +57 -0
- data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala +52 -0
- data/src/main/scala/org/embulk/output/orc/PluginTask.scala +56 -0
- data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala +32 -0
- data/src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java +71 -0
- data/src/test/resources/example-null.yml +25 -0
- data/src/test/resources/example.yml +25 -0
- metadata +45 -42
- data/.travis.yml +0 -14
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +0 -82
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +0 -249
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +0 -28
- data/src/main/java/org/embulk/output/orc/PluginTask.java +0 -60
- data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java +0 -22
- data/src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java +0 -5
data/.travis.yml
DELETED
@@ -1,14 +0,0 @@
-language: java
-jdk:
-  - oraclejdk8
-
-cache:
-  directories: # run "travis cache --delete" to delete caches
-    - $HOME/.gradle
-
-sudo: false
-script:
-  - ./gradlew --info checkstyle
-  - ./gradlew --info check
-
-after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
DELETED
@@ -1,82 +0,0 @@
-package org.embulk.output.orc;
-
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.time.Timestamp;
-
-import java.nio.charset.StandardCharsets;
-
-public class OrcColumnVisitor
-        implements ColumnVisitor
-{
-    private final PageReader reader;
-    private final VectorizedRowBatch batch;
-    private final Integer i;
-
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-    {
-        this.reader = pageReader;
-        this.batch = rowBatch;
-        this.i = i;
-    }
-
-    @Override
-    public void booleanColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-        }
-        else {
-            if (reader.getBoolean(column)) {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-            }
-            else {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-            }
-        }
-    }
-
-    @Override
-    public void longColumn(Column column)
-    {
-        ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-    }
-
-    @Override
-    public void doubleColumn(Column column)
-    {
-        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-    }
-
-    @Override
-    public void stringColumn(Column column)
-    {
-        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
-                reader.getString(column).getBytes(StandardCharsets.UTF_8));
-    }
-
-    @Override
-    public void timestampColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-        }
-        else {
-            Timestamp timestamp = reader.getTimestamp(column);
-            java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-        }
-    }
-
-    @Override
-    public void jsonColumn(Column column)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support json type");
-    }
-}
data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
DELETED
@@ -1,249 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Throwables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.util.VersionInfo;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.embulk.config.ConfigDiff;
-import org.embulk.config.ConfigSource;
-import org.embulk.config.TaskReport;
-import org.embulk.config.TaskSource;
-import org.embulk.spi.Column;
-import org.embulk.spi.Exec;
-import org.embulk.spi.OutputPlugin;
-import org.embulk.spi.Page;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.Schema;
-import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.spi.type.Type;
-import org.embulk.spi.util.Timestamps;
-import org.embulk.util.aws.credentials.AwsCredentials;
-
-import java.io.IOException;
-import java.util.List;
-
-public class OrcOutputPlugin
-        implements OutputPlugin
-{
-    @Override
-    public ConfigDiff transaction(ConfigSource config,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // retryable (idempotent) output:
-        // return resume(task.dump(), schema, taskCount, control);
-
-        // non-retryable (non-idempotent) output:
-        control.run(task.dump());
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support resuming");
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-            Schema schema, int taskCount,
-            List<TaskReport> successTaskReports)
-
-    {
-    }
-
-    @Override
-    public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        if (task.getOverwrite()) {
-            OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex));
-        }
-
-        final PageReader reader = new PageReader(schema);
-        Writer writer = createWriter(task, schema, taskIndex);
-
-        return new OrcTransactionalPageOutput(reader, writer, task);
-    }
-
-    private String buildPath(PluginTask task, int processorIndex)
-    {
-        final String pathPrefix = task.getPathPrefix();
-        final String pathSuffix = task.getFileNameExtension();
-        final String sequenceFormat = task.getSequenceFormat();
-        return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-    }
-
-    private TypeDescription getSchema(Schema schema)
-    {
-        TypeDescription oschema = TypeDescription.createStruct();
-        for (int i = 0; i < schema.size(); i++) {
-            Column column = schema.getColumn(i);
-            Type type = column.getType();
-            switch (type.getName()) {
-                case "long":
-                    oschema.addField(column.getName(), TypeDescription.createLong());
-                    break;
-                case "double":
-                    oschema.addField(column.getName(), TypeDescription.createDouble());
-                    break;
-                case "boolean":
-                    oschema.addField(column.getName(), TypeDescription.createBoolean());
-                    break;
-                case "string":
-                    oschema.addField(column.getName(), TypeDescription.createString());
-                    break;
-                case "timestamp":
-                    oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                    break;
-                default:
-                    System.out.println("Unsupported type");
-                    break;
-            }
-        }
-        return oschema;
-    }
-
-    private Configuration getHadoopConfiguration(PluginTask task)
-    {
-        Configuration conf = new Configuration();
-
-        // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-        conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-        conf.set("fs.file.impl", LocalFileSystem.class.getName());
-        // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-        AwsCredentials.getAWSCredentialsProvider(task);
-        if (task.getAccessKeyId().isPresent()) {
-            conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-            conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-        }
-        if (task.getSecretAccessKey().isPresent()) {
-            conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-            conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-        }
-        if (task.getEndpoint().isPresent()) {
-            conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-        }
-        return conf;
-    }
-
-    private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-    {
-        final TimestampFormatter[] timestampFormatters = Timestamps
-                .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-        Configuration conf = getHadoopConfiguration(task);
-        TypeDescription oschema = getSchema(schema);
-
-        // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-        Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-        Writer writer = null;
-        try {
-            // Make writerOptions
-            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-            // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-            // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-            writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                    writerOptions.setSchema(oschema)
-                            .version(OrcFile.Version.V_0_12));
-        }
-        catch (IOException e) {
-            Throwables.propagate(e);
-        }
-        return writer;
-    }
-
-    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-    {
-        final Integer bufferSize = task.getBufferSize();
-        final Integer stripSize = task.getStripSize();
-        final Integer blockSize = task.getBlockSize();
-        final String kindString = task.getCompressionKind();
-        CompressionKind kind = CompressionKind.valueOf(kindString);
-        return OrcFile.writerOptions(conf)
-                .bufferSize(bufferSize)
-                .blockSize(blockSize)
-                .stripeSize(stripSize)
-                .compress(kind);
-    }
-
-    class OrcTransactionalPageOutput
-            implements TransactionalPageOutput
-    {
-        private final PageReader reader;
-        private final Writer writer;
-
-        public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-        {
-            this.reader = reader;
-            this.writer = writer;
-        }
-
-        @Override
-        public void add(Page page)
-        {
-            int size = page.getStringReferences().size();
-            final TypeDescription schema = getSchema(reader.getSchema());
-            final VectorizedRowBatch batch = schema.createRowBatch();
-            batch.size = size;
-
-            reader.setPage(page);
-            int i = 0;
-            while (reader.nextRecord()) {
-                reader.getSchema().visitColumns(
-                        new OrcColumnVisitor(reader, batch, i)
-                );
-                i++;
-            }
-            try {
-                writer.addRowBatch(batch);
-                batch.reset();
-            }
-            catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-
-        @Override
-        public void finish()
-        {
-            try {
-                writer.close();
-            }
-            catch (IOException e) {
-                Throwables.propagate(e);
-            }
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-}
data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
DELETED
@@ -1,28 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Throwables;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-class OrcOutputPluginHelper
-{
-    protected OrcOutputPluginHelper()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    static void removeOldFile(String fpath)
-    {
-        Path path = Paths.get(fpath);
-        // TODO: Check local file. not HDFS or S3.
-        try {
-            Files.deleteIfExists(path);
-        }
-        catch (IOException e) {
-            Throwables.propagate(e);
-        }
-    }
-}
data/src/main/java/org/embulk/output/orc/PluginTask.java
DELETED
@@ -1,60 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.util.aws.credentials.AwsCredentialsTask;
-import org.joda.time.DateTimeZone;
-
-import java.util.Map;
-
-public interface PluginTask
-        extends Task, TimestampFormatter.Task, AwsCredentialsTask
-{
-    @Config("path_prefix")
-    String getPathPrefix();
-
-    @Config("file_ext")
-    @ConfigDefault("\".orc\"")
-    String getFileNameExtension();
-
-    @Config("column_options")
-    @ConfigDefault("{}")
-    Map<String, TimestampColumnOption> getColumnOptions();
-
-    @Config("sequence_format")
-    @ConfigDefault("\".%03d\"")
-    String getSequenceFormat();
-
-    // see: https://orc.apache.org/docs/hive-config.html
-    // ORC File options
-    @Config("strip_size")
-    @ConfigDefault("67108864") // 64MB
-    Integer getStripSize();
-
-    @Config("buffer_size")
-    @ConfigDefault("262144") // 256KB
-    Integer getBufferSize();
-
-    @Config("block_size")
-    @ConfigDefault("268435456") // 256MB
-    Integer getBlockSize();
-
-    @Config("compression_kind")
-    @ConfigDefault("ZLIB")
-    public String getCompressionKind();
-
-    @Config("overwrite")
-    @ConfigDefault("false")
-    boolean getOverwrite();
-
-    @Config("default_from_timezone")
-    @ConfigDefault("\"UTC\"")
-    DateTimeZone getDefaultFromTimeZone();
-
-    @Config("endpoint")
-    @ConfigDefault("null")
-    Optional<String> getEndpoint();
-}
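For orientation, the @Config keys in the deleted PluginTask interface above correspond to an Embulk out: section along the following lines. This is an illustrative sketch built from the declared defaults; the path_prefix value is hypothetical, and the packaged example/example.yml (whose contents are not shown in this diff) may differ.

out:
  type: orc
  path_prefix: /tmp/orc-output/sample    # hypothetical output location; ".%03d" + ".orc" is appended per task
  file_ext: .orc                         # default ".orc"
  sequence_format: ".%03d"               # default ".%03d"
  compression_kind: ZLIB                 # default ZLIB
  buffer_size: 262144                    # default, 256KB
  strip_size: 67108864                   # default, 64MB
  block_size: 268435456                  # default, 256MB
  overwrite: false                       # default false
  default_from_timezone: UTC             # default "UTC"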
data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java
DELETED
@@ -1,22 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.joda.time.DateTimeZone;
-
-import java.util.List;
-
-public interface TimestampColumnOption
-        extends Task, TimestampFormatter.TimestampColumnOption
-{
-    @Config("from_timezone")
-    @ConfigDefault("null")
-    Optional<DateTimeZone> getFromTimeZone();
-
-    @Config("from_format")
-    @ConfigDefault("null")
-    Optional<List<String>> getFromFormat();
-}
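The column_options map declared in PluginTask pairs column names with this TimestampColumnOption, so timestamp parsing can be tuned per column. A minimal sketch, assuming a hypothetical timestamp column named created_at:

out:
  type: orc
  path_prefix: /tmp/orc-output/sample    # hypothetical output location
  default_from_timezone: UTC
  column_options:
    created_at:                          # hypothetical timestamp column name
      from_timezone: Asia/Tokyo          # overrides default_from_timezone for this column
      from_format: ["%Y-%m-%d %H:%M:%S"] # list of candidate parse formats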