embulk-output-orc 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +0,0 @@
- language: java
- jdk:
-   - oraclejdk8
-
- cache:
-   directories: # run "travis cache --delete" to delete caches
-     - $HOME/.gradle
-
- sudo: false
- script:
-   - ./gradlew --info checkstyle
-   - ./gradlew --info check
-
- after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
@@ -1,82 +0,0 @@
- package org.embulk.output.orc;
-
- import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.embulk.spi.Column;
- import org.embulk.spi.ColumnVisitor;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.time.Timestamp;
-
- import java.nio.charset.StandardCharsets;
-
- public class OrcColumnVisitor
-         implements ColumnVisitor
- {
-     private final PageReader reader;
-     private final VectorizedRowBatch batch;
-     private final Integer i;
-
-     public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-     {
-         this.reader = pageReader;
-         this.batch = rowBatch;
-         this.i = i;
-     }
-
-     @Override
-     public void booleanColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-         }
-         else {
-             if (reader.getBoolean(column)) {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-             }
-             else {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-             }
-         }
-     }
-
-     @Override
-     public void longColumn(Column column)
-     {
-         ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-     }
-
-     @Override
-     public void doubleColumn(Column column)
-     {
-         ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-     }
-
-     @Override
-     public void stringColumn(Column column)
-     {
-         ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
-                 reader.getString(column).getBytes(StandardCharsets.UTF_8));
-     }
-
-     @Override
-     public void timestampColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-         }
-         else {
-             Timestamp timestamp = reader.getTimestamp(column);
-             java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-         }
-     }
-
-     @Override
-     public void jsonColumn(Column column)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support json type");
-     }
- }
@@ -1,249 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Throwables;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.LocalFileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hdfs.DistributedFileSystem;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.apache.hadoop.util.VersionInfo;
- import org.apache.orc.CompressionKind;
- import org.apache.orc.OrcFile;
- import org.apache.orc.TypeDescription;
- import org.apache.orc.Writer;
- import org.embulk.config.ConfigDiff;
- import org.embulk.config.ConfigSource;
- import org.embulk.config.TaskReport;
- import org.embulk.config.TaskSource;
- import org.embulk.spi.Column;
- import org.embulk.spi.Exec;
- import org.embulk.spi.OutputPlugin;
- import org.embulk.spi.Page;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.Schema;
- import org.embulk.spi.TransactionalPageOutput;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.spi.type.Type;
- import org.embulk.spi.util.Timestamps;
- import org.embulk.util.aws.credentials.AwsCredentials;
-
- import java.io.IOException;
- import java.util.List;
-
- public class OrcOutputPlugin
-         implements OutputPlugin
- {
-     @Override
-     public ConfigDiff transaction(ConfigSource config,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         PluginTask task = config.loadConfig(PluginTask.class);
-
-         // retryable (idempotent) output:
-         // return resume(task.dump(), schema, taskCount, control);
-
-         // non-retryable (non-idempotent) output:
-         control.run(task.dump());
-         return Exec.newConfigDiff();
-     }
-
-     @Override
-     public ConfigDiff resume(TaskSource taskSource,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support resuming");
-     }
-
-     @Override
-     public void cleanup(TaskSource taskSource,
-             Schema schema, int taskCount,
-             List<TaskReport> successTaskReports)
-
-     {
-     }
-
-     @Override
-     public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-     {
-         PluginTask task = taskSource.loadTask(PluginTask.class);
-
-         if (task.getOverwrite()) {
-             OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex));
-         }
-
-         final PageReader reader = new PageReader(schema);
-         Writer writer = createWriter(task, schema, taskIndex);
-
-         return new OrcTransactionalPageOutput(reader, writer, task);
-     }
-
-     private String buildPath(PluginTask task, int processorIndex)
-     {
-         final String pathPrefix = task.getPathPrefix();
-         final String pathSuffix = task.getFileNameExtension();
-         final String sequenceFormat = task.getSequenceFormat();
-         return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-     }
-
-     private TypeDescription getSchema(Schema schema)
-     {
-         TypeDescription oschema = TypeDescription.createStruct();
-         for (int i = 0; i < schema.size(); i++) {
-             Column column = schema.getColumn(i);
-             Type type = column.getType();
-             switch (type.getName()) {
-                 case "long":
-                     oschema.addField(column.getName(), TypeDescription.createLong());
-                     break;
-                 case "double":
-                     oschema.addField(column.getName(), TypeDescription.createDouble());
-                     break;
-                 case "boolean":
-                     oschema.addField(column.getName(), TypeDescription.createBoolean());
-                     break;
-                 case "string":
-                     oschema.addField(column.getName(), TypeDescription.createString());
-                     break;
-                 case "timestamp":
-                     oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                     break;
-                 default:
-                     System.out.println("Unsupported type");
-                     break;
-             }
-         }
-         return oschema;
-     }
-
-     private Configuration getHadoopConfiguration(PluginTask task)
-     {
-         Configuration conf = new Configuration();
-
-         // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-         conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-         conf.set("fs.file.impl", LocalFileSystem.class.getName());
-         // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-         AwsCredentials.getAWSCredentialsProvider(task);
-         if (task.getAccessKeyId().isPresent()) {
-             conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-             conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-         }
-         if (task.getSecretAccessKey().isPresent()) {
-             conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-             conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-         }
-         if (task.getEndpoint().isPresent()) {
-             conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-         }
-         return conf;
-     }
-
-     private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-     {
-         final TimestampFormatter[] timestampFormatters = Timestamps
-                 .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-         Configuration conf = getHadoopConfiguration(task);
-         TypeDescription oschema = getSchema(schema);
-
-         // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-         Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-         Writer writer = null;
-         try {
-             // Make writerOptions
-             OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-             writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                     writerOptions.setSchema(oschema)
-                             .version(OrcFile.Version.V_0_12));
-         }
-         catch (IOException e) {
-             Throwables.propagate(e);
-         }
-         return writer;
-     }
-
-     private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-     {
-         final Integer bufferSize = task.getBufferSize();
-         final Integer stripSize = task.getStripSize();
-         final Integer blockSize = task.getBlockSize();
-         final String kindString = task.getCompressionKind();
-         CompressionKind kind = CompressionKind.valueOf(kindString);
-         return OrcFile.writerOptions(conf)
-                 .bufferSize(bufferSize)
-                 .blockSize(blockSize)
-                 .stripeSize(stripSize)
-                 .compress(kind);
-     }
-
-     class OrcTransactionalPageOutput
-             implements TransactionalPageOutput
-     {
-         private final PageReader reader;
-         private final Writer writer;
-
-         public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-         {
-             this.reader = reader;
-             this.writer = writer;
-         }
-
-         @Override
-         public void add(Page page)
-         {
-             int size = page.getStringReferences().size();
-             final TypeDescription schema = getSchema(reader.getSchema());
-             final VectorizedRowBatch batch = schema.createRowBatch();
-             batch.size = size;
-
-             reader.setPage(page);
-             int i = 0;
-             while (reader.nextRecord()) {
-                 reader.getSchema().visitColumns(
-                         new OrcColumnVisitor(reader, batch, i)
-                 );
-                 i++;
-             }
-             try {
-                 writer.addRowBatch(batch);
-                 batch.reset();
-             }
-             catch (IOException e) {
-                 e.printStackTrace();
-             }
-         }
-
-         @Override
-         public void finish()
-         {
-             try {
-                 writer.close();
-             }
-             catch (IOException e) {
-                 Throwables.propagate(e);
-             }
-         }
-
-         @Override
-         public void close()
-         {
-         }
-
-         @Override
-         public void abort()
-         {
-         }
-
-         @Override
-         public TaskReport commit()
-         {
-             return Exec.newTaskReport();
-         }
-     }
- }
@@ -1,28 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Throwables;
-
- import java.io.IOException;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
-
- class OrcOutputPluginHelper
- {
-     protected OrcOutputPluginHelper()
-     {
-         throw new UnsupportedOperationException();
-     }
-
-     static void removeOldFile(String fpath)
-     {
-         Path path = Paths.get(fpath);
-         // TODO: Check local file. not HDFS or S3.
-         try {
-             Files.deleteIfExists(path);
-         }
-         catch (IOException e) {
-             Throwables.propagate(e);
-         }
-     }
- }
@@ -1,60 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.util.aws.credentials.AwsCredentialsTask;
- import org.joda.time.DateTimeZone;
-
- import java.util.Map;
-
- public interface PluginTask
-         extends Task, TimestampFormatter.Task, AwsCredentialsTask
- {
-     @Config("path_prefix")
-     String getPathPrefix();
-
-     @Config("file_ext")
-     @ConfigDefault("\".orc\"")
-     String getFileNameExtension();
-
-     @Config("column_options")
-     @ConfigDefault("{}")
-     Map<String, TimestampColumnOption> getColumnOptions();
-
-     @Config("sequence_format")
-     @ConfigDefault("\".%03d\"")
-     String getSequenceFormat();
-
-     // see: https://orc.apache.org/docs/hive-config.html
-     // ORC File options
-     @Config("strip_size")
-     @ConfigDefault("67108864") // 64MB
-     Integer getStripSize();
-
-     @Config("buffer_size")
-     @ConfigDefault("262144") // 256KB
-     Integer getBufferSize();
-
-     @Config("block_size")
-     @ConfigDefault("268435456") // 256MB
-     Integer getBlockSize();
-
-     @Config("compression_kind")
-     @ConfigDefault("ZLIB")
-     public String getCompressionKind();
-
-     @Config("overwrite")
-     @ConfigDefault("false")
-     boolean getOverwrite();
-
-     @Config("default_from_timezone")
-     @ConfigDefault("\"UTC\"")
-     DateTimeZone getDefaultFromTimeZone();
-
-     @Config("endpoint")
-     @ConfigDefault("null")
-     Optional<String> getEndpoint();
- }
@@ -1,22 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.joda.time.DateTimeZone;
-
- import java.util.List;
-
- public interface TimestampColumnOption
-         extends Task, TimestampFormatter.TimestampColumnOption
- {
-     @Config("from_timezone")
-     @ConfigDefault("null")
-     Optional<DateTimeZone> getFromTimeZone();
-
-     @Config("from_format")
-     @ConfigDefault("null")
-     Optional<List<String>> getFromFormat();
- }
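
For reference, a minimal sketch of an Embulk output configuration that exercises the options declared in the removed PluginTask and TimestampColumnOption interfaces. The `type: orc` name is assumed from the gem name, the path and the `created_at` column are hypothetical, and the numeric values simply restate the defaults shown in the @ConfigDefault annotations above; this is illustrative, not part of the package diff.

out:
  type: orc                      # assumed plugin type for embulk-output-orc
  path_prefix: /tmp/orc-out/sample   # illustrative path; combined with sequence_format and file_ext by buildPath()
  file_ext: .orc                 # default
  sequence_format: ".%03d"       # default; formatted with the task index
  compression_kind: ZLIB         # default; must be a valid org.apache.orc.CompressionKind name
  buffer_size: 262144            # default (256KB)
  strip_size: 67108864           # default (64MB)
  block_size: 268435456          # default (256MB)
  overwrite: true                # deletes an existing local output file before writing
  default_from_timezone: UTC
  column_options:
    created_at: {from_format: ["%Y-%m-%d %H:%M:%S"], from_timezone: "Asia/Tokyo"}   # hypothetical timestamp column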