embulk-output-orc 0.3.0 → 0.3.5

.travis.yml (deleted)
@@ -1,14 +0,0 @@
- language: java
- jdk:
-   - oraclejdk8
-
- cache:
-   directories: # run "travis cache --delete" to delete caches
-     - $HOME/.gradle
-
- sudo: false
- script:
-   - ./gradlew --info checkstyle
-   - ./gradlew --info check
-
- after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
OrcColumnVisitor.java (deleted)
@@ -1,82 +0,0 @@
- package org.embulk.output.orc;
-
- import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.embulk.spi.Column;
- import org.embulk.spi.ColumnVisitor;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.time.Timestamp;
-
- import java.nio.charset.StandardCharsets;
-
- public class OrcColumnVisitor
-         implements ColumnVisitor
- {
-     private final PageReader reader;
-     private final VectorizedRowBatch batch;
-     private final Integer i;
-
-     public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-     {
-         this.reader = pageReader;
-         this.batch = rowBatch;
-         this.i = i;
-     }
-
-     @Override
-     public void booleanColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-         }
-         else {
-             if (reader.getBoolean(column)) {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-             }
-             else {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-             }
-         }
-     }
-
-     @Override
-     public void longColumn(Column column)
-     {
-         ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-     }
-
-     @Override
-     public void doubleColumn(Column column)
-     {
-         ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-     }
-
-     @Override
-     public void stringColumn(Column column)
-     {
-         ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
-                 reader.getString(column).getBytes(StandardCharsets.UTF_8));
-     }
-
-     @Override
-     public void timestampColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-         }
-         else {
-             Timestamp timestamp = reader.getTimestamp(column);
-             java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-         }
-     }
-
-     @Override
-     public void jsonColumn(Column column)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support json type");
-     }
- }
OrcOutputPlugin.java (deleted)
@@ -1,249 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Throwables;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.LocalFileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hdfs.DistributedFileSystem;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.apache.hadoop.util.VersionInfo;
- import org.apache.orc.CompressionKind;
- import org.apache.orc.OrcFile;
- import org.apache.orc.TypeDescription;
- import org.apache.orc.Writer;
- import org.embulk.config.ConfigDiff;
- import org.embulk.config.ConfigSource;
- import org.embulk.config.TaskReport;
- import org.embulk.config.TaskSource;
- import org.embulk.spi.Column;
- import org.embulk.spi.Exec;
- import org.embulk.spi.OutputPlugin;
- import org.embulk.spi.Page;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.Schema;
- import org.embulk.spi.TransactionalPageOutput;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.spi.type.Type;
- import org.embulk.spi.util.Timestamps;
- import org.embulk.util.aws.credentials.AwsCredentials;
-
- import java.io.IOException;
- import java.util.List;
-
- public class OrcOutputPlugin
-         implements OutputPlugin
- {
-     @Override
-     public ConfigDiff transaction(ConfigSource config,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         PluginTask task = config.loadConfig(PluginTask.class);
-
-         // retryable (idempotent) output:
-         // return resume(task.dump(), schema, taskCount, control);
-
-         // non-retryable (non-idempotent) output:
-         control.run(task.dump());
-         return Exec.newConfigDiff();
-     }
-
-     @Override
-     public ConfigDiff resume(TaskSource taskSource,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support resuming");
-     }
-
-     @Override
-     public void cleanup(TaskSource taskSource,
-             Schema schema, int taskCount,
-             List<TaskReport> successTaskReports)
-
-     {
-     }
-
-     @Override
-     public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-     {
-         PluginTask task = taskSource.loadTask(PluginTask.class);
-
-         if (task.getOverwrite()) {
-             OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex));
-         }
-
-         final PageReader reader = new PageReader(schema);
-         Writer writer = createWriter(task, schema, taskIndex);
-
-         return new OrcTransactionalPageOutput(reader, writer, task);
-     }
-
-     private String buildPath(PluginTask task, int processorIndex)
-     {
-         final String pathPrefix = task.getPathPrefix();
-         final String pathSuffix = task.getFileNameExtension();
-         final String sequenceFormat = task.getSequenceFormat();
-         return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-     }
-
-     private TypeDescription getSchema(Schema schema)
-     {
-         TypeDescription oschema = TypeDescription.createStruct();
-         for (int i = 0; i < schema.size(); i++) {
-             Column column = schema.getColumn(i);
-             Type type = column.getType();
-             switch (type.getName()) {
-                 case "long":
-                     oschema.addField(column.getName(), TypeDescription.createLong());
-                     break;
-                 case "double":
-                     oschema.addField(column.getName(), TypeDescription.createDouble());
-                     break;
-                 case "boolean":
-                     oschema.addField(column.getName(), TypeDescription.createBoolean());
-                     break;
-                 case "string":
-                     oschema.addField(column.getName(), TypeDescription.createString());
-                     break;
-                 case "timestamp":
-                     oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                     break;
-                 default:
-                     System.out.println("Unsupported type");
-                     break;
-             }
-         }
-         return oschema;
-     }
-
-     private Configuration getHadoopConfiguration(PluginTask task)
-     {
-         Configuration conf = new Configuration();
-
-         // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-         conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-         conf.set("fs.file.impl", LocalFileSystem.class.getName());
-         // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-         AwsCredentials.getAWSCredentialsProvider(task);
-         if (task.getAccessKeyId().isPresent()) {
-             conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-             conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-         }
-         if (task.getSecretAccessKey().isPresent()) {
-             conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-             conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-         }
-         if (task.getEndpoint().isPresent()) {
-             conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-         }
-         return conf;
-     }
-
-     private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-     {
-         final TimestampFormatter[] timestampFormatters = Timestamps
-                 .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-         Configuration conf = getHadoopConfiguration(task);
-         TypeDescription oschema = getSchema(schema);
-
-         // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-         Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-         Writer writer = null;
-         try {
-             // Make writerOptions
-             OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-             writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                     writerOptions.setSchema(oschema)
-                             .version(OrcFile.Version.V_0_12));
-         }
-         catch (IOException e) {
-             Throwables.propagate(e);
-         }
-         return writer;
-     }
-
-     private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-     {
-         final Integer bufferSize = task.getBufferSize();
-         final Integer stripSize = task.getStripSize();
-         final Integer blockSize = task.getBlockSize();
-         final String kindString = task.getCompressionKind();
-         CompressionKind kind = CompressionKind.valueOf(kindString);
-         return OrcFile.writerOptions(conf)
-                 .bufferSize(bufferSize)
-                 .blockSize(blockSize)
-                 .stripeSize(stripSize)
-                 .compress(kind);
-     }
-
-     class OrcTransactionalPageOutput
-             implements TransactionalPageOutput
-     {
-         private final PageReader reader;
-         private final Writer writer;
-
-         public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-         {
-             this.reader = reader;
-             this.writer = writer;
-         }
-
-         @Override
-         public void add(Page page)
-         {
-             int size = page.getStringReferences().size();
-             final TypeDescription schema = getSchema(reader.getSchema());
-             final VectorizedRowBatch batch = schema.createRowBatch();
-             batch.size = size;
-
-             reader.setPage(page);
-             int i = 0;
-             while (reader.nextRecord()) {
-                 reader.getSchema().visitColumns(
-                         new OrcColumnVisitor(reader, batch, i)
-                 );
-                 i++;
-             }
-             try {
-                 writer.addRowBatch(batch);
-                 batch.reset();
-             }
-             catch (IOException e) {
-                 e.printStackTrace();
-             }
-         }
-
-         @Override
-         public void finish()
-         {
-             try {
-                 writer.close();
-             }
-             catch (IOException e) {
-                 Throwables.propagate(e);
-             }
-         }
-
-         @Override
-         public void close()
-         {
-         }
-
-         @Override
-         public void abort()
-         {
-         }
-
-         @Override
-         public TaskReport commit()
-         {
-             return Exec.newTaskReport();
-         }
-     }
- }
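
For orientation, the deleted OrcOutputPlugin above is essentially a thin wrapper around the core org.apache.orc writer API: it maps the Embulk Schema to a TypeDescription, opens a Writer with OrcFile.createWriter, fills one VectorizedRowBatch per Page via OrcColumnVisitor, and flushes it with addRowBatch. The following is a minimal, self-contained sketch of that flow, not the plugin's code; the schema, output path, and row values are hypothetical.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.IOException;

public class OrcWriterSketch
{
    public static void main(String[] args)
            throws IOException
    {
        // Hypothetical schema and output path, for illustration only.
        TypeDescription schema = TypeDescription.fromString("struct<id:bigint,score:double>");

        Writer writer = OrcFile.createWriter(
                new Path("/tmp/example.orc"),
                OrcFile.writerOptions(new Configuration())
                        .setSchema(schema)
                        .compress(CompressionKind.ZLIB));

        // The deleted plugin fills one batch per Embulk Page; here two rows are filled by hand.
        VectorizedRowBatch batch = schema.createRowBatch();
        for (int row = 0; row < 2; row++) {
            ((LongColumnVector) batch.cols[0]).vector[row] = row;
            ((DoubleColumnVector) batch.cols[1]).vector[row] = row * 1.5;
        }
        batch.size = 2;

        writer.addRowBatch(batch);
        writer.close();
    }
}
```

On top of this flow, the plugin's createWriterOptions also sets bufferSize, blockSize, stripeSize, and the compression kind from the task configuration shown in PluginTask.java below.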
OrcOutputPluginHelper.java (deleted)
@@ -1,28 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Throwables;
-
- import java.io.IOException;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
-
- class OrcOutputPluginHelper
- {
-     protected OrcOutputPluginHelper()
-     {
-         throw new UnsupportedOperationException();
-     }
-
-     static void removeOldFile(String fpath)
-     {
-         Path path = Paths.get(fpath);
-         // TODO: Check local file. not HDFS or S3.
-         try {
-             Files.deleteIfExists(path);
-         }
-         catch (IOException e) {
-             Throwables.propagate(e);
-         }
-     }
- }
PluginTask.java (deleted)
@@ -1,60 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.util.aws.credentials.AwsCredentialsTask;
- import org.joda.time.DateTimeZone;
-
- import java.util.Map;
-
- public interface PluginTask
-         extends Task, TimestampFormatter.Task, AwsCredentialsTask
- {
-     @Config("path_prefix")
-     String getPathPrefix();
-
-     @Config("file_ext")
-     @ConfigDefault("\".orc\"")
-     String getFileNameExtension();
-
-     @Config("column_options")
-     @ConfigDefault("{}")
-     Map<String, TimestampColumnOption> getColumnOptions();
-
-     @Config("sequence_format")
-     @ConfigDefault("\".%03d\"")
-     String getSequenceFormat();
-
-     // see: https://orc.apache.org/docs/hive-config.html
-     // ORC File options
-     @Config("strip_size")
-     @ConfigDefault("67108864") // 64MB
-     Integer getStripSize();
-
-     @Config("buffer_size")
-     @ConfigDefault("262144") // 256KB
-     Integer getBufferSize();
-
-     @Config("block_size")
-     @ConfigDefault("268435456") // 256MB
-     Integer getBlockSize();
-
-     @Config("compression_kind")
-     @ConfigDefault("ZLIB")
-     public String getCompressionKind();
-
-     @Config("overwrite")
-     @ConfigDefault("false")
-     boolean getOverwrite();
-
-     @Config("default_from_timezone")
-     @ConfigDefault("\"UTC\"")
-     DateTimeZone getDefaultFromTimeZone();
-
-     @Config("endpoint")
-     @ConfigDefault("null")
-     Optional<String> getEndpoint();
- }
TimestampColumnOption.java (deleted)
@@ -1,22 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.joda.time.DateTimeZone;
-
- import java.util.List;
-
- public interface TimestampColumnOption
-         extends Task, TimestampFormatter.TimestampColumnOption
- {
-     @Config("from_timezone")
-     @ConfigDefault("null")
-     Optional<DateTimeZone> getFromTimeZone();
-
-     @Config("from_format")
-     @ConfigDefault("null")
-     Optional<List<String>> getFromFormat();
- }
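
Taken together, PluginTask and TimestampColumnOption define the plugin's configuration surface. Below is a hedged sketch of how these options might appear in an Embulk config file; the output type name `orc`, the column name, and all values are illustrative assumptions, and any option left out falls back to the @ConfigDefault values shown above.

```yaml
out:
  type: orc                       # assumed plugin name for embulk-output-orc
  path_prefix: /tmp/orc/sample    # hypothetical destination prefix
  file_ext: .orc                  # default, shown for clarity
  sequence_format: ".%03d"        # buildPath appends the task index: sample.000.orc, sample.001.orc, ...
  compression_kind: ZLIB          # passed to CompressionKind.valueOf(...)
  overwrite: true                 # triggers OrcOutputPluginHelper.removeOldFile before writing
  column_options:
    created_at:                   # hypothetical timestamp column
      from_timezone: "Asia/Tokyo"
      from_format: ["%Y-%m-%d %H:%M:%S"]
```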