embulk-output-orc 0.0.1 → 0.0.2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3c842edfe45c7e992faae16afd3331c7f8ecf256
+  data.tar.gz: a6b3d098e7b012a07f4870e2fc92c898b09e1560
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3bf5bc9e310496191419ee1e9a76cf9912321e9df2d178081c324ed46f83e7600e04912df59a7ebf38e0e3761be4b8ea1931cae549d681a8ba5b6d35b1e19990
+  data.tar.gz: bbec4349adf56b4c684084a39f61eee8dd41152a11de02c2b5573e5e933cf8d76f280e7110f030cb31b157fdfc9e7fa457f575cd8a6b5b4d8df8ea484a51f358
data/build.gradle CHANGED

data/example/example.yml CHANGED

data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java CHANGED
@@ -3,66 +3,82 @@ package org.embulk.output.orc;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.embulk.spi.Column;
 import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
+import org.embulk.spi.time.Timestamp;
 
-public class OrcColumnVisitor
+public class OrcColumnVisitor
+        implements ColumnVisitor
 {
     private PageReader reader;
-    VectorizedRowBatch batch;
-    Integer
+    private VectorizedRowBatch batch;
+    private Integer i;
 
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch,
+    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
     {
-        int size = page.getStringReferences().size();
-
         this.reader = pageReader;
         this.batch = rowBatch;
-        this.
+        this.i = i;
     }
 
     @Override
     public void booleanColumn(Column column)
     {
         if (reader.isNull(column)) {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[
+            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
         }
         else {
-
+            // TODO: Fix all-true bug
+            if (reader.getBoolean(column)) {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
+            }
+            else {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
+            }
         }
     }
 
     @Override
     public void longColumn(Column column)
     {
-        ((LongColumnVector) batch.cols[column.getIndex()]).vector[
+        ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
     }
 
     @Override
     public void doubleColumn(Column column)
     {
-        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[
+        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
     }
 
     @Override
     public void stringColumn(Column column)
     {
-        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(
+        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
                 reader.getString(column).getBytes());
     }
 
     @Override
     public void timestampColumn(Column column)
     {
-
+        if (reader.isNull(column)) {
+            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
+        }
+        else {
+            Timestamp timestamp = reader.getTimestamp(column);
+            if (!timestamp.equals("")) {
+                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
+                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
+            }
+            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
+        }
     }
 
     @Override
     public void jsonColumn(Column column)
     {
-
+        throw new UnsupportedOperationException("orc output plugin does not support json type");
     }
 }
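Why booleanColumn writes into a LongColumnVector: ORC's vectorized row batch has no dedicated boolean vector type, so booleans are encoded as 0/1 longs. A minimal standalone sketch of that write path, assuming orc-core and hadoop-common on the classpath (the file path and schema below are made up for illustration, not taken from the plugin):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcBooleanSketch
{
    public static void main(String[] args) throws Exception
    {
        // "flag" is declared boolean, yet ORC hands back a LongColumnVector for it.
        TypeDescription schema = TypeDescription.fromString("struct<flag:boolean>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/sketch.orc"),
                OrcFile.writerOptions(new Configuration()).setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector flag = (LongColumnVector) batch.cols[0];
        // Mirror what OrcColumnVisitor.booleanColumn does: encode true/false as 1/0
        // at the current row index, then grow the batch.
        flag.vector[batch.size++] = 1; // true
        flag.vector[batch.size++] = 0; // false
        writer.addRowBatch(batch);
        writer.close();
    }
}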
data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java CHANGED

@@ -6,10 +6,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.util.VersionInfo;
 import org.apache.orc.CompressionKind;

@@ -24,14 +20,12 @@ import org.embulk.config.Task;
 import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
 import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
 import org.embulk.spi.Exec;
 import org.embulk.spi.OutputPlugin;
 import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
 import org.embulk.spi.Schema;
 import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.Timestamp;
 import org.embulk.spi.time.TimestampFormatter;
 import org.embulk.spi.type.Type;
 import org.embulk.spi.util.Timestamps;

@@ -64,6 +58,19 @@ public class OrcOutputPlugin
         @ConfigDefault("\".%03d\"")
         String getSequenceFormat();
 
+        // ORC File options
+        @Config("strip_size")
+        @ConfigDefault("100000")
+        Integer getStripSize();
+
+        @Config("buffer_size")
+        @ConfigDefault("10000")
+        Integer getBufferSize();
+
+        @Config("compression_kind")
+        @ConfigDefault("ZLIB")
+        public String getCompressionKind();
+
         @Config("overwrite")
         @ConfigDefault("false")
         boolean getOverwrite();

@@ -191,12 +198,12 @@ public class OrcOutputPlugin
 
         Writer writer = null;
         try {
+            // Make writerOptions
+            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
             writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-
-                    .setSchema(oschema)
-                    .compress(CompressionKind.ZLIB)
+                    writerOptions.setSchema(oschema)
                             .version(OrcFile.Version.V_0_12));
         }
         catch (IOException e) {

@@ -205,6 +212,35 @@ public class OrcOutputPlugin
         return writer;
     }
 
+    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
+    {
+        final Integer bufferSize = task.getBufferSize();
+        final Integer stripSize = task.getStripSize();
+        final String kindString = task.getCompressionKind();
+        CompressionKind kind;
+        switch (kindString) {
+            case "ZLIB":
+                kind = CompressionKind.ZLIB;
+                break;
+            case "SNAPPY":
+                kind = CompressionKind.SNAPPY;
+                break;
+            case "LZO":
+                kind = CompressionKind.LZO;
+                break;
+            case "LZ4":
+                kind = CompressionKind.LZ4;
+                break;
+            default:
+                kind = CompressionKind.NONE;
+                break;
+        }
+        return OrcFile.writerOptions(conf)
+                .bufferSize(bufferSize)
+                .stripeSize(stripSize)
+                .compress(kind);
+    }
+
     class OrcTransactionalPageOutput
             implements TransactionalPageOutput
     {

@@ -225,78 +261,18 @@ public class OrcOutputPlugin
         @Override
         public void add(Page page)
         {
-
+            int size = page.getStringReferences().size();
             TypeDescription schema = getSchema(reader.getSchema());
             VectorizedRowBatch batch = schema.createRowBatch();
-            batch.size =
+            batch.size = size;
 
             reader.setPage(page);
             int i = 0;
             while (reader.nextRecord()) {
                 // batch.size = page.getStringReferences().size();
-
-
-
-                {
-
-                    @Override
-                    public void booleanColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                        }
-                        else {
-                            // TODO: Fix all-true bug
-                            if (reader.getBoolean(column)) {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 1;
-                            }
-                            else {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                            }
-                        }
-                    }
-
-                    @Override
-                    public void longColumn(Column column)
-                    {
-                        ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
-                    }
-
-                    @Override
-                    public void doubleColumn(Column column)
-                    {
-                        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
-                    }
-
-                    @Override
-                    public void stringColumn(Column column)
-                    {
-                        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
-                                reader.getString(column).getBytes());
-                    }
-
-                    @Override
-                    public void timestampColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(finalI);
-                        }
-                        else {
-                            Timestamp timestamp = reader.getTimestamp(column);
-                            if (!timestamp.equals("")) {
-                                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-                                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(finalI, ts);
-                            }
-                            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
-                        }
-                    }
-
-                    @Override
-                    public void jsonColumn(Column column)
-                    {
-                        throw new UnsupportedOperationException("orc output plugin does not support json type");
-                    }
-                });
+                reader.getSchema().visitColumns(
+                        new OrcColumnVisitor(reader, batch, i)
+                );
                 i++;
             }
             try {
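The new strip_size, buffer_size, and compression_kind task options above flow through createWriterOptions into ORC's WriterOptions; any compression name other than ZLIB, SNAPPY, LZO, or LZ4 falls back to NONE. A hypothetical Embulk out: section exercising them (path_prefix is the usual Embulk file-output setting, assumed here rather than shown in this diff):

out:
  type: orc
  path_prefix: /tmp/orc-output/sample  # assumed for illustration; not part of this diff
  overwrite: true
  buffer_size: 10000        # plugin default
  strip_size: 100000        # plugin default; passed to WriterOptions.stripeSize()
  compression_kind: SNAPPY  # ZLIB | SNAPPY | LZO | LZ4; anything else means NONE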
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-orc
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - yuokada

@@ -85,7 +85,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-output-orc-0.0.1.jar
+- classpath/embulk-output-orc-0.0.2.jar
 - classpath/gson-2.2.4.jar
 - classpath/guice-servlet-3.0.jar
 - classpath/hadoop-annotations-2.6.4.jar