embulk-output-orc 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3c842edfe45c7e992faae16afd3331c7f8ecf256
+  data.tar.gz: a6b3d098e7b012a07f4870e2fc92c898b09e1560
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3bf5bc9e310496191419ee1e9a76cf9912321e9df2d178081c324ed46f83e7600e04912df59a7ebf38e0e3761be4b8ea1931cae549d681a8ba5b6d35b1e19990
+  data.tar.gz: bbec4349adf56b4c684084a39f61eee8dd41152a11de02c2b5573e5e933cf8d76f280e7110f030cb31b157fdfc9e7fa457f575cd8a6b5b4d8df8ea484a51f358
data/build.gradle
CHANGED
data/example/example.yml
CHANGED
@@ -3,66 +3,82 @@ package org.embulk.output.orc;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.embulk.spi.Column;
 import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
+import org.embulk.spi.time.Timestamp;
 
-public class OrcColumnVisitor
+public class OrcColumnVisitor
+        implements ColumnVisitor
 {
     private PageReader reader;
-    VectorizedRowBatch batch;
-    Integer
+    private VectorizedRowBatch batch;
+    private Integer i;
 
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch,
+    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
     {
-        int size = page.getStringReferences().size();
-
         this.reader = pageReader;
         this.batch = rowBatch;
-        this.
+        this.i = i;
     }
 
     @Override
     public void booleanColumn(Column column)
     {
         if (reader.isNull(column)) {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[
+            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
         }
         else {
-
+            // TODO; Fix all true bug
+            if (reader.getBoolean(column)) {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
+            }
+            else {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
+            }
         }
     }
 
     @Override
     public void longColumn(Column column)
     {
-        ((LongColumnVector) batch.cols[column.getIndex()]).vector[
+        ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
     }
 
     @Override
     public void doubleColumn(Column column)
     {
-        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[
+        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
     }
 
     @Override
     public void stringColumn(Column column)
     {
-        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(
+        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
                 reader.getString(column).getBytes());
     }
 
     @Override
     public void timestampColumn(Column column)
     {
-
+        if (reader.isNull(column)) {
+            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
+        }
+        else {
+            Timestamp timestamp = reader.getTimestamp(column);
+            if (!timestamp.equals("")) {
+                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
+                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
+            }
+            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
+        }
     }
 
     @Override
     public void jsonColumn(Column column)
     {
-
+        throw new UnsupportedOperationException("orc output plugin does not support json type");
     }
 }
@@ -6,10 +6,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.util.VersionInfo;
 import org.apache.orc.CompressionKind;
@@ -24,14 +20,12 @@ import org.embulk.config.Task;
 import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
 import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
 import org.embulk.spi.Exec;
 import org.embulk.spi.OutputPlugin;
 import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
 import org.embulk.spi.Schema;
 import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.Timestamp;
 import org.embulk.spi.time.TimestampFormatter;
 import org.embulk.spi.type.Type;
 import org.embulk.spi.util.Timestamps;
@@ -64,6 +58,19 @@ public class OrcOutputPlugin
     @ConfigDefault("\".%03d\"")
     String getSequenceFormat();
 
+    // ORC File options
+    @Config("strip_size")
+    @ConfigDefault("100000")
+    Integer getStripSize();
+
+    @Config("buffer_size")
+    @ConfigDefault("10000")
+    Integer getBufferSize();
+
+    @Config("compression_kind")
+    @ConfigDefault("ZLIB")
+    public String getCompressionKind();
+
     @Config("overwrite")
     @ConfigDefault("false")
     boolean getOverwrite();
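The three new task options above are translated into ORC writer settings by the createWriterOptions() method added further down in this diff. As a rough illustration only (not taken from this gem's documentation), an out section of an Embulk config using them might look like the snippet below: type: orc follows from the gem name, the option names and defaults come from the @Config/@ConfigDefault annotations above, and path_prefix is a hypothetical key included only to make the example complete.

out:
  type: orc
  path_prefix: /tmp/embulk-orc-out/sample   # hypothetical output location, not defined in this hunk
  strip_size: 100000                        # default per @ConfigDefault above
  buffer_size: 10000                        # default per @ConfigDefault above
  compression_kind: SNAPPY                  # ZLIB (default), SNAPPY, LZO, LZ4; anything else falls back to NONE
  overwrite: true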
@@ -191,12 +198,12 @@ public class OrcOutputPlugin
 
         Writer writer = null;
         try {
+            // Make writerOptions
+            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
             writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-
-                    .setSchema(oschema)
-                    .compress(CompressionKind.ZLIB)
+                    writerOptions.setSchema(oschema)
                             .version(OrcFile.Version.V_0_12));
         }
         catch (IOException e) {
@@ -205,6 +212,35 @@ public class OrcOutputPlugin
         return writer;
     }
 
+    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
+    {
+        final Integer bufferSize = task.getBufferSize();
+        final Integer stripSize = task.getStripSize();
+        final String kindString = task.getCompressionKind();
+        CompressionKind kind;
+        switch (kindString) {
+            case "ZLIB":
+                kind = CompressionKind.ZLIB;
+                break;
+            case "SNAPPY":
+                kind = CompressionKind.SNAPPY;
+                break;
+            case "LZO":
+                kind = CompressionKind.LZO;
+                break;
+            case "LZ4":
+                kind = CompressionKind.LZ4;
+                break;
+            default:
+                kind = CompressionKind.NONE;
+                break;
+        }
+        return OrcFile.writerOptions(conf).
+                bufferSize(bufferSize)
+                .stripeSize(stripSize)
+                .compress(kind);
+    }
+
     class OrcTransactionalPageOutput
             implements TransactionalPageOutput
     {
@@ -225,78 +261,18 @@ public class OrcOutputPlugin
         @Override
         public void add(Page page)
         {
-
+            int size = page.getStringReferences().size();
             TypeDescription schema = getSchema(reader.getSchema());
             VectorizedRowBatch batch = schema.createRowBatch();
-            batch.size =
+            batch.size = size;
 
             reader.setPage(page);
             int i = 0;
             while (reader.nextRecord()) {
                 // batch.size = page.getStringReferences().size();
-
-
-
-                {
-
-                    @Override
-                    public void booleanColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                        }
-                        else {
-                            // TODO; Fix all true bug
-                            if (reader.getBoolean(column)) {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 1;
-                            }
-                            else {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                            }
-                        }
-                    }
-
-                    @Override
-                    public void longColumn(Column column)
-                    {
-                        ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
-                    }
-
-                    @Override
-                    public void doubleColumn(Column column)
-                    {
-                        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
-                    }
-
-                    @Override
-                    public void stringColumn(Column column)
-                    {
-                        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
-                                reader.getString(column).getBytes());
-                    }
-
-                    @Override
-                    public void timestampColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(finalI);
-                        }
-                        else {
-                            Timestamp timestamp = reader.getTimestamp(column);
-                            if (!timestamp.equals("")) {
-                                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-                                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(finalI, ts);
-                            }
-                            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
-                        }
-                    }
-
-                    @Override
-                    public void jsonColumn(Column column)
-                    {
-                        throw new UnsupportedOperationException("orc output plugin does not support json type");
-                    }
-                });
+                reader.getSchema().visitColumns(
+                        new OrcColumnVisitor(reader, batch, i)
+                );
                 i++;
             }
             try {
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-orc
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - yuokada
@@ -85,7 +85,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-output-orc-0.0.1.jar
+- classpath/embulk-output-orc-0.0.2.jar
 - classpath/gson-2.2.4.jar
 - classpath/guice-servlet-3.0.jar
 - classpath/hadoop-annotations-2.6.4.jar