embulk-output-orc 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: c6d1115e79d4012717df2f43f447bfbdd1518a88
-   data.tar.gz: 80e8e7f5e470724a33125b4ecce74112ef098d6b
+   metadata.gz: 3c842edfe45c7e992faae16afd3331c7f8ecf256
+   data.tar.gz: a6b3d098e7b012a07f4870e2fc92c898b09e1560
  SHA512:
-   metadata.gz: 290aec04feda06d83ddd9cd2995d62a186fbfc3399b90ede431e54eec8e34aac606077919610f446e703ee820e7625ed1d79f9d08378fccd6fcb052f15e8ab31
-   data.tar.gz: 899435e450c217c4f8b08fa9ea617efb64b5628ba0cb42b37849a11e5363b96889b67e595b51f2f3b2b396386166d5a5ddd96fd34cfa761f735572262b59f716
+   metadata.gz: 3bf5bc9e310496191419ee1e9a76cf9912321e9df2d178081c324ed46f83e7600e04912df59a7ebf38e0e3761be4b8ea1931cae549d681a8ba5b6d35b1e19990
+   data.tar.gz: bbec4349adf56b4c684084a39f61eee8dd41152a11de02c2b5573e5e933cf8d76f280e7110f030cb31b157fdfc9e7fa457f575cd8a6b5b4d8df8ea484a51f358
build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
      runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
  }
  
- version = "0.0.1"
+ version = "0.0.2"
  
  sourceCompatibility = 1.8
  targetCompatibility = 1.8
@@ -54,3 +54,6 @@ exec:
  out:
    type: orc
    path_prefix: "/tmp/output"
+   buffer_size: 8000
+   strip_size: 90000
+   compression_kind: ZLIB
OrcColumnVisitor.java CHANGED
@@ -3,66 +3,82 @@ package org.embulk.output.orc;
  import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
  import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
  import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+ import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
  import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
  import org.embulk.spi.Column;
  import org.embulk.spi.ColumnVisitor;
- import org.embulk.spi.Page;
  import org.embulk.spi.PageReader;
+ import org.embulk.spi.time.Timestamp;
  
- public class OrcColumnVisitor implements ColumnVisitor
+ public class OrcColumnVisitor
+         implements ColumnVisitor
  {
      private PageReader reader;
-     VectorizedRowBatch batch;
-     Integer finalI;
+     private VectorizedRowBatch batch;
+     private Integer i;
  
-     public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Page page, Integer i)
+     public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
      {
-         int size = page.getStringReferences().size();
-
          this.reader = pageReader;
          this.batch = rowBatch;
-         this.finalI = i;
+         this.i = i;
      }
  
      @Override
      public void booleanColumn(Column column)
      {
          if (reader.isNull(column)) {
-             ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
+             ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
          }
          else {
-             ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
+             // TODO; Fix all true bug
+             if (reader.getBoolean(column)) {
+                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
+             }
+             else {
+                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
+             }
          }
      }
  
      @Override
      public void longColumn(Column column)
      {
-         ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
+         ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
      }
  
      @Override
      public void doubleColumn(Column column)
      {
-         ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
+         ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
      }
  
      @Override
      public void stringColumn(Column column)
      {
-         ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
+         ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
                  reader.getString(column).getBytes());
      }
  
      @Override
      public void timestampColumn(Column column)
      {
-
+         if (reader.isNull(column)) {
+             ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
+         }
+         else {
+             Timestamp timestamp = reader.getTimestamp(column);
+             if (!timestamp.equals("")) {
+                 java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
+                 ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
+             }
+             // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
+         }
      }
  
      @Override
      public void jsonColumn(Column column)
      {
-         // throw unsupported
+         throw new UnsupportedOperationException("orc output plugin does not support json type");
      }
  }
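For context on the new timestamp path above: each Embulk timestamp is written into ORC's TimestampColumnVector as a java.sql.Timestamp built from whole epoch seconds. The following standalone sketch (not part of the plugin; the class name and literal values are illustrative stand-ins) shows that same conversion against the vector class directly:

import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;

public class TimestampColumnSketch
{
    public static void main(String[] args)
    {
        // One column vector holding up to 1024 rows, as inside a VectorizedRowBatch.
        TimestampColumnVector col = new TimestampColumnVector(1024);

        // What the plugin does per row: epoch seconds -> java.sql.Timestamp (milliseconds).
        long epochSecond = 1500000000L;  // stand-in for Embulk's Timestamp.getEpochSecond()
        col.set(0, new java.sql.Timestamp(epochSecond * 1000));

        // Null rows are flagged on the vector itself.
        col.setNullValue(1);
        col.isNull[1] = true;
        col.noNulls = false;

        System.out.println(col.getTime(0));  // 1500000000000
    }
}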
OrcOutputPlugin.java CHANGED
@@ -6,10 +6,6 @@ import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.LocalFileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.hdfs.DistributedFileSystem;
- import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
  import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
  import org.apache.hadoop.util.VersionInfo;
  import org.apache.orc.CompressionKind;
@@ -24,14 +20,12 @@ import org.embulk.config.Task;
  import org.embulk.config.TaskReport;
  import org.embulk.config.TaskSource;
  import org.embulk.spi.Column;
- import org.embulk.spi.ColumnVisitor;
  import org.embulk.spi.Exec;
  import org.embulk.spi.OutputPlugin;
  import org.embulk.spi.Page;
  import org.embulk.spi.PageReader;
  import org.embulk.spi.Schema;
  import org.embulk.spi.TransactionalPageOutput;
- import org.embulk.spi.time.Timestamp;
  import org.embulk.spi.time.TimestampFormatter;
  import org.embulk.spi.type.Type;
  import org.embulk.spi.util.Timestamps;
@@ -64,6 +58,19 @@ public class OrcOutputPlugin
          @ConfigDefault("\".%03d\"")
          String getSequenceFormat();
  
+         // ORC File options
+         @Config("strip_size")
+         @ConfigDefault("100000")
+         Integer getStripSize();
+
+         @Config("buffer_size")
+         @ConfigDefault("10000")
+         Integer getBufferSize();
+
+         @Config("compression_kind")
+         @ConfigDefault("ZLIB")
+         public String getCompressionKind();
+
          @Config("overwrite")
          @ConfigDefault("false")
          boolean getOverwrite();
@@ -191,12 +198,12 @@ public class OrcOutputPlugin
  
          Writer writer = null;
          try {
+             // Make writerOptions
+             OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
              // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
              // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
              writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                     OrcFile.writerOptions(conf)
-                             .setSchema(oschema)
-                             .compress(CompressionKind.ZLIB)
+                     writerOptions.setSchema(oschema)
                              .version(OrcFile.Version.V_0_12));
          }
          catch (IOException e) {
@@ -205,6 +212,35 @@ public class OrcOutputPlugin
          return writer;
      }
  
+     private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
+     {
+         final Integer bufferSize = task.getBufferSize();
+         final Integer stripSize = task.getStripSize();
+         final String kindString = task.getCompressionKind();
+         CompressionKind kind;
+         switch (kindString) {
+             case "ZLIB":
+                 kind = CompressionKind.ZLIB;
+                 break;
+             case "SNAPPY":
+                 kind = CompressionKind.SNAPPY;
+                 break;
+             case "LZO":
+                 kind = CompressionKind.LZO;
+                 break;
+             case "LZ4":
+                 kind = CompressionKind.LZ4;
+                 break;
+             default:
+                 kind = CompressionKind.NONE;
+                 break;
+         }
+         return OrcFile.writerOptions(conf).
+                 bufferSize(bufferSize)
+                 .stripeSize(stripSize)
+                 .compress(kind);
+     }
+
      class OrcTransactionalPageOutput
              implements TransactionalPageOutput
      {
@@ -225,78 +261,18 @@ public class OrcOutputPlugin
          @Override
          public void add(Page page)
          {
-             List<String> strings = page.getStringReferences();
+             int size = page.getStringReferences().size();
              TypeDescription schema = getSchema(reader.getSchema());
              VectorizedRowBatch batch = schema.createRowBatch();
-             batch.size = strings.size();
+             batch.size = size;
  
              reader.setPage(page);
              int i = 0;
              while (reader.nextRecord()) {
                  // batch.size = page.getStringReferences().size();
-                 final int finalI = i;
-
-                 reader.getSchema().visitColumns(new ColumnVisitor()
-                 {
-
-                     @Override
-                     public void booleanColumn(Column column)
-                     {
-                         if (reader.isNull(column)) {
-                             ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                         }
-                         else {
-                             // TODO; Fix all true bug
-                             if (reader.getBoolean(column)) {
-                                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 1;
-                             }
-                             else {
-                                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                             }
-                         }
-                     }
-
-                     @Override
-                     public void longColumn(Column column)
-                     {
-                         ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
-                     }
-
-                     @Override
-                     public void doubleColumn(Column column)
-                     {
-                         ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
-                     }
-
-                     @Override
-                     public void stringColumn(Column column)
-                     {
-                         ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
-                                 reader.getString(column).getBytes());
-                     }
-
-                     @Override
-                     public void timestampColumn(Column column)
-                     {
-                         if (reader.isNull(column)) {
-                             ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(finalI);
-                         }
-                         else {
-                             Timestamp timestamp = reader.getTimestamp(column);
-                             if (!timestamp.equals("")) {
-                                 java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-                                 ((TimestampColumnVector) batch.cols[column.getIndex()]).set(finalI, ts);
-                             }
-                             // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
-                         }
-                     }
-
-                     @Override
-                     public void jsonColumn(Column column)
-                     {
-                         throw new UnsupportedOperationException("orc output plugin does not support json type");
-                     }
-                 });
+                 reader.getSchema().visitColumns(
+                         new OrcColumnVisitor(reader, batch, i)
+                 );
                  i++;
              }
              try {
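To tie the new options together, here is a minimal standalone sketch (separate from the plugin; the file path, schema string, and literal option values are illustrative stand-ins for what createWriterOptions() derives from the task) of how buffer_size, strip_size, and compression_kind end up on the ORC writer:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.IOException;

public class WriterOptionsSketch
{
    public static void main(String[] args) throws IOException
    {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.fromString("struct<id:bigint,name:string>");

        // Mirrors createWriterOptions(): the three task options map 1:1 onto WriterOptions.
        OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
                .bufferSize(8000)                 // buffer_size
                .stripeSize(90000)                // strip_size
                .compress(CompressionKind.ZLIB);  // compression_kind

        Writer writer = OrcFile.createWriter(new Path("/tmp/output-sketch.orc"),
                options.setSchema(schema).version(OrcFile.Version.V_0_12));
        writer.close();
    }
}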
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-orc
  version: !ruby/object:Gem::Version
-   version: 0.0.1
+   version: 0.0.2
  platform: ruby
  authors:
  - yuokada
@@ -85,7 +85,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-output-orc-0.0.1.jar
+ - classpath/embulk-output-orc-0.0.2.jar
  - classpath/gson-2.2.4.jar
  - classpath/guice-servlet-3.0.jar
  - classpath/hadoop-annotations-2.6.4.jar