embulk-output-orc 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c6d1115e79d4012717df2f43f447bfbdd1518a88
-  data.tar.gz: 80e8e7f5e470724a33125b4ecce74112ef098d6b
+  metadata.gz: 3c842edfe45c7e992faae16afd3331c7f8ecf256
+  data.tar.gz: a6b3d098e7b012a07f4870e2fc92c898b09e1560
 SHA512:
-  metadata.gz: 290aec04feda06d83ddd9cd2995d62a186fbfc3399b90ede431e54eec8e34aac606077919610f446e703ee820e7625ed1d79f9d08378fccd6fcb052f15e8ab31
-  data.tar.gz: 899435e450c217c4f8b08fa9ea617efb64b5628ba0cb42b37849a11e5363b96889b67e595b51f2f3b2b396386166d5a5ddd96fd34cfa761f735572262b59f716
+  metadata.gz: 3bf5bc9e310496191419ee1e9a76cf9912321e9df2d178081c324ed46f83e7600e04912df59a7ebf38e0e3761be4b8ea1931cae549d681a8ba5b6d35b1e19990
+  data.tar.gz: bbec4349adf56b4c684084a39f61eee8dd41152a11de02c2b5573e5e933cf8d76f280e7110f030cb31b157fdfc9e7fa457f575cd8a6b5b4d8df8ea484a51f358
build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
     runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
 }
 
-version = "0.0.1"
+version = "0.0.2"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
@@ -54,3 +54,6 @@ exec:
 out:
   type: orc
   path_prefix: "/tmp/output"
+  buffer_size: 8000
+  strip_size: 90000
+  compression_kind: ZLIB
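
The three new options correspond one-to-one to ORC writer settings: buffer_size and strip_size are passed to the writer's bufferSize and stripeSize, and compression_kind selects the org.apache.orc CompressionKind (ZLIB, SNAPPY, LZO, or LZ4; any other value falls back to NONE). Their defaults (buffer_size 10000, strip_size 100000, compression_kind ZLIB) are declared in the PluginTask hunk below.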
src/main/java/org/embulk/output/orc/OrcColumnVisitor.java CHANGED
@@ -3,66 +3,82 @@ package org.embulk.output.orc;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.embulk.spi.Column;
 import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
+import org.embulk.spi.time.Timestamp;
 
-public class OrcColumnVisitor implements ColumnVisitor
+public class OrcColumnVisitor
+        implements ColumnVisitor
 {
     private PageReader reader;
-    VectorizedRowBatch batch;
-    Integer finalI;
+    private VectorizedRowBatch batch;
+    private Integer i;
 
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Page page, Integer i)
+    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
     {
-        int size = page.getStringReferences().size();
-
         this.reader = pageReader;
         this.batch = rowBatch;
-        this.finalI = i;
+        this.i = i;
     }
 
     @Override
     public void booleanColumn(Column column)
     {
         if (reader.isNull(column)) {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
+            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
         }
         else {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
+            // TODO; Fix all true bug
+            if (reader.getBoolean(column)) {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
+            }
+            else {
+                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
+            }
         }
     }
 
     @Override
     public void longColumn(Column column)
     {
-        ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
+        ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
     }
 
     @Override
     public void doubleColumn(Column column)
     {
-        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
+        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
     }
 
     @Override
     public void stringColumn(Column column)
     {
-        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
+        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
                 reader.getString(column).getBytes());
     }
 
     @Override
     public void timestampColumn(Column column)
     {
-
+        if (reader.isNull(column)) {
+            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
+        }
+        else {
+            Timestamp timestamp = reader.getTimestamp(column);
+            if (!timestamp.equals("")) {
+                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
+                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
+            }
+            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
+        }
     }
 
     @Override
     public void jsonColumn(Column column)
     {
-        // throw unsupported
+        throw new UnsupportedOperationException("orc output plugin does not support json type");
    }
 }
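
As background for the visitor above: an ORC row batch has no dedicated boolean vector, so booleans are written as 0/1 into a LongColumnVector, and timestamps go through java.sql.Timestamp into a TimestampColumnVector. A minimal, self-contained sketch of that mapping against the org.apache.orc core API (not part of the plugin; the schema, output path, and values are illustrative assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcVectorSketch
{
    public static void main(String[] args) throws Exception
    {
        TypeDescription schema = TypeDescription.fromString("struct<flag:boolean,ts:timestamp>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/sketch.orc"),
                OrcFile.writerOptions(new Configuration()).setSchema(schema));

        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector flags = (LongColumnVector) batch.cols[0];
        TimestampColumnVector ts = (TimestampColumnVector) batch.cols[1];

        int row = batch.size++;
        flags.vector[row] = 1;  // ORC stores booleans as long 0/1
        ts.set(row, new java.sql.Timestamp(System.currentTimeMillis()));

        writer.addRowBatch(batch);
        writer.close();
    }
}

Note that the plugin's conversion builds java.sql.Timestamp from getEpochSecond() * 1000, so any sub-second precision in the Embulk timestamp is dropped.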
src/main/java/org/embulk/output/orc/OrcOutputPlugin.java CHANGED
@@ -6,10 +6,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.util.VersionInfo;
 import org.apache.orc.CompressionKind;
@@ -24,14 +20,12 @@ import org.embulk.config.Task;
 import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
 import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
 import org.embulk.spi.Exec;
 import org.embulk.spi.OutputPlugin;
 import org.embulk.spi.Page;
 import org.embulk.spi.PageReader;
 import org.embulk.spi.Schema;
 import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.Timestamp;
 import org.embulk.spi.time.TimestampFormatter;
 import org.embulk.spi.type.Type;
 import org.embulk.spi.util.Timestamps;
@@ -64,6 +58,19 @@ public class OrcOutputPlugin
         @ConfigDefault("\".%03d\"")
         String getSequenceFormat();
 
+        // ORC File options
+        @Config("strip_size")
+        @ConfigDefault("100000")
+        Integer getStripSize();
+
+        @Config("buffer_size")
+        @ConfigDefault("10000")
+        Integer getBufferSize();
+
+        @Config("compression_kind")
+        @ConfigDefault("ZLIB")
+        public String getCompressionKind();
+
         @Config("overwrite")
         @ConfigDefault("false")
         boolean getOverwrite();
@@ -191,12 +198,12 @@ public class OrcOutputPlugin
 
         Writer writer = null;
         try {
+            // Make writerOptions
+            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
             writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)),
-                    OrcFile.writerOptions(conf)
-                            .setSchema(oschema)
-                            .compress(CompressionKind.ZLIB)
+                    writerOptions.setSchema(oschema)
                             .version(OrcFile.Version.V_0_12));
         }
         catch (IOException e) {
@@ -205,6 +212,35 @@ public class OrcOutputPlugin
         return writer;
     }
 
+    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
+    {
+        final Integer bufferSize = task.getBufferSize();
+        final Integer stripSize = task.getStripSize();
+        final String kindString = task.getCompressionKind();
+        CompressionKind kind;
+        switch (kindString) {
+            case "ZLIB":
+                kind = CompressionKind.ZLIB;
+                break;
+            case "SNAPPY":
+                kind = CompressionKind.SNAPPY;
+                break;
+            case "LZO":
+                kind = CompressionKind.LZO;
+                break;
+            case "LZ4":
+                kind = CompressionKind.LZ4;
+                break;
+            default:
+                kind = CompressionKind.NONE;
+                break;
+        }
+        return OrcFile.writerOptions(conf)
+                .bufferSize(bufferSize)
+                .stripeSize(stripSize)
+                .compress(kind);
+    }
+
     class OrcTransactionalPageOutput
             implements TransactionalPageOutput
     {
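
The switch in createWriterOptions could be written more compactly with the enum's own valueOf; a sketch of that alternative (not the plugin's code), assuming the same fall-back-to-NONE behaviour as the default branch:

import org.apache.orc.CompressionKind;

final class CompressionKinds
{
    // Map a config string such as "SNAPPY" onto the ORC enum; unknown
    // names fall back to NONE, mirroring the switch's default branch.
    static CompressionKind from(String name)
    {
        try {
            return CompressionKind.valueOf(name);
        }
        catch (IllegalArgumentException e) {
            return CompressionKind.NONE;
        }
    }
}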
@@ -225,78 +261,18 @@ public class OrcOutputPlugin
         @Override
         public void add(Page page)
         {
-            List<String> strings = page.getStringReferences();
+            int size = page.getStringReferences().size();
             TypeDescription schema = getSchema(reader.getSchema());
             VectorizedRowBatch batch = schema.createRowBatch();
-            batch.size = strings.size();
+            batch.size = size;
 
             reader.setPage(page);
             int i = 0;
             while (reader.nextRecord()) {
                 // batch.size = page.getStringReferences().size();
-                final int finalI = i;
-
-                reader.getSchema().visitColumns(new ColumnVisitor()
-                {
-
-                    @Override
-                    public void booleanColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                        }
-                        else {
-                            // TODO; Fix all true bug
-                            if (reader.getBoolean(column)) {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 1;
-                            }
-                            else {
-                                ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = 0;
-                            }
-                        }
-                    }
-
-                    @Override
-                    public void longColumn(Column column)
-                    {
-                        ((LongColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getLong(column);
-                    }
-
-                    @Override
-                    public void doubleColumn(Column column)
-                    {
-                        ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[finalI] = reader.getDouble(column);
-                    }
-
-                    @Override
-                    public void stringColumn(Column column)
-                    {
-                        ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(finalI,
-                                reader.getString(column).getBytes());
-                    }
-
-                    @Override
-                    public void timestampColumn(Column column)
-                    {
-                        if (reader.isNull(column)) {
-                            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(finalI);
-                        }
-                        else {
-                            Timestamp timestamp = reader.getTimestamp(column);
-                            if (!timestamp.equals("")) {
-                                java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-                                ((TimestampColumnVector) batch.cols[column.getIndex()]).set(finalI, ts);
-                            }
-                            // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
-                        }
-                    }
-
-                    @Override
-                    public void jsonColumn(Column column)
-                    {
-                        throw new UnsupportedOperationException("orc output plugin does not support json type");
-                    }
-                });
+                reader.getSchema().visitColumns(
+                        new OrcColumnVisitor(reader, batch, i)
+                );
                 i++;
             }
             try {
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-orc
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - yuokada
@@ -85,7 +85,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-output-orc-0.0.1.jar
+- classpath/embulk-output-orc-0.0.2.jar
 - classpath/gson-2.2.4.jar
 - classpath/guice-servlet-3.0.jar
 - classpath/hadoop-annotations-2.6.4.jar