embulk-output-orc 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5bf0784f61bbc808d36ebce5e46aaab889b891a3
4
- data.tar.gz: 8937c475721a4f2c347575580982ce2a772f8d63
3
+ metadata.gz: 8b6323f654a352230f551c23017a85e9ceee97ef
4
+ data.tar.gz: 400429eadfbb55ddaaaa2e954bddb54306a31e78
5
5
  SHA512:
6
- metadata.gz: 23a1a87ca07df8ebc6d17575a3abcf58cf9c7eb5cd6569ba62cfd7fa3cb52c42cc27a00e65f99d821c396e897ebaac78c3dc5dfe9ee6a750049c2017f08d9fa5
7
- data.tar.gz: 0c08613e8c5182987a4bbb03ae3a0ce9eddb474a1b8672aa5fbc25e69da4ea0784a9982c6bc3888263eddb68213ec885eb3bb2000aaf187cb94148fa593a780d
6
+ metadata.gz: 8359917c8f9b429189faec83b51aa3fee3abfbfc74130eef9b83489b46e8be4ff40a2ca273c7c17a6dd770bb329cb25195c1e3c85bf103b46acee5b952c7cc66
7
+ data.tar.gz: 3910abc4803eec8a36fc275229a4ff3e5b0f14c4fbf1bdea7071cd49c62b2782025acef0d485987509ea15723713a822abc3a306e570e76c57df2c951be22925
data/README.md CHANGED
@@ -36,14 +36,19 @@
36
36
  out:
37
37
  type: orc
38
38
  path_prefix: "/tmp/output"
39
- buffer_size: 8000
40
- strip_size: 90000
41
39
  compression_kind: ZLIB
42
40
  overwrite: true
43
41
  ```
44
42
 
45
43
  ## ChangeLog
46
44
 
45
+ ### ver 0.3.0
46
+
47
+ - Changed the default values of block_size, buffer_size, and strip_size.
48
+
49
+ - The new defaults are Hive's default values.
50
+ (see: https://orc.apache.org/docs/hive-config.html)
51
+
47
52
  ### ver 0.2.0
48
53
 
49
54
  - support: output to s3
@@ -18,7 +18,7 @@ configurations {
18
18
  runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
19
19
  }
20
20
 
21
- version = "0.3.0"
21
+ version = "0.3.1"
22
22
 
23
23
  sourceCompatibility = 1.8
24
24
  targetCompatibility = 1.8
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  in:
3
3
  type: randomj
4
- rows: 1024
4
+ rows: 1024000
5
5
  threads: 1
6
6
  # default_timezone: Asia/Tokyo
7
7
  primary_key: myid
@@ -14,14 +14,12 @@ in:
14
14
  - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
15
15
  - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
16
16
 
17
- #exec:
18
- # max_threads: 6 # run at most 8 tasks concurrently
19
- # min_output_tasks: 2 # disable page scattering
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
20
 
21
21
  out:
22
22
  type: orc
23
23
  overwrite: true
24
24
  path_prefix: "/tmp/output"
25
- buffer_size: 8000
26
- strip_size: 90000
27
25
  compression_kind: ZLIB
@@ -197,25 +197,32 @@ public class OrcOutputPlugin
197
197
  @Override
198
198
  public void add(Page page)
199
199
  {
200
- int size = page.getStringReferences().size();
201
- final TypeDescription schema = getSchema(reader.getSchema());
202
- final VectorizedRowBatch batch = schema.createRowBatch();
203
- batch.size = size;
204
-
205
- reader.setPage(page);
206
- int i = 0;
207
- while (reader.nextRecord()) {
208
- reader.getSchema().visitColumns(
209
- new OrcColumnVisitor(reader, batch, i)
210
- );
211
- i++;
212
- }
213
- try {
214
- writer.addRowBatch(batch);
215
- batch.reset();
216
- }
217
- catch (IOException e) {
218
- e.printStackTrace();
200
+ synchronized (this) {
201
+ try {
202
+ // int size = page.getStringReferences().size();
203
+ final TypeDescription schema = getSchema(reader.getSchema());
204
+ final VectorizedRowBatch batch = schema.createRowBatch();
205
+ // batch.size = size;
206
+
207
+ reader.setPage(page);
208
+ while (reader.nextRecord()) {
209
+ final int row = batch.size++;
210
+ reader.getSchema().visitColumns(
211
+ new OrcColumnVisitor(reader, batch, row)
212
+ );
213
+ if (batch.size >= batch.getMaxSize()) {
214
+ writer.addRowBatch(batch);
215
+ batch.reset();
216
+ }
217
+ }
218
+ if (batch.size != 0) {
219
+ writer.addRowBatch(batch);
220
+ batch.reset();
221
+ }
222
+ }
223
+ catch (IOException e) {
224
+ e.printStackTrace();
225
+ }
219
226
  }
220
227
  }
221
228
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-28 00:00:00.000000000 Z
11
+ date: 2017-12-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -94,7 +94,7 @@ files:
94
94
  - classpath/curator-client-2.7.1.jar
95
95
  - classpath/curator-framework-2.7.1.jar
96
96
  - classpath/curator-recipes-2.7.1.jar
97
- - classpath/embulk-output-orc-0.3.0.jar
97
+ - classpath/embulk-output-orc-0.3.1.jar
98
98
  - classpath/embulk-util-aws-credentials-0.2.8.jar
99
99
  - classpath/gson-2.2.4.jar
100
100
  - classpath/hadoop-annotations-2.7.3.jar