embulk-output-orc 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5bf0784f61bbc808d36ebce5e46aaab889b891a3
4
- data.tar.gz: 8937c475721a4f2c347575580982ce2a772f8d63
3
+ metadata.gz: 8b6323f654a352230f551c23017a85e9ceee97ef
4
+ data.tar.gz: 400429eadfbb55ddaaaa2e954bddb54306a31e78
5
5
  SHA512:
6
- metadata.gz: 23a1a87ca07df8ebc6d17575a3abcf58cf9c7eb5cd6569ba62cfd7fa3cb52c42cc27a00e65f99d821c396e897ebaac78c3dc5dfe9ee6a750049c2017f08d9fa5
7
- data.tar.gz: 0c08613e8c5182987a4bbb03ae3a0ce9eddb474a1b8672aa5fbc25e69da4ea0784a9982c6bc3888263eddb68213ec885eb3bb2000aaf187cb94148fa593a780d
6
+ metadata.gz: 8359917c8f9b429189faec83b51aa3fee3abfbfc74130eef9b83489b46e8be4ff40a2ca273c7c17a6dd770bb329cb25195c1e3c85bf103b46acee5b952c7cc66
7
+ data.tar.gz: 3910abc4803eec8a36fc275229a4ff3e5b0f14c4fbf1bdea7071cd49c62b2782025acef0d485987509ea15723713a822abc3a306e570e76c57df2c951be22925
data/README.md CHANGED
@@ -36,14 +36,19 @@
36
36
  out:
37
37
  type: orc
38
38
  path_prefix: "/tmp/output"
39
- buffer_size: 8000
40
- strip_size: 90000
41
39
  compression_kind: ZLIB
42
40
  overwrite: true
43
41
  ```
44
42
 
45
43
  ## ChangeLog
46
44
 
45
+ ### ver 0.3.0
46
+
47
+ - Changed the default values of block_size, buffer_size, and strip_size
48
+
49
+ - The new defaults are the same as Hive's default values.
50
+ (see: https://orc.apache.org/docs/hive-config.html)
51
+
47
52
  ### ver 0.2.0
48
53
 
49
54
  - support: output to s3
@@ -18,7 +18,7 @@ configurations {
18
18
  runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
19
19
  }
20
20
 
21
- version = "0.3.0"
21
+ version = "0.3.1"
22
22
 
23
23
  sourceCompatibility = 1.8
24
24
  targetCompatibility = 1.8
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  in:
3
3
  type: randomj
4
- rows: 1024
4
+ rows: 1024000
5
5
  threads: 1
6
6
  # default_timezone: Asia/Tokyo
7
7
  primary_key: myid
@@ -14,14 +14,12 @@ in:
14
14
  - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
15
15
  - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
16
16
 
17
- #exec:
18
- # max_threads: 6 # run at most 8 tasks concurrently
19
- # min_output_tasks: 2 # disable page scattering
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
20
 
21
21
  out:
22
22
  type: orc
23
23
  overwrite: true
24
24
  path_prefix: "/tmp/output"
25
- buffer_size: 8000
26
- strip_size: 90000
27
25
  compression_kind: ZLIB
@@ -197,25 +197,32 @@ public class OrcOutputPlugin
197
197
  @Override
198
198
  public void add(Page page)
199
199
  {
200
- int size = page.getStringReferences().size();
201
- final TypeDescription schema = getSchema(reader.getSchema());
202
- final VectorizedRowBatch batch = schema.createRowBatch();
203
- batch.size = size;
204
-
205
- reader.setPage(page);
206
- int i = 0;
207
- while (reader.nextRecord()) {
208
- reader.getSchema().visitColumns(
209
- new OrcColumnVisitor(reader, batch, i)
210
- );
211
- i++;
212
- }
213
- try {
214
- writer.addRowBatch(batch);
215
- batch.reset();
216
- }
217
- catch (IOException e) {
218
- e.printStackTrace();
200
+ synchronized (this) {
201
+ try {
202
+ // int size = page.getStringReferences().size();
203
+ final TypeDescription schema = getSchema(reader.getSchema());
204
+ final VectorizedRowBatch batch = schema.createRowBatch();
205
+ // batch.size = size;
206
+
207
+ reader.setPage(page);
208
+ while (reader.nextRecord()) {
209
+ final int row = batch.size++;
210
+ reader.getSchema().visitColumns(
211
+ new OrcColumnVisitor(reader, batch, row)
212
+ );
213
+ if (batch.size >= batch.getMaxSize()) {
214
+ writer.addRowBatch(batch);
215
+ batch.reset();
216
+ }
217
+ }
218
+ if (batch.size != 0) {
219
+ writer.addRowBatch(batch);
220
+ batch.reset();
221
+ }
222
+ }
223
+ catch (IOException e) {
224
+ e.printStackTrace();
225
+ }
219
226
  }
220
227
  }
221
228
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-28 00:00:00.000000000 Z
11
+ date: 2017-12-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -94,7 +94,7 @@ files:
94
94
  - classpath/curator-client-2.7.1.jar
95
95
  - classpath/curator-framework-2.7.1.jar
96
96
  - classpath/curator-recipes-2.7.1.jar
97
- - classpath/embulk-output-orc-0.3.0.jar
97
+ - classpath/embulk-output-orc-0.3.1.jar
98
98
  - classpath/embulk-util-aws-credentials-0.2.8.jar
99
99
  - classpath/gson-2.2.4.jar
100
100
  - classpath/hadoop-annotations-2.7.3.jar