embulk-output-orc 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f90685be3f76457be27d9ee129b56276a66dd42b
4
- data.tar.gz: ac33ac212a7bb6352aef8eb0caf310f5afd25bbe
3
+ metadata.gz: 725b869f19175110dd6e542619f5bfd9c8c0168b
4
+ data.tar.gz: 9c8016b015f380815cb2ad7127c192243ebfa45a
5
5
  SHA512:
6
- metadata.gz: aecde246d738967d2a91906560d9735ea11981e8f7349da57d7418a3ab08a7796fb5573f1e9b72b54ff6eee1a08f4ade83194024bfcf67ff6113ff6f570fb7b6
7
- data.tar.gz: 34b3eb0dc0c01388edf4310f5f55d00b0eab65b95b2def0a888867df8f79e0087c66d70e099d2af6327e57be03a80a75a6135cf281a58f5c4a0b8e22ed7caa1a
6
+ metadata.gz: 197da7bd52b19ccc2c3f30ad0d2c2c36446fa00e48426de9bb333b0ef2a8eddc213f0db947e50215297333a933e5d6d3b63ad0c0388f78e19d984fae6810bb99
7
+ data.tar.gz: ccf9bc5b8f73c8d0f48e7d17e16dd4b113b3f9b7a1b9178045109ef3476f4cb47f58e0699e4a9516ab81830d59a4bae3be2fba8f936c4cef0f705f6acc384f2d
data/README.md CHANGED
@@ -43,6 +43,12 @@ out:
43
43
 
44
44
  ## ChangeLog
45
45
 
46
+ ### ver 0.3.4
47
+
48
+ - Bump `orc` library to `1.5.4`
49
+ - bugfix
50
+ - https://github.com/yuokada/embulk-output-orc/pull/17
51
+
46
52
  ### ver 0.3.3
47
53
 
48
54
  - bugfix
@@ -18,7 +18,7 @@ configurations {
18
18
  runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
19
19
  }
20
20
 
21
- version = "0.3.3"
21
+ version = "0.3.4"
22
22
 
23
23
  sourceCompatibility = 1.8
24
24
  targetCompatibility = 1.8
@@ -27,8 +27,8 @@ dependencies {
27
27
  compile "org.embulk:embulk-core:0.8.34"
28
28
  provided "org.embulk:embulk-core:0.8.34"
29
29
 
30
- compile "org.apache.orc:orc:1.4.4"
31
- compile "org.apache.orc:orc-core:1.4.4"
30
+ compile "org.apache.orc:orc:1.5.4"
31
+ compile "org.apache.orc:orc-core:1.5.4"
32
32
  compile "org.apache.hadoop:hadoop-hdfs:2.7.5"
33
33
 
34
34
  compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
@@ -9,6 +9,7 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
9
9
  import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
10
10
  import org.apache.hadoop.util.VersionInfo;
11
11
  import org.apache.orc.CompressionKind;
12
+ import org.apache.orc.MemoryManager;
12
13
  import org.apache.orc.OrcFile;
13
14
  import org.apache.orc.TypeDescription;
14
15
  import org.apache.orc.Writer;
@@ -163,6 +164,7 @@ public class OrcOutputPlugin
163
164
  writer = OrcFile.createWriter(
164
165
  new Path(buildPath(task, processorIndex)),
165
166
  writerOptions.setSchema(oschema)
167
+ .memory(new WriterLocalMemoryManager())
166
168
  .version(OrcFile.Version.V_0_12)
167
169
  );
168
170
  }
@@ -201,33 +203,31 @@ public class OrcOutputPlugin
201
203
  @Override
202
204
  public void add(Page page)
203
205
  {
204
- synchronized (this) {
205
- try {
206
- // int size = page.getStringReferences().size();
207
- final TypeDescription schema = getSchema(reader.getSchema());
208
- final VectorizedRowBatch batch = schema.createRowBatch();
209
- // batch.size = size;
210
-
211
- reader.setPage(page);
212
- while (reader.nextRecord()) {
213
- final int row = batch.size++;
214
- reader.getSchema().visitColumns(
215
- new OrcColumnVisitor(reader, batch, row)
216
- );
217
- if (batch.size >= batch.getMaxSize()) {
218
- writer.addRowBatch(batch);
219
- batch.reset();
220
- }
221
- }
222
- if (batch.size != 0) {
206
+ try {
207
+ // int size = page.getStringReferences().size();
208
+ final TypeDescription schema = getSchema(reader.getSchema());
209
+ final VectorizedRowBatch batch = schema.createRowBatch();
210
+ // batch.size = size;
211
+
212
+ reader.setPage(page);
213
+ while (reader.nextRecord()) {
214
+ final int row = batch.size++;
215
+ reader.getSchema().visitColumns(
216
+ new OrcColumnVisitor(reader, batch, row)
217
+ );
218
+ if (batch.size >= batch.getMaxSize()) {
223
219
  writer.addRowBatch(batch);
224
220
  batch.reset();
225
221
  }
226
222
  }
227
- catch (IOException e) {
228
- e.printStackTrace();
223
+ if (batch.size != 0) {
224
+ writer.addRowBatch(batch);
225
+ batch.reset();
229
226
  }
230
227
  }
228
+ catch (IOException e) {
229
+ e.printStackTrace();
230
+ }
231
231
  }
232
232
 
233
233
  @Override
@@ -257,4 +257,42 @@ public class OrcOutputPlugin
257
257
  return Exec.newTaskReport();
258
258
  }
259
259
  }
260
+
261
+ // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
262
+ // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
263
+ // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
264
+ // notifies checkMemory() only to that instance.
265
+ private static class WriterLocalMemoryManager implements MemoryManager
266
+ {
267
+ final long rowsBetweenChecks = 10000;
268
+
269
+ private int rowsAddedSinceCheck = 0;
270
+ Callback boundCallback = null;
271
+
272
+ @Override
273
+ public void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException
274
+ {
275
+ if (boundCallback != null) {
276
+ throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.");
277
+ }
278
+
279
+ boundCallback = callback;
280
+ }
281
+
282
+ @Override
283
+ public void removeWriter(Path path) throws IOException
284
+ {
285
+ boundCallback = null;
286
+ }
287
+
288
+ @Override
289
+ public void addedRow(int rows) throws IOException
290
+ {
291
+ rowsAddedSinceCheck += rows;
292
+ if (rowsAddedSinceCheck > rowsBetweenChecks) {
293
+ boundCallback.checkMemory(1);
294
+ rowsAddedSinceCheck = 0;
295
+ }
296
+ }
297
+ }
260
298
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-01-10 00:00:00.000000000 Z
11
+ date: 2019-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -50,8 +50,7 @@ files:
50
50
  - LICENSE.txt
51
51
  - README.md
52
52
  - build.gradle
53
- - classpath/activation-1.1.jar
54
- - classpath/aircompressor-0.8.jar
53
+ - classpath/aircompressor-0.10.jar
55
54
  - classpath/animal-sniffer-annotations-1.14.jar
56
55
  - classpath/apacheds-i18n-2.0.0-M15.jar
57
56
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -81,7 +80,7 @@ files:
81
80
  - classpath/curator-client-2.7.1.jar
82
81
  - classpath/curator-framework-2.7.1.jar
83
82
  - classpath/curator-recipes-2.7.1.jar
84
- - classpath/embulk-output-orc-0.3.3.jar
83
+ - classpath/embulk-output-orc-0.3.4.jar
85
84
  - classpath/embulk-util-aws-credentials-0.2.8.jar
86
85
  - classpath/error_prone_annotations-2.1.3.jar
87
86
  - classpath/gson-2.2.4.jar
@@ -91,7 +90,7 @@ files:
91
90
  - classpath/hadoop-aws-2.7.5.jar
92
91
  - classpath/hadoop-common-2.7.5.jar
93
92
  - classpath/hadoop-hdfs-2.7.5.jar
94
- - classpath/hive-storage-api-2.2.1.jar
93
+ - classpath/hive-storage-api-2.6.0.jar
95
94
  - classpath/htrace-core-3.1.0-incubating.jar
96
95
  - classpath/httpclient-4.3.6.jar
97
96
  - classpath/httpcore-4.3.3.jar
@@ -101,7 +100,7 @@ files:
101
100
  - classpath/jackson-mapper-asl-1.9.13.jar
102
101
  - classpath/jackson-xc-1.8.3.jar
103
102
  - classpath/java-xmlbuilder-0.4.jar
104
- - classpath/jaxb-api-2.2.2.jar
103
+ - classpath/jaxb-api-2.2.11.jar
105
104
  - classpath/jaxb-impl-2.2.3-1.jar
106
105
  - classpath/jcl-over-slf4j-1.7.12.jar
107
106
  - classpath/jersey-core-1.9.jar
@@ -120,13 +119,13 @@ files:
120
119
  - classpath/log4j-1.2.17.jar
121
120
  - classpath/netty-3.7.0.Final.jar
122
121
  - classpath/netty-all-4.0.23.Final.jar
123
- - classpath/orc-core-1.4.4.jar
122
+ - classpath/orc-core-1.5.4.jar
123
+ - classpath/orc-shims-1.5.4.jar
124
124
  - classpath/paranamer-2.3.jar
125
125
  - classpath/protobuf-java-2.5.0.jar
126
126
  - classpath/servlet-api-2.5-20081211.jar
127
127
  - classpath/servlet-api-2.5.jar
128
128
  - classpath/snappy-java-1.0.4.1.jar
129
- - classpath/stax-api-1.0-2.jar
130
129
  - classpath/xercesImpl-2.9.1.jar
131
130
  - classpath/xml-apis-1.3.04.jar
132
131
  - classpath/xmlenc-0.52.jar