embulk-output-orc 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/build.gradle +3 -3
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +59 -21
- metadata +8 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 725b869f19175110dd6e542619f5bfd9c8c0168b
|
4
|
+
data.tar.gz: 9c8016b015f380815cb2ad7127c192243ebfa45a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 197da7bd52b19ccc2c3f30ad0d2c2c36446fa00e48426de9bb333b0ef2a8eddc213f0db947e50215297333a933e5d6d3b63ad0c0388f78e19d984fae6810bb99
|
7
|
+
data.tar.gz: ccf9bc5b8f73c8d0f48e7d17e16dd4b113b3f9b7a1b9178045109ef3476f4cb47f58e0699e4a9516ab81830d59a4bae3be2fba8f936c4cef0f705f6acc384f2d
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -18,7 +18,7 @@ configurations {
|
|
18
18
|
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
19
19
|
}
|
20
20
|
|
21
|
-
version = "0.3.3"
|
21
|
+
version = "0.3.4"
|
22
22
|
|
23
23
|
sourceCompatibility = 1.8
|
24
24
|
targetCompatibility = 1.8
|
@@ -27,8 +27,8 @@ dependencies {
|
|
27
27
|
compile "org.embulk:embulk-core:0.8.34"
|
28
28
|
provided "org.embulk:embulk-core:0.8.34"
|
29
29
|
|
30
|
-
compile "org.apache.orc:orc:1.
|
31
|
-
compile "org.apache.orc:orc-core:1.
|
30
|
+
compile "org.apache.orc:orc:1.5.4"
|
31
|
+
compile "org.apache.orc:orc-core:1.5.4"
|
32
32
|
compile "org.apache.hadoop:hadoop-hdfs:2.7.5"
|
33
33
|
|
34
34
|
compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
|
@@ -9,6 +9,7 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
|
|
9
9
|
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
|
10
10
|
import org.apache.hadoop.util.VersionInfo;
|
11
11
|
import org.apache.orc.CompressionKind;
|
12
|
+
import org.apache.orc.MemoryManager;
|
12
13
|
import org.apache.orc.OrcFile;
|
13
14
|
import org.apache.orc.TypeDescription;
|
14
15
|
import org.apache.orc.Writer;
|
@@ -163,6 +164,7 @@ public class OrcOutputPlugin
|
|
163
164
|
writer = OrcFile.createWriter(
|
164
165
|
new Path(buildPath(task, processorIndex)),
|
165
166
|
writerOptions.setSchema(oschema)
|
167
|
+
.memory(new WriterLocalMemoryManager())
|
166
168
|
.version(OrcFile.Version.V_0_12)
|
167
169
|
);
|
168
170
|
}
|
@@ -201,33 +203,31 @@ public class OrcOutputPlugin
|
|
201
203
|
@Override
|
202
204
|
public void add(Page page)
|
203
205
|
{
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
if (batch.size >= batch.getMaxSize()) {
|
218
|
-
writer.addRowBatch(batch);
|
219
|
-
batch.reset();
|
220
|
-
}
|
221
|
-
}
|
222
|
-
if (batch.size != 0) {
|
206
|
+
try {
|
207
|
+
// int size = page.getStringReferences().size();
|
208
|
+
final TypeDescription schema = getSchema(reader.getSchema());
|
209
|
+
final VectorizedRowBatch batch = schema.createRowBatch();
|
210
|
+
// batch.size = size;
|
211
|
+
|
212
|
+
reader.setPage(page);
|
213
|
+
while (reader.nextRecord()) {
|
214
|
+
final int row = batch.size++;
|
215
|
+
reader.getSchema().visitColumns(
|
216
|
+
new OrcColumnVisitor(reader, batch, row)
|
217
|
+
);
|
218
|
+
if (batch.size >= batch.getMaxSize()) {
|
223
219
|
writer.addRowBatch(batch);
|
224
220
|
batch.reset();
|
225
221
|
}
|
226
222
|
}
|
227
|
-
|
228
|
-
|
223
|
+
if (batch.size != 0) {
|
224
|
+
writer.addRowBatch(batch);
|
225
|
+
batch.reset();
|
229
226
|
}
|
230
227
|
}
|
228
|
+
catch (IOException e) {
|
229
|
+
e.printStackTrace();
|
230
|
+
}
|
231
231
|
}
|
232
232
|
|
233
233
|
@Override
|
@@ -257,4 +257,42 @@ public class OrcOutputPlugin
|
|
257
257
|
return Exec.newTaskReport();
|
258
258
|
}
|
259
259
|
}
|
260
|
+
|
261
|
+
// We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
|
262
|
+
// Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
|
263
|
+
// As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
|
264
|
+
// notifies checkMemory() only to that instance.
|
265
|
+
private static class WriterLocalMemoryManager implements MemoryManager
|
266
|
+
{
|
267
|
+
final long rowsBetweenChecks = 10000;
|
268
|
+
|
269
|
+
private int rowsAddedSinceCheck = 0;
|
270
|
+
Callback boundCallback = null;
|
271
|
+
|
272
|
+
@Override
|
273
|
+
public void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException
|
274
|
+
{
|
275
|
+
if (boundCallback != null) {
|
276
|
+
throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.");
|
277
|
+
}
|
278
|
+
|
279
|
+
boundCallback = callback;
|
280
|
+
}
|
281
|
+
|
282
|
+
@Override
|
283
|
+
public void removeWriter(Path path) throws IOException
|
284
|
+
{
|
285
|
+
boundCallback = null;
|
286
|
+
}
|
287
|
+
|
288
|
+
@Override
|
289
|
+
public void addedRow(int rows) throws IOException
|
290
|
+
{
|
291
|
+
rowsAddedSinceCheck += rows;
|
292
|
+
if (rowsAddedSinceCheck > rowsBetweenChecks) {
|
293
|
+
boundCallback.checkMemory(1);
|
294
|
+
rowsAddedSinceCheck = 0;
|
295
|
+
}
|
296
|
+
}
|
297
|
+
}
|
260
298
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-orc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yuokada
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -50,8 +50,7 @@ files:
|
|
50
50
|
- LICENSE.txt
|
51
51
|
- README.md
|
52
52
|
- build.gradle
|
53
|
-
- classpath/
|
54
|
-
- classpath/aircompressor-0.8.jar
|
53
|
+
- classpath/aircompressor-0.10.jar
|
55
54
|
- classpath/animal-sniffer-annotations-1.14.jar
|
56
55
|
- classpath/apacheds-i18n-2.0.0-M15.jar
|
57
56
|
- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
|
@@ -81,7 +80,7 @@ files:
|
|
81
80
|
- classpath/curator-client-2.7.1.jar
|
82
81
|
- classpath/curator-framework-2.7.1.jar
|
83
82
|
- classpath/curator-recipes-2.7.1.jar
|
84
|
-
- classpath/embulk-output-orc-0.3.3.jar
|
83
|
+
- classpath/embulk-output-orc-0.3.4.jar
|
85
84
|
- classpath/embulk-util-aws-credentials-0.2.8.jar
|
86
85
|
- classpath/error_prone_annotations-2.1.3.jar
|
87
86
|
- classpath/gson-2.2.4.jar
|
@@ -91,7 +90,7 @@ files:
|
|
91
90
|
- classpath/hadoop-aws-2.7.5.jar
|
92
91
|
- classpath/hadoop-common-2.7.5.jar
|
93
92
|
- classpath/hadoop-hdfs-2.7.5.jar
|
94
|
-
- classpath/hive-storage-api-2.
|
93
|
+
- classpath/hive-storage-api-2.6.0.jar
|
95
94
|
- classpath/htrace-core-3.1.0-incubating.jar
|
96
95
|
- classpath/httpclient-4.3.6.jar
|
97
96
|
- classpath/httpcore-4.3.3.jar
|
@@ -101,7 +100,7 @@ files:
|
|
101
100
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
102
101
|
- classpath/jackson-xc-1.8.3.jar
|
103
102
|
- classpath/java-xmlbuilder-0.4.jar
|
104
|
-
- classpath/jaxb-api-2.2.
|
103
|
+
- classpath/jaxb-api-2.2.11.jar
|
105
104
|
- classpath/jaxb-impl-2.2.3-1.jar
|
106
105
|
- classpath/jcl-over-slf4j-1.7.12.jar
|
107
106
|
- classpath/jersey-core-1.9.jar
|
@@ -120,13 +119,13 @@ files:
|
|
120
119
|
- classpath/log4j-1.2.17.jar
|
121
120
|
- classpath/netty-3.7.0.Final.jar
|
122
121
|
- classpath/netty-all-4.0.23.Final.jar
|
123
|
-
- classpath/orc-core-1.
|
122
|
+
- classpath/orc-core-1.5.4.jar
|
123
|
+
- classpath/orc-shims-1.5.4.jar
|
124
124
|
- classpath/paranamer-2.3.jar
|
125
125
|
- classpath/protobuf-java-2.5.0.jar
|
126
126
|
- classpath/servlet-api-2.5-20081211.jar
|
127
127
|
- classpath/servlet-api-2.5.jar
|
128
128
|
- classpath/snappy-java-1.0.4.1.jar
|
129
|
-
- classpath/stax-api-1.0-2.jar
|
130
129
|
- classpath/xercesImpl-2.9.1.jar
|
131
130
|
- classpath/xml-apis-1.3.04.jar
|
132
131
|
- classpath/xmlenc-0.52.jar
|