embulk-output-orc 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/build.gradle +3 -3
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +59 -21
- metadata +8 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 725b869f19175110dd6e542619f5bfd9c8c0168b
|
4
|
+
data.tar.gz: 9c8016b015f380815cb2ad7127c192243ebfa45a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 197da7bd52b19ccc2c3f30ad0d2c2c36446fa00e48426de9bb333b0ef2a8eddc213f0db947e50215297333a933e5d6d3b63ad0c0388f78e19d984fae6810bb99
|
7
|
+
data.tar.gz: ccf9bc5b8f73c8d0f48e7d17e16dd4b113b3f9b7a1b9178045109ef3476f4cb47f58e0699e4a9516ab81830d59a4bae3be2fba8f936c4cef0f705f6acc384f2d
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -18,7 +18,7 @@ configurations {
|
|
18
18
|
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
19
19
|
}
|
20
20
|
|
21
|
-
version = "0.3.3"
|
21
|
+
version = "0.3.4"
|
22
22
|
|
23
23
|
sourceCompatibility = 1.8
|
24
24
|
targetCompatibility = 1.8
|
@@ -27,8 +27,8 @@ dependencies {
|
|
27
27
|
compile "org.embulk:embulk-core:0.8.34"
|
28
28
|
provided "org.embulk:embulk-core:0.8.34"
|
29
29
|
|
30
|
-
compile "org.apache.orc:orc:1.
|
31
|
-
compile "org.apache.orc:orc-core:1.
|
30
|
+
compile "org.apache.orc:orc:1.5.4"
|
31
|
+
compile "org.apache.orc:orc-core:1.5.4"
|
32
32
|
compile "org.apache.hadoop:hadoop-hdfs:2.7.5"
|
33
33
|
|
34
34
|
compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
|
@@ -9,6 +9,7 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
|
|
9
9
|
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
|
10
10
|
import org.apache.hadoop.util.VersionInfo;
|
11
11
|
import org.apache.orc.CompressionKind;
|
12
|
+
import org.apache.orc.MemoryManager;
|
12
13
|
import org.apache.orc.OrcFile;
|
13
14
|
import org.apache.orc.TypeDescription;
|
14
15
|
import org.apache.orc.Writer;
|
@@ -163,6 +164,7 @@ public class OrcOutputPlugin
|
|
163
164
|
writer = OrcFile.createWriter(
|
164
165
|
new Path(buildPath(task, processorIndex)),
|
165
166
|
writerOptions.setSchema(oschema)
|
167
|
+
.memory(new WriterLocalMemoryManager())
|
166
168
|
.version(OrcFile.Version.V_0_12)
|
167
169
|
);
|
168
170
|
}
|
@@ -201,33 +203,31 @@ public class OrcOutputPlugin
|
|
201
203
|
@Override
|
202
204
|
public void add(Page page)
|
203
205
|
{
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
if (batch.size >= batch.getMaxSize()) {
|
218
|
-
writer.addRowBatch(batch);
|
219
|
-
batch.reset();
|
220
|
-
}
|
221
|
-
}
|
222
|
-
if (batch.size != 0) {
|
206
|
+
try {
|
207
|
+
// int size = page.getStringReferences().size();
|
208
|
+
final TypeDescription schema = getSchema(reader.getSchema());
|
209
|
+
final VectorizedRowBatch batch = schema.createRowBatch();
|
210
|
+
// batch.size = size;
|
211
|
+
|
212
|
+
reader.setPage(page);
|
213
|
+
while (reader.nextRecord()) {
|
214
|
+
final int row = batch.size++;
|
215
|
+
reader.getSchema().visitColumns(
|
216
|
+
new OrcColumnVisitor(reader, batch, row)
|
217
|
+
);
|
218
|
+
if (batch.size >= batch.getMaxSize()) {
|
223
219
|
writer.addRowBatch(batch);
|
224
220
|
batch.reset();
|
225
221
|
}
|
226
222
|
}
|
227
|
-
|
228
|
-
|
223
|
+
if (batch.size != 0) {
|
224
|
+
writer.addRowBatch(batch);
|
225
|
+
batch.reset();
|
229
226
|
}
|
230
227
|
}
|
228
|
+
catch (IOException e) {
|
229
|
+
e.printStackTrace();
|
230
|
+
}
|
231
231
|
}
|
232
232
|
|
233
233
|
@Override
|
@@ -257,4 +257,42 @@ public class OrcOutputPlugin
|
|
257
257
|
return Exec.newTaskReport();
|
258
258
|
}
|
259
259
|
}
|
260
|
+
|
261
|
+
// We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
|
262
|
+
// Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
|
263
|
+
// As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
|
264
|
+
// notifies checkMemory() only to that instance.
|
265
|
+
private static class WriterLocalMemoryManager implements MemoryManager
|
266
|
+
{
|
267
|
+
final long rowsBetweenChecks = 10000;
|
268
|
+
|
269
|
+
private int rowsAddedSinceCheck = 0;
|
270
|
+
Callback boundCallback = null;
|
271
|
+
|
272
|
+
@Override
|
273
|
+
public void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException
|
274
|
+
{
|
275
|
+
if (boundCallback != null) {
|
276
|
+
throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.");
|
277
|
+
}
|
278
|
+
|
279
|
+
boundCallback = callback;
|
280
|
+
}
|
281
|
+
|
282
|
+
@Override
|
283
|
+
public void removeWriter(Path path) throws IOException
|
284
|
+
{
|
285
|
+
boundCallback = null;
|
286
|
+
}
|
287
|
+
|
288
|
+
@Override
|
289
|
+
public void addedRow(int rows) throws IOException
|
290
|
+
{
|
291
|
+
rowsAddedSinceCheck += rows;
|
292
|
+
if (rowsAddedSinceCheck > rowsBetweenChecks) {
|
293
|
+
boundCallback.checkMemory(1);
|
294
|
+
rowsAddedSinceCheck = 0;
|
295
|
+
}
|
296
|
+
}
|
297
|
+
}
|
260
298
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-orc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yuokada
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -50,8 +50,7 @@ files:
|
|
50
50
|
- LICENSE.txt
|
51
51
|
- README.md
|
52
52
|
- build.gradle
|
53
|
-
- classpath/
|
54
|
-
- classpath/aircompressor-0.8.jar
|
53
|
+
- classpath/aircompressor-0.10.jar
|
55
54
|
- classpath/animal-sniffer-annotations-1.14.jar
|
56
55
|
- classpath/apacheds-i18n-2.0.0-M15.jar
|
57
56
|
- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
|
@@ -81,7 +80,7 @@ files:
|
|
81
80
|
- classpath/curator-client-2.7.1.jar
|
82
81
|
- classpath/curator-framework-2.7.1.jar
|
83
82
|
- classpath/curator-recipes-2.7.1.jar
|
84
|
-
- classpath/embulk-output-orc-0.3.3.jar
|
83
|
+
- classpath/embulk-output-orc-0.3.4.jar
|
85
84
|
- classpath/embulk-util-aws-credentials-0.2.8.jar
|
86
85
|
- classpath/error_prone_annotations-2.1.3.jar
|
87
86
|
- classpath/gson-2.2.4.jar
|
@@ -91,7 +90,7 @@ files:
|
|
91
90
|
- classpath/hadoop-aws-2.7.5.jar
|
92
91
|
- classpath/hadoop-common-2.7.5.jar
|
93
92
|
- classpath/hadoop-hdfs-2.7.5.jar
|
94
|
-
- classpath/hive-storage-api-2.
|
93
|
+
- classpath/hive-storage-api-2.6.0.jar
|
95
94
|
- classpath/htrace-core-3.1.0-incubating.jar
|
96
95
|
- classpath/httpclient-4.3.6.jar
|
97
96
|
- classpath/httpcore-4.3.3.jar
|
@@ -101,7 +100,7 @@ files:
|
|
101
100
|
- classpath/jackson-mapper-asl-1.9.13.jar
|
102
101
|
- classpath/jackson-xc-1.8.3.jar
|
103
102
|
- classpath/java-xmlbuilder-0.4.jar
|
104
|
-
- classpath/jaxb-api-2.2.
|
103
|
+
- classpath/jaxb-api-2.2.11.jar
|
105
104
|
- classpath/jaxb-impl-2.2.3-1.jar
|
106
105
|
- classpath/jcl-over-slf4j-1.7.12.jar
|
107
106
|
- classpath/jersey-core-1.9.jar
|
@@ -120,13 +119,13 @@ files:
|
|
120
119
|
- classpath/log4j-1.2.17.jar
|
121
120
|
- classpath/netty-3.7.0.Final.jar
|
122
121
|
- classpath/netty-all-4.0.23.Final.jar
|
123
|
-
- classpath/orc-core-1.
|
122
|
+
- classpath/orc-core-1.5.4.jar
|
123
|
+
- classpath/orc-shims-1.5.4.jar
|
124
124
|
- classpath/paranamer-2.3.jar
|
125
125
|
- classpath/protobuf-java-2.5.0.jar
|
126
126
|
- classpath/servlet-api-2.5-20081211.jar
|
127
127
|
- classpath/servlet-api-2.5.jar
|
128
128
|
- classpath/snappy-java-1.0.4.1.jar
|
129
|
-
- classpath/stax-api-1.0-2.jar
|
130
129
|
- classpath/xercesImpl-2.9.1.jar
|
131
130
|
- classpath/xml-apis-1.3.04.jar
|
132
131
|
- classpath/xmlenc-0.52.jar
|