embulk-output-orc 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/README.md +5 -0
- data/build.gradle +6 -5
- data/example/example.yml +10 -43
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -2
- data/gradlew +13 -10
- data/src/main/java/org/embulk/output/orc/OrcCodec.java +21 -0
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +8 -10
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +19 -24
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +3 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 041e79d159d0ffe346b0c28f17c7009438bc65e5
|
4
|
+
data.tar.gz: 13d96f705101ce32a4d389c1539ea23c50db3872
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdd305bfb25dfd998f0c49e55ef4ffb916f268fa5189678102b3f6e7228532da4d3ea141afe03491a2cf11bbe394975f73868a7620ab0d410306ab998d1f0d95
|
7
|
+
data.tar.gz: 88574a4b9e2982b80c93307c361ec1de26c0878d8f08acfc0333eb770df74a7e76ef38747b94a57c351d4cd42e6fce015d141ef8d94e5f47bc6ecab1eeca085e
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -19,6 +19,7 @@
|
|
19
19
|
- **buffer_size**: Set the ORC buffer size (integer, default: `10000`)
|
20
20
|
- **strip_size**: Set the ORC stripe size (integer, default: `100000`)
|
21
21
|
- **compression_kind**: Set the ORC compression kind (string, default: `'ZLIB'`)
|
22
|
+
- `NONE`, `ZLIB`, `SNAPPY`
|
22
23
|
- **overwrite**: (LocalFileSystem only) Overwrite if output files already exist. (boolean, default: `false`)
|
23
24
|
- **default_from_timezone**: Time zone of timestamp columns. This can be overridden for each column using column_options (DateTimeZone, default: `UTC`)
|
24
25
|
|
@@ -57,3 +58,7 @@ out:
|
|
57
58
|
```
|
58
59
|
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
59
60
|
```
|
61
|
+
|
62
|
+
## SonarQube
|
63
|
+
|
64
|
+
[embulk-output-orc](https://sonarcloud.io/dashboard?id=embulk-output-orc "embulk-output-orc - Yukihiro Okada")
|
data/build.gradle
CHANGED
@@ -3,6 +3,7 @@ plugins {
|
|
3
3
|
id "com.github.jruby-gradle.base" version "0.1.5"
|
4
4
|
id "java"
|
5
5
|
id "checkstyle"
|
6
|
+
id "org.sonarqube" version "2.5"
|
6
7
|
}
|
7
8
|
import com.github.jrubygradle.JRubyExec
|
8
9
|
repositories {
|
@@ -17,14 +18,14 @@ configurations {
|
|
17
18
|
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
18
19
|
}
|
19
20
|
|
20
|
-
version = "0.2.0"
|
21
|
+
version = "0.2.2"
|
21
22
|
|
22
23
|
sourceCompatibility = 1.8
|
23
24
|
targetCompatibility = 1.8
|
24
25
|
|
25
26
|
dependencies {
|
26
|
-
compile "org.embulk:embulk-core:0.8.
|
27
|
-
provided "org.embulk:embulk-core:0.8.
|
27
|
+
compile "org.embulk:embulk-core:0.8.34"
|
28
|
+
provided "org.embulk:embulk-core:0.8.34"
|
28
29
|
|
29
30
|
compile "org.apache.orc:orc:1.4.0"
|
30
31
|
compile "org.apache.orc:orc-core:1.4.0"
|
@@ -35,8 +36,8 @@ dependencies {
|
|
35
36
|
compile "org.apache.hadoop:hadoop-aws:2.7.3"
|
36
37
|
|
37
38
|
testCompile "junit:junit:4.+"
|
38
|
-
testCompile "org.embulk:embulk-core:0.8.
|
39
|
-
testCompile "org.embulk:embulk-standards:0.8.
|
39
|
+
testCompile "org.embulk:embulk-core:0.8.34:tests"
|
40
|
+
testCompile "org.embulk:embulk-standards:0.8.34"
|
40
41
|
}
|
41
42
|
|
42
43
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
data/example/example.yml
CHANGED
@@ -1,41 +1,11 @@
|
|
1
|
-
|
2
|
-
# type: file
|
3
|
-
# path_prefix: example/example.csv
|
4
|
-
# parser:
|
5
|
-
# type: csv
|
6
|
-
# charset: UTF-8
|
7
|
-
# newline: CRLF
|
8
|
-
# null_string: 'NULL'
|
9
|
-
# skip_header_lines: 1
|
10
|
-
# comment_line_marker: '#'
|
11
|
-
# columns:
|
12
|
-
# #- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
-
# - {name: id, type: long}
|
14
|
-
# - {name: name, type: string}
|
15
|
-
# - {name: score, type: double}
|
16
|
-
# - {name: json, type: json}
|
17
|
-
#filters:
|
18
|
-
# - type: column
|
19
|
-
# columns:
|
20
|
-
# - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
|
21
|
-
# - {name: name, default: "foo"}
|
22
|
-
# - {name: foo, default: 1, type: long}
|
23
|
-
# - {name: id}
|
24
|
-
# - {name: copy_score, src: score}
|
25
|
-
# - {name: json, default: "{\"foo\":\"FOO\"}"}
|
26
|
-
# - {name: $.json.foo}
|
27
|
-
# - {name: $.json.copy_foo, src: $.json.foo}
|
1
|
+
---
|
28
2
|
in:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
null_string: 'NULL'
|
36
|
-
skip_header_lines: 0
|
37
|
-
comment_line_marker: '#'
|
38
|
-
columns:
|
3
|
+
type: randomj
|
4
|
+
rows: 1024
|
5
|
+
threads: 1
|
6
|
+
# default_timezone: Asia/Tokyo
|
7
|
+
primary_key: myid
|
8
|
+
schema:
|
39
9
|
- {name: myid, type: long}
|
40
10
|
- {name: named, type: string}
|
41
11
|
- {name: x_flag, type: boolean}
|
@@ -44,12 +14,9 @@ in:
|
|
44
14
|
- {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
|
45
15
|
- {name: purchase, type: timestamp, format: '%Y/%m/%d'}
|
46
16
|
|
47
|
-
exec:
|
48
|
-
max_threads:
|
49
|
-
min_output_tasks:
|
50
|
-
|
51
|
-
#out:
|
52
|
-
# type: stdout
|
17
|
+
#exec:
|
18
|
+
# max_threads: 6 # run at most 6 tasks concurrently
|
19
|
+
# min_output_tasks: 2 # disable page scattering
|
53
20
|
|
54
21
|
out:
|
55
22
|
type: orc
|
Binary file
|
@@ -1,6 +1,5 @@
|
|
1
|
-
#Mon Aug 14 21:51:29 JST 2017
|
2
1
|
distributionBase=GRADLE_USER_HOME
|
3
2
|
distributionPath=wrapper/dists
|
4
3
|
zipStoreBase=GRADLE_USER_HOME
|
5
4
|
zipStorePath=wrapper/dists
|
6
|
-
distributionUrl=https\://services.gradle.org/distributions/gradle-
|
5
|
+
distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
|
data/gradlew
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env sh
|
2
2
|
|
3
3
|
##############################################################################
|
4
4
|
##
|
@@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS=""
|
|
33
33
|
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
34
34
|
MAX_FD="maximum"
|
35
35
|
|
36
|
-
warn (
|
36
|
+
warn () {
|
37
37
|
echo "$*"
|
38
38
|
}
|
39
39
|
|
40
|
-
die (
|
40
|
+
die () {
|
41
41
|
echo
|
42
42
|
echo "$*"
|
43
43
|
echo
|
@@ -154,16 +154,19 @@ if $cygwin ; then
|
|
154
154
|
esac
|
155
155
|
fi
|
156
156
|
|
157
|
-
#
|
158
|
-
|
159
|
-
|
157
|
+
# Escape application args
|
158
|
+
save () {
|
159
|
+
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
160
|
+
echo " "
|
160
161
|
}
|
161
|
-
|
162
|
-
|
162
|
+
APP_ARGS=$(save "$@")
|
163
|
+
|
164
|
+
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
165
|
+
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
163
166
|
|
164
167
|
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
|
165
|
-
if [
|
168
|
+
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
|
166
169
|
cd "$(dirname "$0")"
|
167
170
|
fi
|
168
171
|
|
169
|
-
exec "$JAVACMD" "
|
172
|
+
exec "$JAVACMD" "$@"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
package org.embulk.output.orc;
|
2
|
+
|
3
|
+
public enum OrcCodec
|
4
|
+
{
|
5
|
+
ZLIB("zlib"),
|
6
|
+
SNAPPY("snappy"),
|
7
|
+
LZO("lzo"),
|
8
|
+
LZ4("lz4"),
|
9
|
+
NONE("none"),;
|
10
|
+
String kind;
|
11
|
+
|
12
|
+
OrcCodec(String kind)
|
13
|
+
{
|
14
|
+
this.kind = kind;
|
15
|
+
}
|
16
|
+
|
17
|
+
public String getKind()
|
18
|
+
{
|
19
|
+
return kind;
|
20
|
+
}
|
21
|
+
}
|
@@ -10,12 +10,14 @@ import org.embulk.spi.ColumnVisitor;
|
|
10
10
|
import org.embulk.spi.PageReader;
|
11
11
|
import org.embulk.spi.time.Timestamp;
|
12
12
|
|
13
|
+
import java.nio.charset.StandardCharsets;
|
14
|
+
|
13
15
|
public class OrcColumnVisitor
|
14
16
|
implements ColumnVisitor
|
15
17
|
{
|
16
|
-
private PageReader reader;
|
17
|
-
private VectorizedRowBatch batch;
|
18
|
-
private Integer i;
|
18
|
+
private final PageReader reader;
|
19
|
+
private final VectorizedRowBatch batch;
|
20
|
+
private final Integer i;
|
19
21
|
|
20
22
|
public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
|
21
23
|
{
|
@@ -31,7 +33,6 @@ public class OrcColumnVisitor
|
|
31
33
|
((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
|
32
34
|
}
|
33
35
|
else {
|
34
|
-
// TODO; Fix all true bug
|
35
36
|
if (reader.getBoolean(column)) {
|
36
37
|
((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
|
37
38
|
}
|
@@ -57,7 +58,7 @@ public class OrcColumnVisitor
|
|
57
58
|
public void stringColumn(Column column)
|
58
59
|
{
|
59
60
|
((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
|
60
|
-
reader.getString(column).getBytes());
|
61
|
+
reader.getString(column).getBytes(StandardCharsets.UTF_8));
|
61
62
|
}
|
62
63
|
|
63
64
|
@Override
|
@@ -68,11 +69,8 @@ public class OrcColumnVisitor
|
|
68
69
|
}
|
69
70
|
else {
|
70
71
|
Timestamp timestamp = reader.getTimestamp(column);
|
71
|
-
|
72
|
-
|
73
|
-
((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
|
74
|
-
}
|
75
|
-
// throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
|
72
|
+
java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
|
73
|
+
((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
|
76
74
|
}
|
77
75
|
}
|
78
76
|
|
@@ -32,10 +32,9 @@ import org.embulk.spi.util.Timestamps;
|
|
32
32
|
import org.embulk.util.aws.credentials.AwsCredentials;
|
33
33
|
import org.embulk.util.aws.credentials.AwsCredentialsTask;
|
34
34
|
import org.joda.time.DateTimeZone;
|
35
|
-
import org.joda.time.format.DateTimeFormat;
|
36
|
-
import org.joda.time.format.DateTimeFormatter;
|
37
35
|
|
38
36
|
import java.io.IOException;
|
37
|
+
import java.util.ArrayList;
|
39
38
|
import java.util.List;
|
40
39
|
import java.util.Map;
|
41
40
|
|
@@ -80,6 +79,10 @@ public class OrcOutputPlugin
|
|
80
79
|
@Config("default_from_timezone")
|
81
80
|
@ConfigDefault("\"UTC\"")
|
82
81
|
DateTimeZone getDefaultFromTimeZone();
|
82
|
+
|
83
|
+
@Config("endpoint")
|
84
|
+
@ConfigDefault("null")
|
85
|
+
Optional<String> getEndpoint();
|
83
86
|
}
|
84
87
|
|
85
88
|
public interface TimestampColumnOption
|
@@ -196,7 +199,9 @@ public class OrcOutputPlugin
|
|
196
199
|
conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
|
197
200
|
conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
|
198
201
|
}
|
199
|
-
|
202
|
+
if (task.getEndpoint().isPresent()) {
|
203
|
+
conf.set("fs.s3a.endpoint", task.getEndpoint().get());
|
204
|
+
}
|
200
205
|
return conf;
|
201
206
|
}
|
202
207
|
|
@@ -222,7 +227,7 @@ public class OrcOutputPlugin
|
|
222
227
|
.version(OrcFile.Version.V_0_12));
|
223
228
|
}
|
224
229
|
catch (IOException e) {
|
225
|
-
|
230
|
+
Throwables.propagate(e);
|
226
231
|
}
|
227
232
|
return writer;
|
228
233
|
}
|
@@ -259,43 +264,34 @@ public class OrcOutputPlugin
|
|
259
264
|
class OrcTransactionalPageOutput
|
260
265
|
implements TransactionalPageOutput
|
261
266
|
{
|
262
|
-
private PageReader reader;
|
263
|
-
private Writer writer;
|
264
|
-
private
|
267
|
+
private final PageReader reader;
|
268
|
+
private final Writer writer;
|
269
|
+
private final ArrayList<VectorizedRowBatch> rowBatches = new ArrayList<>();
|
265
270
|
|
266
271
|
public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
|
267
272
|
{
|
268
273
|
this.reader = reader;
|
269
274
|
this.writer = writer;
|
270
|
-
|
271
|
-
// formatter
|
272
|
-
DateTimeZone defaultTimeZone = DateTimeZone
|
273
|
-
.forTimeZone(task.getDefaultFromTimeZone().toTimeZone());
|
274
|
-
formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(defaultTimeZone);
|
275
275
|
}
|
276
276
|
|
277
277
|
@Override
|
278
278
|
public void add(Page page)
|
279
279
|
{
|
280
280
|
int size = page.getStringReferences().size();
|
281
|
-
TypeDescription schema = getSchema(reader.getSchema());
|
282
|
-
VectorizedRowBatch batch = schema.createRowBatch();
|
281
|
+
final TypeDescription schema = getSchema(reader.getSchema());
|
282
|
+
final VectorizedRowBatch batch = schema.createRowBatch();
|
283
283
|
batch.size = size;
|
284
284
|
|
285
285
|
reader.setPage(page);
|
286
286
|
int i = 0;
|
287
287
|
while (reader.nextRecord()) {
|
288
|
-
// batch.size = page.getStringReferences().size();
|
289
288
|
reader.getSchema().visitColumns(
|
290
289
|
new OrcColumnVisitor(reader, batch, i)
|
291
290
|
);
|
292
291
|
i++;
|
293
292
|
}
|
294
|
-
|
295
|
-
|
296
|
-
}
|
297
|
-
catch (IOException e) {
|
298
|
-
e.printStackTrace();
|
293
|
+
synchronized (this) {
|
294
|
+
rowBatches.add(batch);
|
299
295
|
}
|
300
296
|
}
|
301
297
|
|
@@ -303,8 +299,10 @@ public class OrcOutputPlugin
|
|
303
299
|
public void finish()
|
304
300
|
{
|
305
301
|
try {
|
302
|
+
for (VectorizedRowBatch batch : rowBatches) {
|
303
|
+
writer.addRowBatch(batch);
|
304
|
+
}
|
306
305
|
writer.close();
|
307
|
-
writer = null;
|
308
306
|
}
|
309
307
|
catch (IOException e) {
|
310
308
|
Throwables.propagate(e);
|
@@ -314,19 +312,16 @@ public class OrcOutputPlugin
|
|
314
312
|
@Override
|
315
313
|
public void close()
|
316
314
|
{
|
317
|
-
// TODO: something
|
318
315
|
}
|
319
316
|
|
320
317
|
@Override
|
321
318
|
public void abort()
|
322
319
|
{
|
323
|
-
// TODO: something
|
324
320
|
}
|
325
321
|
|
326
322
|
@Override
|
327
323
|
public TaskReport commit()
|
328
324
|
{
|
329
|
-
// TODO: something
|
330
325
|
return Exec.newTaskReport();
|
331
326
|
}
|
332
327
|
}
|
@@ -1,5 +1,7 @@
|
|
1
1
|
package org.embulk.output.orc;
|
2
2
|
|
3
|
+
import com.google.common.base.Throwables;
|
4
|
+
|
3
5
|
import java.io.IOException;
|
4
6
|
import java.nio.file.Files;
|
5
7
|
import java.nio.file.Path;
|
@@ -20,7 +22,7 @@ class OrcOutputPluginHelper
|
|
20
22
|
Files.deleteIfExists(path);
|
21
23
|
}
|
22
24
|
catch (IOException e) {
|
23
|
-
|
25
|
+
Throwables.propagate(e);
|
24
26
|
}
|
25
27
|
}
|
26
28
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-orc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yuokada
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,6 +58,7 @@ files:
|
|
58
58
|
- gradlew
|
59
59
|
- gradlew.bat
|
60
60
|
- lib/embulk/output/orc.rb
|
61
|
+
- src/main/java/org/embulk/output/orc/OrcCodec.java
|
61
62
|
- src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
|
62
63
|
- src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
|
63
64
|
- src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
|
@@ -92,7 +93,7 @@ files:
|
|
92
93
|
- classpath/curator-client-2.7.1.jar
|
93
94
|
- classpath/curator-framework-2.7.1.jar
|
94
95
|
- classpath/curator-recipes-2.7.1.jar
|
95
|
-
- classpath/embulk-output-orc-0.2.0.jar
|
96
|
+
- classpath/embulk-output-orc-0.2.2.jar
|
96
97
|
- classpath/embulk-util-aws-credentials-0.2.8.jar
|
97
98
|
- classpath/gson-2.2.4.jar
|
98
99
|
- classpath/hadoop-annotations-2.7.3.jar
|