embulk-output-orc 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/README.md +5 -0
- data/build.gradle +6 -5
- data/example/example.yml +10 -43
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -2
- data/gradlew +13 -10
- data/src/main/java/org/embulk/output/orc/OrcCodec.java +21 -0
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +8 -10
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +19 -24
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +3 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 041e79d159d0ffe346b0c28f17c7009438bc65e5
|
4
|
+
data.tar.gz: 13d96f705101ce32a4d389c1539ea23c50db3872
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdd305bfb25dfd998f0c49e55ef4ffb916f268fa5189678102b3f6e7228532da4d3ea141afe03491a2cf11bbe394975f73868a7620ab0d410306ab998d1f0d95
|
7
|
+
data.tar.gz: 88574a4b9e2982b80c93307c361ec1de26c0878d8f08acfc0333eb770df74a7e76ef38747b94a57c351d4cd42e6fce015d141ef8d94e5f47bc6ecab1eeca085e
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -19,6 +19,7 @@
|
|
19
19
|
- **buffer_size**: Set the ORC buffer size (integer, default: `10000`)
|
20
20
|
- **strip_size**: Set the ORC stripe size (integer, default: `100000`)
|
21
21
|
- **compression_kind**: Compression codec used for the ORC file (string, default: `'ZLIB'`)
|
22
|
+
- `NONE`, `ZLIB`, `SNAPPY`
|
22
23
|
- **overwrite**: (LocalFileSystem only) Overwrite if output files already exist. (boolean, default: `false`)
|
23
24
|
- **default_from_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options (DateTimeZone, default: `UTC`)
|
24
25
|
|
@@ -57,3 +58,7 @@ out:
|
|
57
58
|
```
|
58
59
|
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
59
60
|
```
|
61
|
+
|
62
|
+
## SonarQube
|
63
|
+
|
64
|
+
[embulk-output-orc](https://sonarcloud.io/dashboard?id=embulk-output-orc "embulk-output-orc - Yukihiro Okada")
|
data/build.gradle
CHANGED
@@ -3,6 +3,7 @@ plugins {
|
|
3
3
|
id "com.github.jruby-gradle.base" version "0.1.5"
|
4
4
|
id "java"
|
5
5
|
id "checkstyle"
|
6
|
+
id "org.sonarqube" version "2.5"
|
6
7
|
}
|
7
8
|
import com.github.jrubygradle.JRubyExec
|
8
9
|
repositories {
|
@@ -17,14 +18,14 @@ configurations {
|
|
17
18
|
runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
|
18
19
|
}
|
19
20
|
|
20
|
-
version = "0.2.0"
|
21
|
+
version = "0.2.2"
|
21
22
|
|
22
23
|
sourceCompatibility = 1.8
|
23
24
|
targetCompatibility = 1.8
|
24
25
|
|
25
26
|
dependencies {
|
26
|
-
compile "org.embulk:embulk-core:0.8.
|
27
|
-
provided "org.embulk:embulk-core:0.8.
|
27
|
+
compile "org.embulk:embulk-core:0.8.34"
|
28
|
+
provided "org.embulk:embulk-core:0.8.34"
|
28
29
|
|
29
30
|
compile "org.apache.orc:orc:1.4.0"
|
30
31
|
compile "org.apache.orc:orc-core:1.4.0"
|
@@ -35,8 +36,8 @@ dependencies {
|
|
35
36
|
compile "org.apache.hadoop:hadoop-aws:2.7.3"
|
36
37
|
|
37
38
|
testCompile "junit:junit:4.+"
|
38
|
-
testCompile "org.embulk:embulk-core:0.8.
|
39
|
-
testCompile "org.embulk:embulk-standards:0.8.
|
39
|
+
testCompile "org.embulk:embulk-core:0.8.34:tests"
|
40
|
+
testCompile "org.embulk:embulk-standards:0.8.34"
|
40
41
|
}
|
41
42
|
|
42
43
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
data/example/example.yml
CHANGED
@@ -1,41 +1,11 @@
|
|
1
|
-
|
2
|
-
# type: file
|
3
|
-
# path_prefix: example/example.csv
|
4
|
-
# parser:
|
5
|
-
# type: csv
|
6
|
-
# charset: UTF-8
|
7
|
-
# newline: CRLF
|
8
|
-
# null_string: 'NULL'
|
9
|
-
# skip_header_lines: 1
|
10
|
-
# comment_line_marker: '#'
|
11
|
-
# columns:
|
12
|
-
# #- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
-
# - {name: id, type: long}
|
14
|
-
# - {name: name, type: string}
|
15
|
-
# - {name: score, type: double}
|
16
|
-
# - {name: json, type: json}
|
17
|
-
#filters:
|
18
|
-
# - type: column
|
19
|
-
# columns:
|
20
|
-
# - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
|
21
|
-
# - {name: name, default: "foo"}
|
22
|
-
# - {name: foo, default: 1, type: long}
|
23
|
-
# - {name: id}
|
24
|
-
# - {name: copy_score, src: score}
|
25
|
-
# - {name: json, default: "{\"foo\":\"FOO\"}"}
|
26
|
-
# - {name: $.json.foo}
|
27
|
-
# - {name: $.json.copy_foo, src: $.json.foo}
|
1
|
+
---
|
28
2
|
in:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
null_string: 'NULL'
|
36
|
-
skip_header_lines: 0
|
37
|
-
comment_line_marker: '#'
|
38
|
-
columns:
|
3
|
+
type: randomj
|
4
|
+
rows: 1024
|
5
|
+
threads: 1
|
6
|
+
# default_timezone: Asia/Tokyo
|
7
|
+
primary_key: myid
|
8
|
+
schema:
|
39
9
|
- {name: myid, type: long}
|
40
10
|
- {name: named, type: string}
|
41
11
|
- {name: x_flag, type: boolean}
|
@@ -44,12 +14,9 @@ in:
|
|
44
14
|
- {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
|
45
15
|
- {name: purchase, type: timestamp, format: '%Y/%m/%d'}
|
46
16
|
|
47
|
-
exec:
|
48
|
-
max_threads:
|
49
|
-
min_output_tasks:
|
50
|
-
|
51
|
-
#out:
|
52
|
-
# type: stdout
|
17
|
+
#exec:
|
18
|
+
# max_threads: 6 # run at most 6 tasks concurrently
|
19
|
+
# min_output_tasks: 2 # disable page scattering
|
53
20
|
|
54
21
|
out:
|
55
22
|
type: orc
|
Binary file
|
@@ -1,6 +1,5 @@
|
|
1
|
-
#Mon Aug 14 21:51:29 JST 2017
|
2
1
|
distributionBase=GRADLE_USER_HOME
|
3
2
|
distributionPath=wrapper/dists
|
4
3
|
zipStoreBase=GRADLE_USER_HOME
|
5
4
|
zipStorePath=wrapper/dists
|
6
|
-
distributionUrl=https\://services.gradle.org/distributions/gradle-
|
5
|
+
distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
|
data/gradlew
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env sh
|
2
2
|
|
3
3
|
##############################################################################
|
4
4
|
##
|
@@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS=""
|
|
33
33
|
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
34
34
|
MAX_FD="maximum"
|
35
35
|
|
36
|
-
warn (
|
36
|
+
warn () {
|
37
37
|
echo "$*"
|
38
38
|
}
|
39
39
|
|
40
|
-
die (
|
40
|
+
die () {
|
41
41
|
echo
|
42
42
|
echo "$*"
|
43
43
|
echo
|
@@ -154,16 +154,19 @@ if $cygwin ; then
|
|
154
154
|
esac
|
155
155
|
fi
|
156
156
|
|
157
|
-
#
|
158
|
-
|
159
|
-
|
157
|
+
# Escape application args
|
158
|
+
save () {
|
159
|
+
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
160
|
+
echo " "
|
160
161
|
}
|
161
|
-
|
162
|
-
|
162
|
+
APP_ARGS=$(save "$@")
|
163
|
+
|
164
|
+
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
165
|
+
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
163
166
|
|
164
167
|
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
|
165
|
-
if [
|
168
|
+
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
|
166
169
|
cd "$(dirname "$0")"
|
167
170
|
fi
|
168
171
|
|
169
|
-
exec "$JAVACMD" "
|
172
|
+
exec "$JAVACMD" "$@"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
package org.embulk.output.orc;
|
2
|
+
|
3
|
+
public enum OrcCodec
|
4
|
+
{
|
5
|
+
ZLIB("zlib"),
|
6
|
+
SNAPPY("snappy"),
|
7
|
+
LZO("lzo"),
|
8
|
+
LZ4("lz4"),
|
9
|
+
NONE("none"),;
|
10
|
+
String kind;
|
11
|
+
|
12
|
+
OrcCodec(String kind)
|
13
|
+
{
|
14
|
+
this.kind = kind;
|
15
|
+
}
|
16
|
+
|
17
|
+
public String getKind()
|
18
|
+
{
|
19
|
+
return kind;
|
20
|
+
}
|
21
|
+
}
|
@@ -10,12 +10,14 @@ import org.embulk.spi.ColumnVisitor;
|
|
10
10
|
import org.embulk.spi.PageReader;
|
11
11
|
import org.embulk.spi.time.Timestamp;
|
12
12
|
|
13
|
+
import java.nio.charset.StandardCharsets;
|
14
|
+
|
13
15
|
public class OrcColumnVisitor
|
14
16
|
implements ColumnVisitor
|
15
17
|
{
|
16
|
-
private PageReader reader;
|
17
|
-
private VectorizedRowBatch batch;
|
18
|
-
private Integer i;
|
18
|
+
private final PageReader reader;
|
19
|
+
private final VectorizedRowBatch batch;
|
20
|
+
private final Integer i;
|
19
21
|
|
20
22
|
public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
|
21
23
|
{
|
@@ -31,7 +33,6 @@ public class OrcColumnVisitor
|
|
31
33
|
((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
|
32
34
|
}
|
33
35
|
else {
|
34
|
-
// TODO; Fix all true bug
|
35
36
|
if (reader.getBoolean(column)) {
|
36
37
|
((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
|
37
38
|
}
|
@@ -57,7 +58,7 @@ public class OrcColumnVisitor
|
|
57
58
|
public void stringColumn(Column column)
|
58
59
|
{
|
59
60
|
((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
|
60
|
-
reader.getString(column).getBytes());
|
61
|
+
reader.getString(column).getBytes(StandardCharsets.UTF_8));
|
61
62
|
}
|
62
63
|
|
63
64
|
@Override
|
@@ -68,11 +69,8 @@ public class OrcColumnVisitor
|
|
68
69
|
}
|
69
70
|
else {
|
70
71
|
Timestamp timestamp = reader.getTimestamp(column);
|
71
|
-
|
72
|
-
|
73
|
-
((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
|
74
|
-
}
|
75
|
-
// throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
|
72
|
+
java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
|
73
|
+
((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
|
76
74
|
}
|
77
75
|
}
|
78
76
|
|
@@ -32,10 +32,9 @@ import org.embulk.spi.util.Timestamps;
|
|
32
32
|
import org.embulk.util.aws.credentials.AwsCredentials;
|
33
33
|
import org.embulk.util.aws.credentials.AwsCredentialsTask;
|
34
34
|
import org.joda.time.DateTimeZone;
|
35
|
-
import org.joda.time.format.DateTimeFormat;
|
36
|
-
import org.joda.time.format.DateTimeFormatter;
|
37
35
|
|
38
36
|
import java.io.IOException;
|
37
|
+
import java.util.ArrayList;
|
39
38
|
import java.util.List;
|
40
39
|
import java.util.Map;
|
41
40
|
|
@@ -80,6 +79,10 @@ public class OrcOutputPlugin
|
|
80
79
|
@Config("default_from_timezone")
|
81
80
|
@ConfigDefault("\"UTC\"")
|
82
81
|
DateTimeZone getDefaultFromTimeZone();
|
82
|
+
|
83
|
+
@Config("endpoint")
|
84
|
+
@ConfigDefault("null")
|
85
|
+
Optional<String> getEndpoint();
|
83
86
|
}
|
84
87
|
|
85
88
|
public interface TimestampColumnOption
|
@@ -196,7 +199,9 @@ public class OrcOutputPlugin
|
|
196
199
|
conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
|
197
200
|
conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
|
198
201
|
}
|
199
|
-
|
202
|
+
if (task.getEndpoint().isPresent()) {
|
203
|
+
conf.set("fs.s3a.endpoint", task.getEndpoint().get());
|
204
|
+
}
|
200
205
|
return conf;
|
201
206
|
}
|
202
207
|
|
@@ -222,7 +227,7 @@ public class OrcOutputPlugin
|
|
222
227
|
.version(OrcFile.Version.V_0_12));
|
223
228
|
}
|
224
229
|
catch (IOException e) {
|
225
|
-
|
230
|
+
Throwables.propagate(e);
|
226
231
|
}
|
227
232
|
return writer;
|
228
233
|
}
|
@@ -259,43 +264,34 @@ public class OrcOutputPlugin
|
|
259
264
|
class OrcTransactionalPageOutput
|
260
265
|
implements TransactionalPageOutput
|
261
266
|
{
|
262
|
-
private PageReader reader;
|
263
|
-
private Writer writer;
|
264
|
-
private
|
267
|
+
private final PageReader reader;
|
268
|
+
private final Writer writer;
|
269
|
+
private final ArrayList<VectorizedRowBatch> rowBatches = new ArrayList<>();
|
265
270
|
|
266
271
|
public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
|
267
272
|
{
|
268
273
|
this.reader = reader;
|
269
274
|
this.writer = writer;
|
270
|
-
|
271
|
-
// formatter
|
272
|
-
DateTimeZone defaultTimeZone = DateTimeZone
|
273
|
-
.forTimeZone(task.getDefaultFromTimeZone().toTimeZone());
|
274
|
-
formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(defaultTimeZone);
|
275
275
|
}
|
276
276
|
|
277
277
|
@Override
|
278
278
|
public void add(Page page)
|
279
279
|
{
|
280
280
|
int size = page.getStringReferences().size();
|
281
|
-
TypeDescription schema = getSchema(reader.getSchema());
|
282
|
-
VectorizedRowBatch batch = schema.createRowBatch();
|
281
|
+
final TypeDescription schema = getSchema(reader.getSchema());
|
282
|
+
final VectorizedRowBatch batch = schema.createRowBatch();
|
283
283
|
batch.size = size;
|
284
284
|
|
285
285
|
reader.setPage(page);
|
286
286
|
int i = 0;
|
287
287
|
while (reader.nextRecord()) {
|
288
|
-
// batch.size = page.getStringReferences().size();
|
289
288
|
reader.getSchema().visitColumns(
|
290
289
|
new OrcColumnVisitor(reader, batch, i)
|
291
290
|
);
|
292
291
|
i++;
|
293
292
|
}
|
294
|
-
|
295
|
-
|
296
|
-
}
|
297
|
-
catch (IOException e) {
|
298
|
-
e.printStackTrace();
|
293
|
+
synchronized (this) {
|
294
|
+
rowBatches.add(batch);
|
299
295
|
}
|
300
296
|
}
|
301
297
|
|
@@ -303,8 +299,10 @@ public class OrcOutputPlugin
|
|
303
299
|
public void finish()
|
304
300
|
{
|
305
301
|
try {
|
302
|
+
for (VectorizedRowBatch batch : rowBatches) {
|
303
|
+
writer.addRowBatch(batch);
|
304
|
+
}
|
306
305
|
writer.close();
|
307
|
-
writer = null;
|
308
306
|
}
|
309
307
|
catch (IOException e) {
|
310
308
|
Throwables.propagate(e);
|
@@ -314,19 +312,16 @@ public class OrcOutputPlugin
|
|
314
312
|
@Override
|
315
313
|
public void close()
|
316
314
|
{
|
317
|
-
// TODO: something
|
318
315
|
}
|
319
316
|
|
320
317
|
@Override
|
321
318
|
public void abort()
|
322
319
|
{
|
323
|
-
// TODO: something
|
324
320
|
}
|
325
321
|
|
326
322
|
@Override
|
327
323
|
public TaskReport commit()
|
328
324
|
{
|
329
|
-
// TODO: something
|
330
325
|
return Exec.newTaskReport();
|
331
326
|
}
|
332
327
|
}
|
@@ -1,5 +1,7 @@
|
|
1
1
|
package org.embulk.output.orc;
|
2
2
|
|
3
|
+
import com.google.common.base.Throwables;
|
4
|
+
|
3
5
|
import java.io.IOException;
|
4
6
|
import java.nio.file.Files;
|
5
7
|
import java.nio.file.Path;
|
@@ -20,7 +22,7 @@ class OrcOutputPluginHelper
|
|
20
22
|
Files.deleteIfExists(path);
|
21
23
|
}
|
22
24
|
catch (IOException e) {
|
23
|
-
|
25
|
+
Throwables.propagate(e);
|
24
26
|
}
|
25
27
|
}
|
26
28
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-orc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yuokada
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,6 +58,7 @@ files:
|
|
58
58
|
- gradlew
|
59
59
|
- gradlew.bat
|
60
60
|
- lib/embulk/output/orc.rb
|
61
|
+
- src/main/java/org/embulk/output/orc/OrcCodec.java
|
61
62
|
- src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
|
62
63
|
- src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
|
63
64
|
- src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
|
@@ -92,7 +93,7 @@ files:
|
|
92
93
|
- classpath/curator-client-2.7.1.jar
|
93
94
|
- classpath/curator-framework-2.7.1.jar
|
94
95
|
- classpath/curator-recipes-2.7.1.jar
|
95
|
-
- classpath/embulk-output-orc-0.2.0.jar
|
96
|
+
- classpath/embulk-output-orc-0.2.2.jar
|
96
97
|
- classpath/embulk-util-aws-credentials-0.2.8.jar
|
97
98
|
- classpath/gson-2.2.4.jar
|
98
99
|
- classpath/hadoop-annotations-2.7.3.jar
|