embulk-output-orc 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a4d872e06384fd47cf8c1707398234974e0ecb3
4
- data.tar.gz: bb3855c44c55f16bc5e1707f09e3a1b6c683aee6
3
+ metadata.gz: 041e79d159d0ffe346b0c28f17c7009438bc65e5
4
+ data.tar.gz: 13d96f705101ce32a4d389c1539ea23c50db3872
5
5
  SHA512:
6
- metadata.gz: a16e0c62d57089d5ff431021b97586a8203a423ab3f8797a4e2274f26cb168a91a59ae72d6d42b4f2540209565d05d9e3f1ae66af242e52c90ab1f239afe90cd
7
- data.tar.gz: 21f77bd871793f06b014ce9df5c08ab7e0f0d610b7cc69b749e415a1f181891288ba06211dbe3d769bfd756bfaaf344d1eeecd42db9f085a7feba28560921c96
6
+ metadata.gz: fdd305bfb25dfd998f0c49e55ef4ffb916f268fa5189678102b3f6e7228532da4d3ea141afe03491a2cf11bbe394975f73868a7620ab0d410306ab998d1f0d95
7
+ data.tar.gz: 88574a4b9e2982b80c93307c361ec1de26c0878d8f08acfc0333eb770df74a7e76ef38747b94a57c351d4cd42e6fce015d141ef8d94e5f47bc6ecab1eeca085e
@@ -10,3 +10,5 @@ sudo: false
10
10
  script:
11
11
  - ./gradlew --info checkstyle
12
12
  - ./gradlew --info check
13
+
14
+ after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
data/README.md CHANGED
@@ -19,6 +19,7 @@
19
19
  - **buffer_size**: Set the ORC buffer size (integer, default: `10000`)
20
20
  - **strip_size**: Set the ORC stripe size (integer, default: `100000`)
21
21
  - **compression_kind**: ORC compression codec (string, default: `'ZLIB'`)
22
+ - `NONE`, `ZLIB`, `SNAPPY`
22
23
  - **overwrite**: (LocalFileSystem only) Overwrite if output files already exist. (boolean, default: `false`)
23
24
  - **default_from_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options (DateTimeZone, default: `UTC`)
24
25
 
@@ -57,3 +58,7 @@ out:
57
58
  ```
58
59
  $ ./gradlew gem # -t to watch change of files and rebuild continuously
59
60
  ```
61
+
62
+ ## SonarQube
63
+
64
+ [embulk-output-orc](https://sonarcloud.io/dashboard?id=embulk-output-orc "embulk-output-orc - Yukihiro Okada")
@@ -3,6 +3,7 @@ plugins {
3
3
  id "com.github.jruby-gradle.base" version "0.1.5"
4
4
  id "java"
5
5
  id "checkstyle"
6
+ id "org.sonarqube" version "2.5"
6
7
  }
7
8
  import com.github.jrubygradle.JRubyExec
8
9
  repositories {
@@ -17,14 +18,14 @@ configurations {
17
18
  runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
18
19
  }
19
20
 
20
- version = "0.2.0"
21
+ version = "0.2.2"
21
22
 
22
23
  sourceCompatibility = 1.8
23
24
  targetCompatibility = 1.8
24
25
 
25
26
  dependencies {
26
- compile "org.embulk:embulk-core:0.8.29"
27
- provided "org.embulk:embulk-core:0.8.29"
27
+ compile "org.embulk:embulk-core:0.8.34"
28
+ provided "org.embulk:embulk-core:0.8.34"
28
29
 
29
30
  compile "org.apache.orc:orc:1.4.0"
30
31
  compile "org.apache.orc:orc-core:1.4.0"
@@ -35,8 +36,8 @@ dependencies {
35
36
  compile "org.apache.hadoop:hadoop-aws:2.7.3"
36
37
 
37
38
  testCompile "junit:junit:4.+"
38
- testCompile "org.embulk:embulk-core:0.8.29:tests"
39
- testCompile "org.embulk:embulk-standards:0.8.29"
39
+ testCompile "org.embulk:embulk-core:0.8.34:tests"
40
+ testCompile "org.embulk:embulk-standards:0.8.34"
40
41
  }
41
42
 
42
43
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -1,41 +1,11 @@
1
- # in:
2
- # type: file
3
- # path_prefix: example/example.csv
4
- # parser:
5
- # type: csv
6
- # charset: UTF-8
7
- # newline: CRLF
8
- # null_string: 'NULL'
9
- # skip_header_lines: 1
10
- # comment_line_marker: '#'
11
- # columns:
12
- # #- {name: time, type: timestamp, format: "%Y-%m-%d"}
13
- # - {name: id, type: long}
14
- # - {name: name, type: string}
15
- # - {name: score, type: double}
16
- # - {name: json, type: json}
17
- #filters:
18
- # - type: column
19
- # columns:
20
- # - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
21
- # - {name: name, default: "foo"}
22
- # - {name: foo, default: 1, type: long}
23
- # - {name: id}
24
- # - {name: copy_score, src: score}
25
- # - {name: json, default: "{\"foo\":\"FOO\"}"}
26
- # - {name: $.json.foo}
27
- # - {name: $.json.copy_foo, src: $.json.foo}
1
+ ---
28
2
  in:
29
- type: file
30
- path_prefix: example/sample.csv
31
- parser:
32
- type: csv
33
- charset: UTF-8
34
- newline: CRLF
35
- null_string: 'NULL'
36
- skip_header_lines: 0
37
- comment_line_marker: '#'
38
- columns:
3
+ type: randomj
4
+ rows: 1024
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
39
9
  - {name: myid, type: long}
40
10
  - {name: named, type: string}
41
11
  - {name: x_flag, type: boolean}
@@ -44,12 +14,9 @@ in:
44
14
  - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
45
15
  - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
46
16
 
47
- exec:
48
- max_threads: 2 # run at most 8 tasks concurrently
49
- min_output_tasks: 1 # disable page scattering
50
-
51
- #out:
52
- # type: stdout
17
+ #exec:
18
+ # max_threads: 6 # run at most 6 tasks concurrently
19
+ # min_output_tasks: 2 # disable page scattering
53
20
 
54
21
  out:
55
22
  type: orc
@@ -1,6 +1,5 @@
1
- #Mon Aug 14 21:51:29 JST 2017
2
1
  distributionBase=GRADLE_USER_HOME
3
2
  distributionPath=wrapper/dists
4
3
  zipStoreBase=GRADLE_USER_HOME
5
4
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-all.zip
5
+ distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
data/gradlew CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env bash
1
+ #!/usr/bin/env sh
2
2
 
3
3
  ##############################################################################
4
4
  ##
@@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS=""
33
33
  # Use the maximum available, or set MAX_FD != -1 to use that value.
34
34
  MAX_FD="maximum"
35
35
 
36
- warn ( ) {
36
+ warn () {
37
37
  echo "$*"
38
38
  }
39
39
 
40
- die ( ) {
40
+ die () {
41
41
  echo
42
42
  echo "$*"
43
43
  echo
@@ -154,16 +154,19 @@ if $cygwin ; then
154
154
  esac
155
155
  fi
156
156
 
157
- # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
- function splitJvmOpts() {
159
- JVM_OPTS=("$@")
157
+ # Escape application args
158
+ save () {
159
+ for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160
+ echo " "
160
161
  }
161
- eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
- JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
162
+ APP_ARGS=$(save "$@")
163
+
164
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
165
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
163
166
 
164
167
  # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
165
- if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then
168
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
166
169
  cd "$(dirname "$0")"
167
170
  fi
168
171
 
169
- exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
172
+ exec "$JAVACMD" "$@"
@@ -0,0 +1,21 @@
1
+ package org.embulk.output.orc;
2
+
3
+ public enum OrcCodec
4
+ {
5
+ ZLIB("zlib"),
6
+ SNAPPY("snappy"),
7
+ LZO("lzo"),
8
+ LZ4("lz4"),
9
+ NONE("none"),;
10
+ String kind;
11
+
12
+ OrcCodec(String kind)
13
+ {
14
+ this.kind = kind;
15
+ }
16
+
17
+ public String getKind()
18
+ {
19
+ return kind;
20
+ }
21
+ }
@@ -10,12 +10,14 @@ import org.embulk.spi.ColumnVisitor;
10
10
  import org.embulk.spi.PageReader;
11
11
  import org.embulk.spi.time.Timestamp;
12
12
 
13
+ import java.nio.charset.StandardCharsets;
14
+
13
15
  public class OrcColumnVisitor
14
16
  implements ColumnVisitor
15
17
  {
16
- private PageReader reader;
17
- private VectorizedRowBatch batch;
18
- private Integer i;
18
+ private final PageReader reader;
19
+ private final VectorizedRowBatch batch;
20
+ private final Integer i;
19
21
 
20
22
  public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
21
23
  {
@@ -31,7 +33,6 @@ public class OrcColumnVisitor
31
33
  ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
32
34
  }
33
35
  else {
34
- // TODO; Fix all true bug
35
36
  if (reader.getBoolean(column)) {
36
37
  ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
37
38
  }
@@ -57,7 +58,7 @@ public class OrcColumnVisitor
57
58
  public void stringColumn(Column column)
58
59
  {
59
60
  ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
60
- reader.getString(column).getBytes());
61
+ reader.getString(column).getBytes(StandardCharsets.UTF_8));
61
62
  }
62
63
 
63
64
  @Override
@@ -68,11 +69,8 @@ public class OrcColumnVisitor
68
69
  }
69
70
  else {
70
71
  Timestamp timestamp = reader.getTimestamp(column);
71
- if (!timestamp.equals("")) {
72
- java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
73
- ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
74
- }
75
- // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
72
+ java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
73
+ ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
76
74
  }
77
75
  }
78
76
 
@@ -32,10 +32,9 @@ import org.embulk.spi.util.Timestamps;
32
32
  import org.embulk.util.aws.credentials.AwsCredentials;
33
33
  import org.embulk.util.aws.credentials.AwsCredentialsTask;
34
34
  import org.joda.time.DateTimeZone;
35
- import org.joda.time.format.DateTimeFormat;
36
- import org.joda.time.format.DateTimeFormatter;
37
35
 
38
36
  import java.io.IOException;
37
+ import java.util.ArrayList;
39
38
  import java.util.List;
40
39
  import java.util.Map;
41
40
 
@@ -80,6 +79,10 @@ public class OrcOutputPlugin
80
79
  @Config("default_from_timezone")
81
80
  @ConfigDefault("\"UTC\"")
82
81
  DateTimeZone getDefaultFromTimeZone();
82
+
83
+ @Config("endpoint")
84
+ @ConfigDefault("null")
85
+ Optional<String> getEndpoint();
83
86
  }
84
87
 
85
88
  public interface TimestampColumnOption
@@ -196,7 +199,9 @@ public class OrcOutputPlugin
196
199
  conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
197
200
  conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
198
201
  }
199
-
202
+ if (task.getEndpoint().isPresent()) {
203
+ conf.set("fs.s3a.endpoint", task.getEndpoint().get());
204
+ }
200
205
  return conf;
201
206
  }
202
207
 
@@ -222,7 +227,7 @@ public class OrcOutputPlugin
222
227
  .version(OrcFile.Version.V_0_12));
223
228
  }
224
229
  catch (IOException e) {
225
- e.printStackTrace();
230
+ Throwables.propagate(e);
226
231
  }
227
232
  return writer;
228
233
  }
@@ -259,43 +264,34 @@ public class OrcOutputPlugin
259
264
  class OrcTransactionalPageOutput
260
265
  implements TransactionalPageOutput
261
266
  {
262
- private PageReader reader;
263
- private Writer writer;
264
- private DateTimeFormatter formatter;
267
+ private final PageReader reader;
268
+ private final Writer writer;
269
+ private final ArrayList<VectorizedRowBatch> rowBatches = new ArrayList<>();
265
270
 
266
271
  public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
267
272
  {
268
273
  this.reader = reader;
269
274
  this.writer = writer;
270
-
271
- // formatter
272
- DateTimeZone defaultTimeZone = DateTimeZone
273
- .forTimeZone(task.getDefaultFromTimeZone().toTimeZone());
274
- formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(defaultTimeZone);
275
275
  }
276
276
 
277
277
  @Override
278
278
  public void add(Page page)
279
279
  {
280
280
  int size = page.getStringReferences().size();
281
- TypeDescription schema = getSchema(reader.getSchema());
282
- VectorizedRowBatch batch = schema.createRowBatch();
281
+ final TypeDescription schema = getSchema(reader.getSchema());
282
+ final VectorizedRowBatch batch = schema.createRowBatch();
283
283
  batch.size = size;
284
284
 
285
285
  reader.setPage(page);
286
286
  int i = 0;
287
287
  while (reader.nextRecord()) {
288
- // batch.size = page.getStringReferences().size();
289
288
  reader.getSchema().visitColumns(
290
289
  new OrcColumnVisitor(reader, batch, i)
291
290
  );
292
291
  i++;
293
292
  }
294
- try {
295
- writer.addRowBatch(batch);
296
- }
297
- catch (IOException e) {
298
- e.printStackTrace();
293
+ synchronized (this) {
294
+ rowBatches.add(batch);
299
295
  }
300
296
  }
301
297
 
@@ -303,8 +299,10 @@ public class OrcOutputPlugin
303
299
  public void finish()
304
300
  {
305
301
  try {
302
+ for (VectorizedRowBatch batch : rowBatches) {
303
+ writer.addRowBatch(batch);
304
+ }
306
305
  writer.close();
307
- writer = null;
308
306
  }
309
307
  catch (IOException e) {
310
308
  Throwables.propagate(e);
@@ -314,19 +312,16 @@ public class OrcOutputPlugin
314
312
  @Override
315
313
  public void close()
316
314
  {
317
- // TODO: something
318
315
  }
319
316
 
320
317
  @Override
321
318
  public void abort()
322
319
  {
323
- // TODO: something
324
320
  }
325
321
 
326
322
  @Override
327
323
  public TaskReport commit()
328
324
  {
329
- // TODO: something
330
325
  return Exec.newTaskReport();
331
326
  }
332
327
  }
@@ -1,5 +1,7 @@
1
1
  package org.embulk.output.orc;
2
2
 
3
+ import com.google.common.base.Throwables;
4
+
3
5
  import java.io.IOException;
4
6
  import java.nio.file.Files;
5
7
  import java.nio.file.Path;
@@ -20,7 +22,7 @@ class OrcOutputPluginHelper
20
22
  Files.deleteIfExists(path);
21
23
  }
22
24
  catch (IOException e) {
23
- e.printStackTrace();
25
+ Throwables.propagate(e);
24
26
  }
25
27
  }
26
28
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-31 00:00:00.000000000 Z
11
+ date: 2017-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -58,6 +58,7 @@ files:
58
58
  - gradlew
59
59
  - gradlew.bat
60
60
  - lib/embulk/output/orc.rb
61
+ - src/main/java/org/embulk/output/orc/OrcCodec.java
61
62
  - src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
62
63
  - src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
63
64
  - src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
@@ -92,7 +93,7 @@ files:
92
93
  - classpath/curator-client-2.7.1.jar
93
94
  - classpath/curator-framework-2.7.1.jar
94
95
  - classpath/curator-recipes-2.7.1.jar
95
- - classpath/embulk-output-orc-0.2.0.jar
96
+ - classpath/embulk-output-orc-0.2.2.jar
96
97
  - classpath/embulk-util-aws-credentials-0.2.8.jar
97
98
  - classpath/gson-2.2.4.jar
98
99
  - classpath/hadoop-annotations-2.7.3.jar