embulk-output-orc 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a4d872e06384fd47cf8c1707398234974e0ecb3
4
- data.tar.gz: bb3855c44c55f16bc5e1707f09e3a1b6c683aee6
3
+ metadata.gz: 041e79d159d0ffe346b0c28f17c7009438bc65e5
4
+ data.tar.gz: 13d96f705101ce32a4d389c1539ea23c50db3872
5
5
  SHA512:
6
- metadata.gz: a16e0c62d57089d5ff431021b97586a8203a423ab3f8797a4e2274f26cb168a91a59ae72d6d42b4f2540209565d05d9e3f1ae66af242e52c90ab1f239afe90cd
7
- data.tar.gz: 21f77bd871793f06b014ce9df5c08ab7e0f0d610b7cc69b749e415a1f181891288ba06211dbe3d769bfd756bfaaf344d1eeecd42db9f085a7feba28560921c96
6
+ metadata.gz: fdd305bfb25dfd998f0c49e55ef4ffb916f268fa5189678102b3f6e7228532da4d3ea141afe03491a2cf11bbe394975f73868a7620ab0d410306ab998d1f0d95
7
+ data.tar.gz: 88574a4b9e2982b80c93307c361ec1de26c0878d8f08acfc0333eb770df74a7e76ef38747b94a57c351d4cd42e6fce015d141ef8d94e5f47bc6ecab1eeca085e
@@ -10,3 +10,5 @@ sudo: false
10
10
  script:
11
11
  - ./gradlew --info checkstyle
12
12
  - ./gradlew --info check
13
+
14
+ after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
data/README.md CHANGED
@@ -19,6 +19,7 @@
19
19
  - **buffer_size**: Set the ORC buffer size (integer, default: `10000`)
20
20
  - **strip_size**: Set the ORC stripe size (integer, default: `100000`)
21
21
  - **compression_kind**: Set the ORC compression codec (string, default: `'ZLIB'`)
22
+ - `NONE`, `ZLIB`, `SNAPPY`
22
23
  - **overwrite**: (LocalFileSystem only) Overwrite if output files already exist. (boolean, default: `false`)
23
24
  - **default_from_timezone**: Time zone of timestamp columns. This can be overridden for each column using column_options (DateTimeZone, default: `UTC`)
24
25
 
@@ -57,3 +58,7 @@ out:
57
58
  ```
58
59
  $ ./gradlew gem # -t to watch change of files and rebuild continuously
59
60
  ```
61
+
62
+ ## SonarQube
63
+
64
+ [embulk-output-orc](https://sonarcloud.io/dashboard?id=embulk-output-orc "embulk-output-orc - Yukihiro Okada")
@@ -3,6 +3,7 @@ plugins {
3
3
  id "com.github.jruby-gradle.base" version "0.1.5"
4
4
  id "java"
5
5
  id "checkstyle"
6
+ id "org.sonarqube" version "2.5"
6
7
  }
7
8
  import com.github.jrubygradle.JRubyExec
8
9
  repositories {
@@ -17,14 +18,14 @@ configurations {
17
18
  runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
18
19
  }
19
20
 
20
- version = "0.2.0"
21
+ version = "0.2.2"
21
22
 
22
23
  sourceCompatibility = 1.8
23
24
  targetCompatibility = 1.8
24
25
 
25
26
  dependencies {
26
- compile "org.embulk:embulk-core:0.8.29"
27
- provided "org.embulk:embulk-core:0.8.29"
27
+ compile "org.embulk:embulk-core:0.8.34"
28
+ provided "org.embulk:embulk-core:0.8.34"
28
29
 
29
30
  compile "org.apache.orc:orc:1.4.0"
30
31
  compile "org.apache.orc:orc-core:1.4.0"
@@ -35,8 +36,8 @@ dependencies {
35
36
  compile "org.apache.hadoop:hadoop-aws:2.7.3"
36
37
 
37
38
  testCompile "junit:junit:4.+"
38
- testCompile "org.embulk:embulk-core:0.8.29:tests"
39
- testCompile "org.embulk:embulk-standards:0.8.29"
39
+ testCompile "org.embulk:embulk-core:0.8.34:tests"
40
+ testCompile "org.embulk:embulk-standards:0.8.34"
40
41
  }
41
42
 
42
43
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -1,41 +1,11 @@
1
- # in:
2
- # type: file
3
- # path_prefix: example/example.csv
4
- # parser:
5
- # type: csv
6
- # charset: UTF-8
7
- # newline: CRLF
8
- # null_string: 'NULL'
9
- # skip_header_lines: 1
10
- # comment_line_marker: '#'
11
- # columns:
12
- # #- {name: time, type: timestamp, format: "%Y-%m-%d"}
13
- # - {name: id, type: long}
14
- # - {name: name, type: string}
15
- # - {name: score, type: double}
16
- # - {name: json, type: json}
17
- #filters:
18
- # - type: column
19
- # columns:
20
- # - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
21
- # - {name: name, default: "foo"}
22
- # - {name: foo, default: 1, type: long}
23
- # - {name: id}
24
- # - {name: copy_score, src: score}
25
- # - {name: json, default: "{\"foo\":\"FOO\"}"}
26
- # - {name: $.json.foo}
27
- # - {name: $.json.copy_foo, src: $.json.foo}
1
+ ---
28
2
  in:
29
- type: file
30
- path_prefix: example/sample.csv
31
- parser:
32
- type: csv
33
- charset: UTF-8
34
- newline: CRLF
35
- null_string: 'NULL'
36
- skip_header_lines: 0
37
- comment_line_marker: '#'
38
- columns:
3
+ type: randomj
4
+ rows: 1024
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
39
9
  - {name: myid, type: long}
40
10
  - {name: named, type: string}
41
11
  - {name: x_flag, type: boolean}
@@ -44,12 +14,9 @@ in:
44
14
  - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
45
15
  - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
46
16
 
47
- exec:
48
- max_threads: 2 # run at most 8 tasks concurrently
49
- min_output_tasks: 1 # disable page scattering
50
-
51
- #out:
52
- # type: stdout
17
+ #exec:
18
+ # max_threads: 6 # run at most 6 tasks concurrently
19
+ # min_output_tasks: 2 # set to 1 to disable page scattering
53
20
 
54
21
  out:
55
22
  type: orc
@@ -1,6 +1,5 @@
1
- #Mon Aug 14 21:51:29 JST 2017
2
1
  distributionBase=GRADLE_USER_HOME
3
2
  distributionPath=wrapper/dists
4
3
  zipStoreBase=GRADLE_USER_HOME
5
4
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-all.zip
5
+ distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
data/gradlew CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env bash
1
+ #!/usr/bin/env sh
2
2
 
3
3
  ##############################################################################
4
4
  ##
@@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS=""
33
33
  # Use the maximum available, or set MAX_FD != -1 to use that value.
34
34
  MAX_FD="maximum"
35
35
 
36
- warn ( ) {
36
+ warn () {
37
37
  echo "$*"
38
38
  }
39
39
 
40
- die ( ) {
40
+ die () {
41
41
  echo
42
42
  echo "$*"
43
43
  echo
@@ -154,16 +154,19 @@ if $cygwin ; then
154
154
  esac
155
155
  fi
156
156
 
157
- # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
- function splitJvmOpts() {
159
- JVM_OPTS=("$@")
157
+ # Escape application args
158
+ save () {
159
+ for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160
+ echo " "
160
161
  }
161
- eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
- JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
162
+ APP_ARGS=$(save "$@")
163
+
164
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
165
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
163
166
 
164
167
  # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
165
- if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then
168
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
166
169
  cd "$(dirname "$0")"
167
170
  fi
168
171
 
169
- exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
172
+ exec "$JAVACMD" "$@"
@@ -0,0 +1,21 @@
1
+ package org.embulk.output.orc;
2
+
3
+ public enum OrcCodec
4
+ {
5
+ ZLIB("zlib"),
6
+ SNAPPY("snappy"),
7
+ LZO("lzo"),
8
+ LZ4("lz4"),
9
+ NONE("none"),;
10
+ String kind;
11
+
12
+ OrcCodec(String kind)
13
+ {
14
+ this.kind = kind;
15
+ }
16
+
17
+ public String getKind()
18
+ {
19
+ return kind;
20
+ }
21
+ }
@@ -10,12 +10,14 @@ import org.embulk.spi.ColumnVisitor;
10
10
  import org.embulk.spi.PageReader;
11
11
  import org.embulk.spi.time.Timestamp;
12
12
 
13
+ import java.nio.charset.StandardCharsets;
14
+
13
15
  public class OrcColumnVisitor
14
16
  implements ColumnVisitor
15
17
  {
16
- private PageReader reader;
17
- private VectorizedRowBatch batch;
18
- private Integer i;
18
+ private final PageReader reader;
19
+ private final VectorizedRowBatch batch;
20
+ private final Integer i;
19
21
 
20
22
  public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
21
23
  {
@@ -31,7 +33,6 @@ public class OrcColumnVisitor
31
33
  ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
32
34
  }
33
35
  else {
34
- // TODO; Fix all true bug
35
36
  if (reader.getBoolean(column)) {
36
37
  ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
37
38
  }
@@ -57,7 +58,7 @@ public class OrcColumnVisitor
57
58
  public void stringColumn(Column column)
58
59
  {
59
60
  ((BytesColumnVector) batch.cols[column.getIndex()]).setVal(i,
60
- reader.getString(column).getBytes());
61
+ reader.getString(column).getBytes(StandardCharsets.UTF_8));
61
62
  }
62
63
 
63
64
  @Override
@@ -68,11 +69,8 @@ public class OrcColumnVisitor
68
69
  }
69
70
  else {
70
71
  Timestamp timestamp = reader.getTimestamp(column);
71
- if (!timestamp.equals("")) {
72
- java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
73
- ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
74
- }
75
- // throw new UnsupportedOperationException("orc output plugin does not support timestamp yet");
72
+ java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
73
+ ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
76
74
  }
77
75
  }
78
76
 
@@ -32,10 +32,9 @@ import org.embulk.spi.util.Timestamps;
32
32
  import org.embulk.util.aws.credentials.AwsCredentials;
33
33
  import org.embulk.util.aws.credentials.AwsCredentialsTask;
34
34
  import org.joda.time.DateTimeZone;
35
- import org.joda.time.format.DateTimeFormat;
36
- import org.joda.time.format.DateTimeFormatter;
37
35
 
38
36
  import java.io.IOException;
37
+ import java.util.ArrayList;
39
38
  import java.util.List;
40
39
  import java.util.Map;
41
40
 
@@ -80,6 +79,10 @@ public class OrcOutputPlugin
80
79
  @Config("default_from_timezone")
81
80
  @ConfigDefault("\"UTC\"")
82
81
  DateTimeZone getDefaultFromTimeZone();
82
+
83
+ @Config("endpoint")
84
+ @ConfigDefault("null")
85
+ Optional<String> getEndpoint();
83
86
  }
84
87
 
85
88
  public interface TimestampColumnOption
@@ -196,7 +199,9 @@ public class OrcOutputPlugin
196
199
  conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
197
200
  conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
198
201
  }
199
-
202
+ if (task.getEndpoint().isPresent()) {
203
+ conf.set("fs.s3a.endpoint", task.getEndpoint().get());
204
+ }
200
205
  return conf;
201
206
  }
202
207
 
@@ -222,7 +227,7 @@ public class OrcOutputPlugin
222
227
  .version(OrcFile.Version.V_0_12));
223
228
  }
224
229
  catch (IOException e) {
225
- e.printStackTrace();
230
+ Throwables.propagate(e);
226
231
  }
227
232
  return writer;
228
233
  }
@@ -259,43 +264,34 @@ public class OrcOutputPlugin
259
264
  class OrcTransactionalPageOutput
260
265
  implements TransactionalPageOutput
261
266
  {
262
- private PageReader reader;
263
- private Writer writer;
264
- private DateTimeFormatter formatter;
267
+ private final PageReader reader;
268
+ private final Writer writer;
269
+ private final ArrayList<VectorizedRowBatch> rowBatches = new ArrayList<>();
265
270
 
266
271
  public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
267
272
  {
268
273
  this.reader = reader;
269
274
  this.writer = writer;
270
-
271
- // formatter
272
- DateTimeZone defaultTimeZone = DateTimeZone
273
- .forTimeZone(task.getDefaultFromTimeZone().toTimeZone());
274
- formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(defaultTimeZone);
275
275
  }
276
276
 
277
277
  @Override
278
278
  public void add(Page page)
279
279
  {
280
280
  int size = page.getStringReferences().size();
281
- TypeDescription schema = getSchema(reader.getSchema());
282
- VectorizedRowBatch batch = schema.createRowBatch();
281
+ final TypeDescription schema = getSchema(reader.getSchema());
282
+ final VectorizedRowBatch batch = schema.createRowBatch();
283
283
  batch.size = size;
284
284
 
285
285
  reader.setPage(page);
286
286
  int i = 0;
287
287
  while (reader.nextRecord()) {
288
- // batch.size = page.getStringReferences().size();
289
288
  reader.getSchema().visitColumns(
290
289
  new OrcColumnVisitor(reader, batch, i)
291
290
  );
292
291
  i++;
293
292
  }
294
- try {
295
- writer.addRowBatch(batch);
296
- }
297
- catch (IOException e) {
298
- e.printStackTrace();
293
+ synchronized (this) {
294
+ rowBatches.add(batch);
299
295
  }
300
296
  }
301
297
 
@@ -303,8 +299,10 @@ public class OrcOutputPlugin
303
299
  public void finish()
304
300
  {
305
301
  try {
302
+ for (VectorizedRowBatch batch : rowBatches) {
303
+ writer.addRowBatch(batch);
304
+ }
306
305
  writer.close();
307
- writer = null;
308
306
  }
309
307
  catch (IOException e) {
310
308
  Throwables.propagate(e);
@@ -314,19 +312,16 @@ public class OrcOutputPlugin
314
312
  @Override
315
313
  public void close()
316
314
  {
317
- // TODO: something
318
315
  }
319
316
 
320
317
  @Override
321
318
  public void abort()
322
319
  {
323
- // TODO: something
324
320
  }
325
321
 
326
322
  @Override
327
323
  public TaskReport commit()
328
324
  {
329
- // TODO: something
330
325
  return Exec.newTaskReport();
331
326
  }
332
327
  }
@@ -1,5 +1,7 @@
1
1
  package org.embulk.output.orc;
2
2
 
3
+ import com.google.common.base.Throwables;
4
+
3
5
  import java.io.IOException;
4
6
  import java.nio.file.Files;
5
7
  import java.nio.file.Path;
@@ -20,7 +22,7 @@ class OrcOutputPluginHelper
20
22
  Files.deleteIfExists(path);
21
23
  }
22
24
  catch (IOException e) {
23
- e.printStackTrace();
25
+ Throwables.propagate(e);
24
26
  }
25
27
  }
26
28
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-31 00:00:00.000000000 Z
11
+ date: 2017-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -58,6 +58,7 @@ files:
58
58
  - gradlew
59
59
  - gradlew.bat
60
60
  - lib/embulk/output/orc.rb
61
+ - src/main/java/org/embulk/output/orc/OrcCodec.java
61
62
  - src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
62
63
  - src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
63
64
  - src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
@@ -92,7 +93,7 @@ files:
92
93
  - classpath/curator-client-2.7.1.jar
93
94
  - classpath/curator-framework-2.7.1.jar
94
95
  - classpath/curator-recipes-2.7.1.jar
95
- - classpath/embulk-output-orc-0.2.0.jar
96
+ - classpath/embulk-output-orc-0.2.2.jar
96
97
  - classpath/embulk-util-aws-credentials-0.2.8.jar
97
98
  - classpath/gson-2.2.4.jar
98
99
  - classpath/hadoop-annotations-2.7.3.jar