embulk-output-orc 0.3.0 → 0.3.5
- checksums.yaml +4 -4
- data/.github/workflows/gradle.yml +25 -0
- data/README.md +30 -9
- data/build.gradle +34 -15
- data/example/example.yml +4 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +17 -1
- data/gradlew.bat +17 -1
- data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala +42 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala +156 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala +57 -0
- data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala +52 -0
- data/src/main/scala/org/embulk/output/orc/PluginTask.scala +56 -0
- data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala +32 -0
- data/src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java +71 -0
- data/src/test/resources/example-null.yml +25 -0
- data/src/test/resources/example.yml +25 -0
- metadata +45 -42
- data/.travis.yml +0 -14
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +0 -82
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +0 -249
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +0 -28
- data/src/main/java/org/embulk/output/orc/PluginTask.java +0 -60
- data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java +0 -22
- data/src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6ecb39bb650455937f641f073e9e0b13338f268b
+  data.tar.gz: 393ef796dfdf47239a11186b33988466432b0d02
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee733e3cca10bfff236c7ff24d3249a1f8a30629ba314a9b840a2de4d6b552412fa1a6cb9555979dbad499259152b5a24439586105ce0417f629079b26775e9b
+  data.tar.gz: 41f8059f0af1f7eb1accccb18e33c111f7e75b7a69ffba97b26cd74de7349922ebdd852b87ea31f40c257cc57543b88cada9e32aea828f7f8d852adf20d3f328
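Note: the pre-0.3.5 digest values were truncated by the diff viewer and are left blank above. To check a downloaded release against the new digests, here is a minimal sketch in Scala (the file names are assumptions; inside a `.gem` archive the checksummed members are `metadata.gz` and `data.tar.gz`):

```scala
import java.nio.file.{Files, Paths}
import java.security.MessageDigest

// Recompute a SHA-512 digest and compare it with the value in checksums.yaml.
object ChecksumCheck extends App {
  val path = Paths.get("data.tar.gz") // hypothetical: extracted from the .gem archive
  val digest = MessageDigest.getInstance("SHA-512").digest(Files.readAllBytes(path))
  println(digest.map("%02x".format(_)).mkString) // expect the data.tar.gz SHA512 above
}
```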
data/.github/workflows/gradle.yml
ADDED
@@ -0,0 +1,25 @@
+name: Java CI
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Build with Gradle
+      run:
+        ./gradlew build
+    - name: Checkstyle & static check
+      run: |
+        ./gradlew --info checkstyle
+        ./gradlew --info check
+#    - name: Spotbugs
+#      run: |
+#        ./gradlew spotbugsMain spotbugsTest
data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Orc output plugin for Embulk
 
-[![Build Status](https://
+[![Build Status](https://github.com/yuokada/embulk-output-orc/workflows/Java%20CI/badge.svg)](https://github.com/yuokada/embulk-output-orc/actions)
 [![Gem Version](https://badge.fury.io/rb/embulk-output-orc.svg)](https://badge.fury.io/rb/embulk-output-orc)
 
 ## Overview
@@ -13,15 +13,16 @@
 ## Configuration
 
 - **path_prefix**: A prefix of output path. (string, required)
-  - support: `file`, `s3n` and `s3a`.
+  - support: `file`, `s3`, `s3n` and `s3a`.
 - **file_ext**: An extension of output file. (string, default: `.orc`)
 - **sequence_format**: (string, default: `.%03d`)
-- **buffer_size**: Set the ORC buffer size (integer, default: `262144`)
-- **strip_size**: Set the ORC strip size (integer, default: `67108864`)
-- **block_size**: Set the ORC block size (integer, default: `268435456`)
+- **buffer_size**: Set the ORC buffer size (integer, default: `262144` (256KB))
+- **strip_size**: Set the ORC strip size (integer, default: `67108864` (64MB))
+- **block_size**: Set the ORC block size (integer, default: `268435456` (256MB))
 - **compression_kind**: Compression codec for ORC files (string, default: `'ZLIB'`)
-  - `NONE`, `ZLIB`, `SNAPPY`
-- **overwrite**:
+  - `NONE`, `ZLIB`, `SNAPPY`, `LZO`, `LZ4`
+- **overwrite**: Overwrite if output files already exist. (boolean, default: `false`)
+  - Support: `LocalFileSystem`, `S3 (s3, s3a, s3n)`
 - **default_from_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options (DateTimeZone, default: `UTC`)
 
 - **auth_method**: name of mechanism to authenticate requests (basic, env, instance, profile, properties, anonymous, or session. default: basic)
@@ -36,14 +37,34 @@
 out:
   type: orc
   path_prefix: "/tmp/output"
-  buffer_size: 8000
-  strip_size: 90000
   compression_kind: ZLIB
   overwrite: true
 ```
 
 ## ChangeLog
 
+### ver 0.3.4
+
+- Bump `orc` library to `1.5.4`
+- bugfix
+  - https://github.com/yuokada/embulk-output-orc/pull/17
+
+### ver 0.3.3
+
+- bugfix
+- Bump `orc` library to `1.4.4`
+
+### ver 0.3.2
+
+- Update `orc` libraries to `1.4.3`
+
+### ver 0.3.0
+
+- Change default values: (block_size, buffer_size, strip_size)
+
+  - default value is Hive's default value
+    (see: https://orc.apache.org/docs/hive-config.html)
+
 ### ver 0.2.0
 
 - support: output to s3
data/build.gradle
CHANGED
@@ -1,8 +1,10 @@
 plugins {
     id "com.jfrog.bintray" version "1.1"
-    id "com.github.jruby-gradle.base" version "
+    id "com.github.jruby-gradle.base" version "1.5.0"
     id "java"
+    id "scala"
     id "checkstyle"
+    // id "com.github.spotbugs" version "3.0.1"
     id "org.sonarqube" version "2.5"
 }
 import com.github.jrubygradle.JRubyExec
@@ -18,26 +20,41 @@ configurations {
     runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
 }
 
-version = "0.3.
+version = "0.3.5"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.
-    provided "org.embulk:embulk-core:0.
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
+    compile "org.scala-lang:scala-library:2.12.+"
 
-    compile "org.apache.orc:orc:1.4
-    compile "org.apache.orc:orc-core:1.4
-    compile "org.apache.hadoop:hadoop-hdfs:2.
+    compile "org.apache.orc:orc:1.5.4"
+    compile "org.apache.orc:orc-core:1.5.4"
+    compile "org.apache.hadoop:hadoop-hdfs:2.7.5"
 
     compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
     compile "com.amazonaws:aws-java-sdk-s3:1.10.33"
-    compile "org.apache.hadoop:hadoop-aws:2.7.
+    compile "org.apache.hadoop:hadoop-aws:2.7.5"
 
-    testCompile
-    testCompile "
-    testCompile
+    testCompile 'org.jmockit:jmockit:1.38'
+    // testCompile "junit:junit:4.+"
+    testCompile 'org.hamcrest:hamcrest-core:1.3'
+    testCompile 'org.testng:testng:6.14.2'
+    testCompile "org.embulk:embulk-core:0.8.39:tests"
+    testCompile "org.embulk:embulk-standards:0.8.39"
+}
+
+sourceSets {
+    main {
+        scala {
+            srcDirs = ['src/main/scala', 'src/main/java']
+        }
+        java {
+            srcDirs = []
+        }
+    }
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -65,14 +82,16 @@ task checkstyle(type: Checkstyle) {
 }
 
 task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
-    jrubyArgs "-
-    script "
+    jrubyArgs "-S"
+    script "gem"
+    scriptArgs "build", "${project.name}.gemspec"
    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
 }
 
 task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
-    jrubyArgs "-
-    script "
+    jrubyArgs "-S"
+    script "gem"
+    scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
 }
 
 task "package"(dependsOn: ["gemspec", "classpath"]) {
data/example/example.yml
CHANGED
@@ -1,7 +1,7 @@
 ---
 in:
   type: randomj
-  rows:
+  rows: 1024000
   threads: 1
   # default_timezone: Asia/Tokyo
   primary_key: myid
@@ -14,14 +14,12 @@ in:
   - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
   - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
 
-
-
-
+exec:
+  max_threads: 2      # run at most 2 tasks concurrently
+  min_output_tasks: 1 # disable page scattering
 
 out:
   type: orc
   overwrite: true
   path_prefix: "/tmp/output"
-  buffer_size: 8000
-  strip_size: 90000
   compression_kind: ZLIB
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-5.6.4-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
data/gradlew
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
data/gradlew.bat
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
@rem
|
2
|
+
@rem Copyright 2015 the original author or authors.
|
3
|
+
@rem
|
4
|
+
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
@rem you may not use this file except in compliance with the License.
|
6
|
+
@rem You may obtain a copy of the License at
|
7
|
+
@rem
|
8
|
+
@rem https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
@rem
|
10
|
+
@rem Unless required by applicable law or agreed to in writing, software
|
11
|
+
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
@rem See the License for the specific language governing permissions and
|
14
|
+
@rem limitations under the License.
|
15
|
+
@rem
|
16
|
+
|
1
17
|
@if "%DEBUG%" == "" @echo off
|
2
18
|
@rem ##########################################################################
|
3
19
|
@rem
|
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
|
|
14
30
|
set APP_HOME=%DIRNAME%
|
15
31
|
|
16
32
|
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
-
set DEFAULT_JVM_OPTS=
|
33
|
+
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
|
18
34
|
|
19
35
|
@rem Find java.exe
|
20
36
|
if defined JAVA_HOME goto findJavaFromJavaHome
|
data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
ADDED
@@ -0,0 +1,42 @@
+package org.embulk.output.orc
+
+import java.nio.charset.StandardCharsets
+
+import org.apache.hadoop.hive.ql.exec.vector._
+import org.embulk.spi.{Column, ColumnVisitor, PageReader}
+
+class OrcColumnVisitor(val reader: PageReader, val batch: VectorizedRowBatch, val i: Integer) extends ColumnVisitor {
+  override def booleanColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else if (reader.getBoolean(column)) batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 1
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 0
+
+  override def longColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = reader.getLong(column)
+
+  override def doubleColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[DoubleColumnVector].vector(i) = reader.getDouble(column)
+
+  override def stringColumn(column: Column): Unit = if (!reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[BytesColumnVector].setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8))
+  else {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+
+  override def timestampColumn(column: Column): Unit = if (reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].setNullValue(i)
+  else {
+    val timestamp = reader.getTimestamp(column)
+    val ts = new java.sql.Timestamp(timestamp.getEpochSecond * 1000)
+    batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].set(i, ts)
+  }
+
+  override def jsonColumn(column: Column) = throw new UnsupportedOperationException("orc output plugin does not support json type")
+}
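The visitor above fills exactly one row (`i`) of a `VectorizedRowBatch`, one column per callback. How it is driven is not shown in this diff (that lives in `OrcTransactionalPageOutput.scala`), but a minimal sketch of the expected loop, assuming the standard Embulk `PageReader`/`Schema` API and ORC's batch writer, looks like this:

```scala
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch
import org.apache.orc.Writer
import org.embulk.spi.{PageReader, Schema}

// Hypothetical driver loop (same package assumed): one OrcColumnVisitor per
// row, flushing the batch to the ORC writer whenever it fills up.
def writeRecords(reader: PageReader, writer: Writer, schema: Schema, batch: VectorizedRowBatch): Unit = {
  while (reader.nextRecord()) {
    schema.visitColumns(new OrcColumnVisitor(reader, batch, batch.size)) // fill row batch.size
    batch.size += 1
    if (batch.size == batch.getMaxSize) { // batch is full; hand it off
      writer.addRowBatch(batch)
      batch.reset()
    }
  }
}
```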
data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
ADDED
@@ -0,0 +1,156 @@
+package org.embulk.output.orc
+
+import java.io.IOException
+import java.util
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.util.VersionInfo
+import org.apache.orc.{CompressionKind, MemoryManager, OrcFile, TypeDescription, Writer}
+import org.embulk.config.{ConfigSource, TaskReport, TaskSource}
+import org.embulk.spi.util.Timestamps
+import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema}
+import org.embulk.util.aws.credentials.AwsCredentials
+
+object OrcOutputPlugin {
+  private[orc] def getSchema(schema: Schema) = {
+    val oschema = TypeDescription.createStruct
+    for (i <- 0 until schema.size) {
+      val column = schema.getColumn(i)
+      val `type` = column.getType
+      `type`.getName match {
+        case "long" =>
+          oschema.addField(column.getName, TypeDescription.createLong)
+        case "double" =>
+          oschema.addField(column.getName, TypeDescription.createDouble)
+        case "boolean" =>
+          oschema.addField(column.getName, TypeDescription.createBoolean)
+        case "string" =>
+          oschema.addField(column.getName, TypeDescription.createString)
+        case "timestamp" =>
+          oschema.addField(column.getName, TypeDescription.createTimestamp)
+        case _ =>
+          System.out.println("Unsupported type")
+      }
+    }
+    oschema
+  }
+
+  // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
+  // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
+  // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
+  // notifies checkMemory() only to that instance.
+  private class WriterLocalMemoryManager extends MemoryManager {
+    final private[orc] val rowsBetweenChecks = 10000
+    private var rowsAddedSinceCheck = 0
+    private[orc] var boundCallback: MemoryManager.Callback = _
+
+    @throws[IOException]
+    override def addWriter(path: Path, requestedAllocation: Long, callback: MemoryManager.Callback): Unit = {
+      if (boundCallback != null) {
+        throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.")
+      } else {
+        boundCallback = callback
+      }
+    }
+
+    @throws[IOException]
+    override def removeWriter(path: Path): Unit = boundCallback = null
+
+    @throws[IOException]
+    override def addedRow(rows: Int): Unit = {
+      rowsAddedSinceCheck += rows
+      if (rowsAddedSinceCheck > rowsBetweenChecks) {
+        boundCallback.checkMemory(1)
+        rowsAddedSinceCheck = 0
+      }
+    }
+  }
+
+}
+
+class OrcOutputPlugin extends OutputPlugin {
+  override def transaction(config: ConfigSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = {
+    val task = config.loadConfig(classOf[PluginTask])
+    // retryable (idempotent) output:
+    // return resume(task.dump(), schema, taskCount, control);
+    // non-retryable (non-idempotent) output:
+    control.run(task.dump)
+    Exec.newConfigDiff
+  }
+
+  override def resume(taskSource: TaskSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = throw new UnsupportedOperationException("orc output plugin does not support resuming")
+
+  override def cleanup(taskSource: TaskSource, schema: Schema, taskCount: Int, successTaskReports: util.List[TaskReport]): Unit = {
+  }
+
+  override def open(taskSource: TaskSource, schema: Schema, taskIndex: Int) = {
+    val task = taskSource.loadTask(classOf[PluginTask])
+    if (task.getOverwrite) {
+      val credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials
+      OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task)
+    }
+    val reader = new PageReader(schema)
+    val writer = createWriter(task, schema, taskIndex)
+    new OrcTransactionalPageOutput(reader, writer, task)
+  }
+
+  private def buildPath(task: PluginTask, processorIndex: Int): String = {
+    val pathPrefix = task.getPathPrefix
+    val pathSuffix = task.getFileNameExtension
+    val sequenceFormat = task.getSequenceFormat
+    val fmt = java.lang.String.format(sequenceFormat, processorIndex.asInstanceOf[AnyRef])
+    pathPrefix + fmt + pathSuffix
+  }
+
+  private def getHadoopConfiguration(task: PluginTask) = {
+    val conf = new Configuration
+    // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
+    conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+    conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+    // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
+    AwsCredentials.getAWSCredentialsProvider(task)
+    if (task.getAccessKeyId.isPresent) {
+      conf.set("fs.s3a.access.key", task.getAccessKeyId.get)
+      conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId.get)
+    }
+    if (task.getSecretAccessKey.isPresent) {
+      conf.set("fs.s3a.secret.key", task.getSecretAccessKey.get)
+      conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey.get)
+    }
+    if (task.getEndpoint.isPresent) {
+      conf.set("fs.s3a.endpoint", task.getEndpoint.get)
+      conf.set("fs.s3n.endpoint", task.getEndpoint.get)
+    }
+    conf
+  }
+
+  private def createWriter(task: PluginTask, schema: Schema, processorIndex: Int): Writer = {
+    val timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+    val conf = getHadoopConfiguration(task)
+    val oschema = OrcOutputPlugin.getSchema(schema)
+    // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
+    Thread.currentThread.setContextClassLoader(classOf[VersionInfo].getClassLoader)
+
+    var writer: Writer = null
+    try { // Make writerOptions
+      val writerOptions = createWriterOptions(task, conf)
+      // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
+      // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
+      writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)), writerOptions.setSchema(oschema).memory(new OrcOutputPlugin.WriterLocalMemoryManager).version(OrcFile.Version.V_0_12))
+    } catch {
+      case e: IOException => throw e
+    }
+    writer
+  }
+
+  private def createWriterOptions(task: PluginTask, conf: Configuration) = {
+    val bufferSize = task.getBufferSize
+    val stripSize = task.getStripSize
+    val blockSize = task.getBlockSize
+    val kindString = task.getCompressionKind
+    val kind = CompressionKind.valueOf(kindString)
+    OrcFile.writerOptions(conf).bufferSize(bufferSize).blockSize(blockSize.toLong).stripeSize(stripSize.toLong).compress(kind)
+  }
+}