embulk-output-orc 0.3.4 → 0.3.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 725b869f19175110dd6e542619f5bfd9c8c0168b
- data.tar.gz: 9c8016b015f380815cb2ad7127c192243ebfa45a
+ metadata.gz: 6ecb39bb650455937f641f073e9e0b13338f268b
+ data.tar.gz: 393ef796dfdf47239a11186b33988466432b0d02
  SHA512:
- metadata.gz: 197da7bd52b19ccc2c3f30ad0d2c2c36446fa00e48426de9bb333b0ef2a8eddc213f0db947e50215297333a933e5d6d3b63ad0c0388f78e19d984fae6810bb99
- data.tar.gz: ccf9bc5b8f73c8d0f48e7d17e16dd4b113b3f9b7a1b9178045109ef3476f4cb47f58e0699e4a9516ab81830d59a4bae3be2fba8f936c4cef0f705f6acc384f2d
+ metadata.gz: ee733e3cca10bfff236c7ff24d3249a1f8a30629ba314a9b840a2de4d6b552412fa1a6cb9555979dbad499259152b5a24439586105ce0417f629079b26775e9b
+ data.tar.gz: 41f8059f0af1f7eb1accccb18e33c111f7e75b7a69ffba97b26cd74de7349922ebdd852b87ea31f40c257cc57543b88cada9e32aea828f7f8d852adf20d3f328
data/.github/workflows/gradle.yml ADDED
@@ -0,0 +1,25 @@
+ name: Java CI
+
+ on: [push]
+
+ jobs:
+   build:
+
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v1
+       - name: Set up JDK 1.8
+         uses: actions/setup-java@v1
+         with:
+           java-version: 1.8
+       - name: Build with Gradle
+         run:
+           ./gradlew build
+       - name: Checkstyle & static check
+         run: |
+           ./gradlew --info checkstyle
+           ./gradlew --info check
+       # - name: Spotbugs
+       #   run: |
+       #     ./gradlew spotbugsMain spotbugsTest
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # Orc output plugin for Embulk

- [![Build Status](https://travis-ci.org/yuokada/embulk-output-orc.svg?branch=master)](https://travis-ci.org/yuokada/embulk-output-orc)
+ [![Build Status](https://github.com/yuokada/embulk-output-orc/workflows/Java%20CI/badge.svg)](https://github.com/yuokada/embulk-output-orc/actions)
  [![Gem Version](https://badge.fury.io/rb/embulk-output-orc.svg)](https://badge.fury.io/rb/embulk-output-orc)

  ## Overview
data/build.gradle CHANGED
@@ -2,7 +2,9 @@ plugins {
      id "com.jfrog.bintray" version "1.1"
      id "com.github.jruby-gradle.base" version "1.5.0"
      id "java"
+     id "scala"
      id "checkstyle"
+     // id "com.github.spotbugs" version "3.0.1"
      id "org.sonarqube" version "2.5"
  }
  import com.github.jrubygradle.JRubyExec
@@ -18,14 +20,15 @@ configurations {
      runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
  }

- version = "0.3.4"
+ version = "0.3.5"

  sourceCompatibility = 1.8
  targetCompatibility = 1.8

  dependencies {
-     compile "org.embulk:embulk-core:0.8.34"
-     provided "org.embulk:embulk-core:0.8.34"
+     compile "org.embulk:embulk-core:0.9.23"
+     provided "org.embulk:embulk-core:0.9.23"
+     compile "org.scala-lang:scala-library:2.12.+"

      compile "org.apache.orc:orc:1.5.4"
      compile "org.apache.orc:orc-core:1.5.4"
@@ -34,14 +37,24 @@ dependencies {
      compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
      compile "com.amazonaws:aws-java-sdk-s3:1.10.33"
      compile "org.apache.hadoop:hadoop-aws:2.7.5"
-     compile 'com.google.guava:guava:24.1-jre'

      testCompile 'org.jmockit:jmockit:1.38'
      // testCompile "junit:junit:4.+"
      testCompile 'org.hamcrest:hamcrest-core:1.3'
      testCompile 'org.testng:testng:6.14.2'
-     testCompile "org.embulk:embulk-core:0.8.34:tests"
-     testCompile "org.embulk:embulk-standards:0.8.34"
+     testCompile "org.embulk:embulk-core:0.8.39:tests"
+     testCompile "org.embulk:embulk-standards:0.8.39"
+ }
+
+ sourceSets {
+     main {
+         scala {
+             srcDirs = ['src/main/scala', 'src/main/java']
+         }
+         java {
+             srcDirs = []
+         }
+     }
  }

  task classpath(type: Copy, dependsOn: ["jar"]) {
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,5 @@
- #Wed Jan 09 23:08:09 JST 2019
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
+ distributionUrl=https\://services.gradle.org/distributions/gradle-5.6.4-bin.zip
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-4.10-all.zip
data/gradlew CHANGED
@@ -1,5 +1,21 @@
  #!/usr/bin/env sh

+ #
+ # Copyright 2015 the original author or authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
  ##############################################################################
  ##
  ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
  APP_BASE_NAME=`basename "$0"`

  # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- DEFAULT_JVM_OPTS=""
+ DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'

  # Use the maximum available, or set MAX_FD != -1 to use that value.
  MAX_FD="maximum"
data/gradlew.bat CHANGED
@@ -1,3 +1,19 @@
+ @rem
+ @rem Copyright 2015 the original author or authors.
+ @rem
+ @rem Licensed under the Apache License, Version 2.0 (the "License");
+ @rem you may not use this file except in compliance with the License.
+ @rem You may obtain a copy of the License at
+ @rem
+ @rem https://www.apache.org/licenses/LICENSE-2.0
+ @rem
+ @rem Unless required by applicable law or agreed to in writing, software
+ @rem distributed under the License is distributed on an "AS IS" BASIS,
+ @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ @rem See the License for the specific language governing permissions and
+ @rem limitations under the License.
+ @rem
+
  @if "%DEBUG%" == "" @echo off
  @rem ##########################################################################
  @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
  set APP_HOME=%DIRNAME%

  @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- set DEFAULT_JVM_OPTS=
+ set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"

  @rem Find java.exe
  if defined JAVA_HOME goto findJavaFromJavaHome
data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala ADDED
@@ -0,0 +1,42 @@
+ package org.embulk.output.orc
+
+ import java.nio.charset.StandardCharsets
+
+ import org.apache.hadoop.hive.ql.exec.vector._
+ import org.embulk.spi.{Column, ColumnVisitor, PageReader}
+
+ class OrcColumnVisitor(val reader: PageReader, val batch: VectorizedRowBatch, val i: Integer) extends ColumnVisitor {
+   override def booleanColumn(column: Column): Unit = if (reader.isNull(column)) {
+     batch.cols(column.getIndex).noNulls = false
+     batch.cols(column.getIndex).isNull(i) = true
+   }
+   else if (reader.getBoolean(column)) batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 1
+   else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 0
+
+   override def longColumn(column: Column): Unit = if (reader.isNull(column)) {
+     batch.cols(column.getIndex).noNulls = false
+     batch.cols(column.getIndex).isNull(i) = true
+   }
+   else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = reader.getLong(column)
+
+   override def doubleColumn(column: Column): Unit = if (reader.isNull(column)) {
+     batch.cols(column.getIndex).noNulls = false
+     batch.cols(column.getIndex).isNull(i) = true
+   }
+   else batch.cols(column.getIndex).asInstanceOf[DoubleColumnVector].vector(i) = reader.getDouble(column)
+
+   override def stringColumn(column: Column): Unit = if (!reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[BytesColumnVector].setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8))
+   else {
+     batch.cols(column.getIndex).noNulls = false
+     batch.cols(column.getIndex).isNull(i) = true
+   }
+
+   override def timestampColumn(column: Column): Unit = if (reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].setNullValue(i)
+   else {
+     val timestamp = reader.getTimestamp(column)
+     val ts = new java.sql.Timestamp(timestamp.getEpochSecond * 1000)
+     batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].set(i, ts)
+   }
+
+   override def jsonColumn(column: Column) = throw new UnsupportedOperationException("orc output plugin does not support json type")
+ }
data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala ADDED
@@ -0,0 +1,156 @@
+ package org.embulk.output.orc
+
+ import java.io.IOException
+ import java.util
+
+ import org.apache.hadoop.conf.Configuration
+ import org.apache.hadoop.fs.{LocalFileSystem, Path}
+ import org.apache.hadoop.hdfs.DistributedFileSystem
+ import org.apache.hadoop.util.VersionInfo
+ import org.apache.orc.{CompressionKind, MemoryManager, OrcFile, TypeDescription, Writer}
+ import org.embulk.config.{ConfigSource, TaskReport, TaskSource}
+ import org.embulk.spi.util.Timestamps
+ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema}
+ import org.embulk.util.aws.credentials.AwsCredentials
+
+ object OrcOutputPlugin {
+   private[orc] def getSchema(schema: Schema) = {
+     val oschema = TypeDescription.createStruct
+     for (i <- 0 until schema.size) {
+       val column = schema.getColumn(i)
+       val `type` = column.getType
+       `type`.getName match {
+         case "long" =>
+           oschema.addField(column.getName, TypeDescription.createLong)
+         case "double" =>
+           oschema.addField(column.getName, TypeDescription.createDouble)
+         case "boolean" =>
+           oschema.addField(column.getName, TypeDescription.createBoolean)
+         case "string" =>
+           oschema.addField(column.getName, TypeDescription.createString)
+         case "timestamp" =>
+           oschema.addField(column.getName, TypeDescription.createTimestamp)
+         case _ =>
+           System.out.println("Unsupported type")
+       }
+     }
+     oschema
+   }
+
+   // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
+   // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
+   // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
+   // notifies checkMemory() only to that instance.
+   private class WriterLocalMemoryManager extends MemoryManager {
+     final private[orc] val rowsBetweenChecks = 10000
+     private var rowsAddedSinceCheck = 0
+     private[orc] var boundCallback: MemoryManager.Callback = _
+
+     @throws[IOException]
+     override def addWriter(path: Path, requestedAllocation: Long, callback: MemoryManager.Callback): Unit = {
+       if (boundCallback != null) {
+         throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.")
+       } else {
+         boundCallback = callback
+       }
+     }
+
+     @throws[IOException]
+     override def removeWriter(path: Path): Unit = boundCallback = null
+
+     @throws[IOException]
+     override def addedRow(rows: Int): Unit = {
+       rowsAddedSinceCheck += rows
+       if (rowsAddedSinceCheck > rowsBetweenChecks) {
+         boundCallback.checkMemory(1)
+         rowsAddedSinceCheck = 0
+       }
+     }
+   }
+
+ }
+
+ class OrcOutputPlugin extends OutputPlugin {
+   override def transaction(config: ConfigSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = {
+     val task = config.loadConfig(classOf[PluginTask])
+     // retryable (idempotent) output:
+     // return resume(task.dump(), schema, taskCount, control);
+     // non-retryable (non-idempotent) output:
+     control.run(task.dump)
+     Exec.newConfigDiff
+   }
+
+   override def resume(taskSource: TaskSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = throw new UnsupportedOperationException("orc output plugin does not support resuming")
+
+   override def cleanup(taskSource: TaskSource, schema: Schema, taskCount: Int, successTaskReports: util.List[TaskReport]): Unit = {
+   }
+
+   override def open(taskSource: TaskSource, schema: Schema, taskIndex: Int) = {
+     val task = taskSource.loadTask(classOf[PluginTask])
+     if (task.getOverwrite) {
+       val credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials
+       OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task)
+     }
+     val reader = new PageReader(schema)
+     val writer = createWriter(task, schema, taskIndex)
+     new OrcTransactionalPageOutput(reader, writer, task)
+   }
+
+   private def buildPath(task: PluginTask, processorIndex: Int): String = {
+     val pathPrefix = task.getPathPrefix
+     val pathSuffix = task.getFileNameExtension
+     val sequenceFormat = task.getSequenceFormat
+     val fmt = java.lang.String.format(sequenceFormat, processorIndex.asInstanceOf[AnyRef])
+     pathPrefix + fmt + pathSuffix
+   }
+
+   private def getHadoopConfiguration(task: PluginTask) = {
+     val conf = new Configuration
+     // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
+     conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+     conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+     // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
+     AwsCredentials.getAWSCredentialsProvider(task)
+     if (task.getAccessKeyId.isPresent) {
+       conf.set("fs.s3a.access.key", task.getAccessKeyId.get)
+       conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId.get)
+     }
+     if (task.getSecretAccessKey.isPresent) {
+       conf.set("fs.s3a.secret.key", task.getSecretAccessKey.get)
+       conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey.get)
+     }
+     if (task.getEndpoint.isPresent) {
+       conf.set("fs.s3a.endpoint", task.getEndpoint.get)
+       conf.set("fs.s3n.endpoint", task.getEndpoint.get)
+     }
+     conf
+   }
+
+   private def createWriter(task: PluginTask, schema: Schema, processorIndex: Int): Writer = {
+     val timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+     val conf = getHadoopConfiguration(task)
+     val oschema = OrcOutputPlugin.getSchema(schema)
+     // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
+     Thread.currentThread.setContextClassLoader(classOf[VersionInfo].getClassLoader)
+
+     var writer: Writer = null
+     try { // Make writerOptions
+       val writerOptions = createWriterOptions(task, conf)
+       // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
+       // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
+       writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)), writerOptions.setSchema(oschema).memory(new OrcOutputPlugin.WriterLocalMemoryManager).version(OrcFile.Version.V_0_12))
+     } catch {
+       case e: IOException => throw e
+     }
+     writer
+   }
+
+   private def createWriterOptions(task: PluginTask, conf: Configuration) = {
+     val bufferSize = task.getBufferSize
+     val stripSize = task.getStripSize
+     val blockSize = task.getBlockSize
+     val kindString = task.getCompressionKind
+     val kind = CompressionKind.valueOf(kindString)
+     OrcFile.writerOptions(conf).bufferSize(bufferSize).blockSize(blockSize.toLong).stripeSize(stripSize.toLong).compress(kind)
+   }
+ }
data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala ADDED
@@ -0,0 +1,57 @@
+ package org.embulk.output.orc
+
+ import java.io.IOException
+ import java.nio.file.{Files, Paths}
+
+ import com.amazonaws.auth.profile.ProfileCredentialsProvider
+ import com.amazonaws.services.s3.AmazonS3Client
+ import com.amazonaws.services.s3.model.DeleteObjectRequest
+
+ import scala.beans.BeanProperty
+
+ object OrcOutputPluginHelper {
+   def removeOldFile(fpath: String, task: PluginTask): Unit = {
+     // NOTE: Delete a file if local-filesystem, not HDFS or S3.
+     val schema = getSchema(fpath)
+     if (isDeleteTarget(schema)) schema match {
+       case "file" =>
+         try Files.deleteIfExists(Paths.get(fpath))
+         catch {
+           case e: IOException => throw e
+         }
+       case "s3" | "s3n" | "s3a" =>
+         val s3Url = parseS3Url(fpath)
+         val s3client = new AmazonS3Client(new ProfileCredentialsProvider)
+         if (task.getEndpoint.isPresent) s3client.setEndpoint(task.getEndpoint.get)
+         s3client.deleteObject(new DeleteObjectRequest(s3Url.bucket, s3Url.key))
+       case _ =>
+     }
+   }
+
+   def isDeleteTarget(schema: String): Boolean = schema match {
+     case "file" => true
+     case "s3" | "s3n" | "s3a" => true
+     case _ => false
+   }
+
+   def getSchema(fpath: String): String = {
+     val schema = fpath.split("://").toList.head
+     schema match {
+       case "s3" | "s3a" | "s3n" => schema
+       case _ => {
+         val path = Paths.get(fpath)
+         path.getFileSystem.provider.getScheme
+       }
+     }
+   }
+
+   def parseS3Url(s3url: String): AmazonS3URILikeObject = {
+     val parts = s3url.split("(://|/)").toList
+     val bucket = parts.apply(1)
+     val key = parts.slice(2, parts.size).mkString("/")
+     OrcOutputPluginHelper.AmazonS3URILikeObject(bucket, key)
+   }
+
+   case class AmazonS3URILikeObject(@BeanProperty bucket: String, @BeanProperty key: String)
+
+ }
data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala ADDED
@@ -0,0 +1,52 @@
+ package org.embulk.output.orc
+
+ import java.io.IOException
+
+ import org.apache.orc.Writer
+ import org.embulk.config.TaskReport
+ import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
+
+ class OrcTransactionalPageOutput(val reader: PageReader, val writer: Writer, val task: PluginTask) extends TransactionalPageOutput {
+   override def add(page: Page): Unit = synchronized {
+     try {
+       // int size = page.getStringReferences().size();
+       val schema = OrcOutputPlugin.getSchema(reader.getSchema)
+       val batch = schema.createRowBatch
+       // batch.size = size;
+       reader.setPage(page)
+       while ( {
+         reader.nextRecord
+       }) {
+         val row = {
+           batch.size += 1;
+           batch.size - 1
+         }
+         reader.getSchema.visitColumns(new OrcColumnVisitor(reader, batch, row))
+         if (batch.size >= batch.getMaxSize) {
+           writer.addRowBatch(batch)
+           batch.reset()
+         }
+       }
+       if (batch.size != 0) {
+         writer.addRowBatch(batch)
+         batch.reset()
+       }
+     } catch {
+       case e: IOException =>
+         e.printStackTrace()
+     }
+   }
+
+   override def finish(): Unit = {
+     try writer.close()
+     catch {
+       case e: IOException => throw e
+     }
+   }
+
+   override def close(): Unit = {}
+
+   override def abort(): Unit = {}
+
+   override def commit: TaskReport = Exec.newTaskReport
+ }
data/src/main/scala/org/embulk/output/orc/PluginTask.scala ADDED
@@ -0,0 +1,56 @@
+ package org.embulk.output.orc
+
+ import java.util
+
+ import com.google.common.base.Optional
+ import org.embulk.config.{Config, ConfigDefault, Task}
+ import org.embulk.spi.time.TimestampFormatter
+ import org.embulk.util.aws.credentials.AwsCredentialsTask
+ import org.joda.time.DateTimeZone
+
+ trait PluginTask extends Task with TimestampFormatter.Task with AwsCredentialsTask {
+   @Config("path_prefix")
+   def getPathPrefix: String
+
+   @Config("file_ext")
+   @ConfigDefault("\".orc\"")
+   def getFileNameExtension: String
+
+   @Config("column_options")
+   @ConfigDefault("{}")
+   def getColumnOptions: util.Map[String, TimestampColumnOption]
+
+   @Config("sequence_format")
+   @ConfigDefault("\".%03d\"")
+   def getSequenceFormat: String
+
+   // see: https://orc.apache.org/docs/hive-config.html
+   // ORC File options
+   @Config("strip_size")
+   @ConfigDefault("67108864") // 64MB
+   def getStripSize: Integer
+
+   @Config("buffer_size")
+   @ConfigDefault("262144") // 256KB
+   def getBufferSize: Integer
+
+   @Config("block_size")
+   @ConfigDefault("268435456") // 256MB
+   def getBlockSize: Integer
+
+   @Config("compression_kind")
+   @ConfigDefault("ZLIB")
+   def getCompressionKind: String
+
+   @Config("overwrite")
+   @ConfigDefault("false")
+   def getOverwrite: Boolean
+
+   @Config("default_from_timezone")
+   @ConfigDefault("\"UTC\"")
+   def getDefaultFromTimeZone: DateTimeZone
+
+   @Config("endpoint")
+   @ConfigDefault("null")
+   def getEndpoint: Optional[String]
+ }
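
For reference, the @Config keys declared in PluginTask above become the keys of the plugin's out: section in an Embulk config. A minimal sketch, assuming type: orc (from lib/embulk/output/orc.rb) and an illustrative local path prefix; the other values shown are just the annotation defaults:

    out:
      type: orc
      path_prefix: /tmp/embulk-example/out    # required; files become <prefix><sequence_format><file_ext>
      file_ext: .orc                          # default ".orc"
      sequence_format: ".%03d"                # default ".%03d"
      compression_kind: ZLIB                  # passed to CompressionKind.valueOf, e.g. NONE, ZLIB, SNAPPY
      buffer_size: 262144                     # 256KB
      strip_size: 67108864                    # 64MB
      block_size: 268435456                   # 256MB
      overwrite: false
      default_from_timezone: UTC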
data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala ADDED
@@ -0,0 +1,32 @@
+ package org.embulk.output.orc
+
+ import java.util
+
+ import com.google.common.base.Optional
+ import org.embulk.config.{Config, ConfigDefault, Task}
+ import org.embulk.spi.time.TimestampFormatter
+ import org.joda.time.DateTimeZone
+
+ /*
+ public interface TimestampColumnOption
+         extends Task, TimestampFormatter.TimestampColumnOption
+ {
+     @Config("from_timezone")
+     @ConfigDefault("null")
+     Optional<DateTimeZone> getFromTimeZone();
+
+     @Config("from_format")
+     @ConfigDefault("null")
+     Optional<List<String>> getFromFormat();
+ }
+ */
+
+ trait TimestampColumnOption extends Task with TimestampFormatter.TimestampColumnOption {
+   @Config("from_timezone")
+   @ConfigDefault("null")
+   def getFromTimeZone: Optional[DateTimeZone]
+
+   @Config("from_format")
+   @ConfigDefault("null")
+   def getFromFormat: Optional[util.List[String]]
+ }
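
These per-column options feed the column_options map returned by getColumnOptions in PluginTask. A hedged sketch, assuming a hypothetical timestamp column named created_at; the column name and format string are illustrative only:

    out:
      type: orc
      path_prefix: /tmp/embulk-example/out
      column_options:
        created_at:
          from_timezone: "Asia/Tokyo"           # Optional<DateTimeZone>, default null
          from_format: ["%Y-%m-%d %H:%M:%S"]    # Optional<List<String>>, default null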
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-orc
  version: !ruby/object:Gem::Version
- version: 0.3.4
+ version: 0.3.5
  platform: ruby
  authors:
  - yuokada
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-02-25 00:00:00.000000000 Z
+ date: 2020-08-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
@@ -45,13 +45,12 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".github/workflows/gradle.yml"
  - ".gitignore"
- - ".travis.yml"
  - LICENSE.txt
  - README.md
  - build.gradle
  - classpath/aircompressor-0.10.jar
- - classpath/animal-sniffer-annotations-1.14.jar
  - classpath/apacheds-i18n-2.0.0-M15.jar
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
  - classpath/api-asn1-api-1.0.0-M20.jar
@@ -62,7 +61,6 @@ files:
  - classpath/aws-java-sdk-core-1.10.33.jar
  - classpath/aws-java-sdk-kms-1.10.33.jar
  - classpath/aws-java-sdk-s3-1.10.33.jar
- - classpath/checker-compat-qual-2.0.0.jar
  - classpath/commons-beanutils-1.7.0.jar
  - classpath/commons-cli-1.2.jar
  - classpath/commons-codec-1.6.jar
@@ -80,11 +78,9 @@ files:
  - classpath/curator-client-2.7.1.jar
  - classpath/curator-framework-2.7.1.jar
  - classpath/curator-recipes-2.7.1.jar
- - classpath/embulk-output-orc-0.3.4.jar
+ - classpath/embulk-output-orc-0.3.5.jar
  - classpath/embulk-util-aws-credentials-0.2.8.jar
- - classpath/error_prone_annotations-2.1.3.jar
  - classpath/gson-2.2.4.jar
- - classpath/guava-24.1-jre.jar
  - classpath/hadoop-annotations-2.7.5.jar
  - classpath/hadoop-auth-2.7.5.jar
  - classpath/hadoop-aws-2.7.5.jar
@@ -94,7 +90,6 @@ files:
  - classpath/htrace-core-3.1.0-incubating.jar
  - classpath/httpclient-4.3.6.jar
  - classpath/httpcore-4.3.3.jar
- - classpath/j2objc-annotations-1.1.jar
  - classpath/jackson-core-asl-1.9.13.jar
  - classpath/jackson-jaxrs-1.8.3.jar
  - classpath/jackson-mapper-asl-1.9.13.jar
@@ -123,6 +118,7 @@ files:
  - classpath/orc-shims-1.5.4.jar
  - classpath/paranamer-2.3.jar
  - classpath/protobuf-java-2.5.0.jar
+ - classpath/scala-library-2.12.12.jar
  - classpath/servlet-api-2.5-20081211.jar
  - classpath/servlet-api-2.5.jar
  - classpath/snappy-java-1.0.4.1.jar
@@ -139,11 +135,12 @@ files:
  - gradlew
  - gradlew.bat
  - lib/embulk/output/orc.rb
- - src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
- - src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
- - src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
- - src/main/java/org/embulk/output/orc/PluginTask.java
- - src/main/java/org/embulk/output/orc/TimestampColumnOption.java
+ - src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
+ - src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
+ - src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala
+ - src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala
+ - src/main/scala/org/embulk/output/orc/PluginTask.scala
+ - src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala
  - src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java
  - src/test/resources/example-null.yml
  - src/test/resources/example.yml
data/.travis.yml DELETED
@@ -1,14 +0,0 @@
- language: java
- jdk:
-   - oraclejdk8
-
- cache:
-   directories: # run "travis cache --delete" to delete caches
-     - $HOME/.gradle
-
- sudo: false
- script:
-   - ./gradlew --info checkstyle
-   - ./gradlew --info check
-
- after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java DELETED
@@ -1,101 +0,0 @@
- package org.embulk.output.orc;
-
- import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.embulk.spi.Column;
- import org.embulk.spi.ColumnVisitor;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.time.Timestamp;
-
- import java.nio.charset.StandardCharsets;
-
- public class OrcColumnVisitor
-         implements ColumnVisitor
- {
-     private final PageReader reader;
-     private final VectorizedRowBatch batch;
-     private final Integer i;
-
-     public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-     {
-         this.reader = pageReader;
-         this.batch = rowBatch;
-         this.i = i;
-     }
-
-     @Override
-     public void booleanColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             batch.cols[column.getIndex()].noNulls = false;
-             batch.cols[column.getIndex()].isNull[i] = true;
-         }
-         else {
-             if (reader.getBoolean(column)) {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-             }
-             else {
-                 ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-             }
-         }
-     }
-
-     @Override
-     public void longColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             batch.cols[column.getIndex()].noNulls = false;
-             batch.cols[column.getIndex()].isNull[i] = true;
-         }
-         else {
-             ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-         }
-     }
-
-     @Override
-     public void doubleColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             batch.cols[column.getIndex()].noNulls = false;
-             batch.cols[column.getIndex()].isNull[i] = true;
-         }
-         else {
-             ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-         }
-     }
-
-     @Override
-     public void stringColumn(Column column)
-     {
-         if (!reader.isNull(column)) {
-             ((BytesColumnVector) batch.cols[column.getIndex()])
-                     .setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8));
-         }
-         else {
-             batch.cols[column.getIndex()].noNulls = false;
-             batch.cols[column.getIndex()].isNull[i] = true;
-         }
-     }
-
-     @Override
-     public void timestampColumn(Column column)
-     {
-         if (reader.isNull(column)) {
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-         }
-         else {
-             Timestamp timestamp = reader.getTimestamp(column);
-             java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-             ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-         }
-     }
-
-     @Override
-     public void jsonColumn(Column column)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support json type");
-     }
- }
data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java DELETED
@@ -1,298 +0,0 @@
- package org.embulk.output.orc;
-
- import com.amazonaws.auth.AWSCredentials;
- import com.google.common.base.Throwables;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.LocalFileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hdfs.DistributedFileSystem;
- import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
- import org.apache.hadoop.util.VersionInfo;
- import org.apache.orc.CompressionKind;
- import org.apache.orc.MemoryManager;
- import org.apache.orc.OrcFile;
- import org.apache.orc.TypeDescription;
- import org.apache.orc.Writer;
- import org.embulk.config.ConfigDiff;
- import org.embulk.config.ConfigSource;
- import org.embulk.config.TaskReport;
- import org.embulk.config.TaskSource;
- import org.embulk.spi.Column;
- import org.embulk.spi.Exec;
- import org.embulk.spi.OutputPlugin;
- import org.embulk.spi.Page;
- import org.embulk.spi.PageReader;
- import org.embulk.spi.Schema;
- import org.embulk.spi.TransactionalPageOutput;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.spi.type.Type;
- import org.embulk.spi.util.Timestamps;
- import org.embulk.util.aws.credentials.AwsCredentials;
-
- import java.io.IOException;
- import java.util.List;
-
- public class OrcOutputPlugin
-         implements OutputPlugin
- {
-     @Override
-     public ConfigDiff transaction(ConfigSource config,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         PluginTask task = config.loadConfig(PluginTask.class);
-
-         // retryable (idempotent) output:
-         // return resume(task.dump(), schema, taskCount, control);
-
-         // non-retryable (non-idempotent) output:
-         control.run(task.dump());
-         return Exec.newConfigDiff();
-     }
-
-     @Override
-     public ConfigDiff resume(TaskSource taskSource,
-             Schema schema, int taskCount,
-             OutputPlugin.Control control)
-     {
-         throw new UnsupportedOperationException("orc output plugin does not support resuming");
-     }
-
-     @Override
-     public void cleanup(TaskSource taskSource,
-             Schema schema, int taskCount,
-             List<TaskReport> successTaskReports)
-     {
-     }
-
-     @Override
-     public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-     {
-         PluginTask task = taskSource.loadTask(PluginTask.class);
-
-         if (task.getOverwrite()) {
-             AWSCredentials credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials();
-             OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task);
-         }
-
-         final PageReader reader = new PageReader(schema);
-         Writer writer = createWriter(task, schema, taskIndex);
-
-         return new OrcTransactionalPageOutput(reader, writer, task);
-     }
-
-     private String buildPath(PluginTask task, int processorIndex)
-     {
-         final String pathPrefix = task.getPathPrefix();
-         final String pathSuffix = task.getFileNameExtension();
-         final String sequenceFormat = task.getSequenceFormat();
-         return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-     }
-
-     private TypeDescription getSchema(Schema schema)
-     {
-         TypeDescription oschema = TypeDescription.createStruct();
-         for (int i = 0; i < schema.size(); i++) {
-             Column column = schema.getColumn(i);
-             Type type = column.getType();
-             switch (type.getName()) {
-                 case "long":
-                     oschema.addField(column.getName(), TypeDescription.createLong());
-                     break;
-                 case "double":
-                     oschema.addField(column.getName(), TypeDescription.createDouble());
-                     break;
-                 case "boolean":
-                     oschema.addField(column.getName(), TypeDescription.createBoolean());
-                     break;
-                 case "string":
-                     oschema.addField(column.getName(), TypeDescription.createString());
-                     break;
-                 case "timestamp":
-                     oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                     break;
-                 default:
-                     System.out.println("Unsupported type");
-                     break;
-             }
-         }
-         return oschema;
-     }
-
-     private Configuration getHadoopConfiguration(PluginTask task)
-     {
-         Configuration conf = new Configuration();
-
-         // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-         conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-         conf.set("fs.file.impl", LocalFileSystem.class.getName());
-         // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-         AwsCredentials.getAWSCredentialsProvider(task);
-         if (task.getAccessKeyId().isPresent()) {
-             conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-             conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-         }
-         if (task.getSecretAccessKey().isPresent()) {
-             conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-             conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-         }
-         if (task.getEndpoint().isPresent()) {
-             conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-             conf.set("fs.s3n.endpoint", task.getEndpoint().get());
-         }
-         return conf;
-     }
-
-     private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-     {
-         final TimestampFormatter[] timestampFormatters = Timestamps
-                 .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-         Configuration conf = getHadoopConfiguration(task);
-         TypeDescription oschema = getSchema(schema);
-
-         // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-         Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-         Writer writer = null;
-         try {
-             // Make writerOptions
-             OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-             // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-             // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-             writer = OrcFile.createWriter(
-                     new Path(buildPath(task, processorIndex)),
-                     writerOptions.setSchema(oschema)
-                             .memory(new WriterLocalMemoryManager())
-                             .version(OrcFile.Version.V_0_12)
-             );
-         }
-         catch (IOException e) {
-             Throwables.propagate(e);
-         }
-         return writer;
-     }
-
-     private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-     {
-         final Integer bufferSize = task.getBufferSize();
-         final Integer stripSize = task.getStripSize();
-         final Integer blockSize = task.getBlockSize();
-         final String kindString = task.getCompressionKind();
-         CompressionKind kind = CompressionKind.valueOf(kindString);
-         return OrcFile.writerOptions(conf)
-                 .bufferSize(bufferSize)
-                 .blockSize(blockSize)
-                 .stripeSize(stripSize)
-                 .compress(kind);
-     }
-
-     class OrcTransactionalPageOutput
-             implements TransactionalPageOutput
-     {
-         private final PageReader reader;
-         private final Writer writer;
-
-         public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-         {
-             this.reader = reader;
-             this.writer = writer;
-         }
-
-         @Override
-         public void add(Page page)
-         {
-             try {
-                 // int size = page.getStringReferences().size();
-                 final TypeDescription schema = getSchema(reader.getSchema());
-                 final VectorizedRowBatch batch = schema.createRowBatch();
-                 // batch.size = size;
-
-                 reader.setPage(page);
-                 while (reader.nextRecord()) {
-                     final int row = batch.size++;
-                     reader.getSchema().visitColumns(
-                             new OrcColumnVisitor(reader, batch, row)
-                     );
-                     if (batch.size >= batch.getMaxSize()) {
-                         writer.addRowBatch(batch);
-                         batch.reset();
-                     }
-                 }
-                 if (batch.size != 0) {
-                     writer.addRowBatch(batch);
-                     batch.reset();
-                 }
-             }
-             catch (IOException e) {
-                 e.printStackTrace();
-             }
-         }
-
-         @Override
-         public void finish()
-         {
-             try {
-                 writer.close();
-             }
-             catch (IOException e) {
-                 Throwables.propagate(e);
-             }
-         }
-
-         @Override
-         public void close()
-         {
-         }
-
-         @Override
-         public void abort()
-         {
-         }
-
-         @Override
-         public TaskReport commit()
-         {
-             return Exec.newTaskReport();
-         }
-     }
-
-     // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
-     // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
-     // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
-     // notifies checkMemory() only to that instance.
-     private static class WriterLocalMemoryManager implements MemoryManager
-     {
-         final long rowsBetweenChecks = 10000;
-
-         private int rowsAddedSinceCheck = 0;
-         Callback boundCallback = null;
-
-         @Override
-         public void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException
-         {
-             if (boundCallback != null) {
-                 throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.");
-             }
-
-             boundCallback = callback;
-         }
-
-         @Override
-         public void removeWriter(Path path) throws IOException
-         {
-             boundCallback = null;
-         }
-
-         @Override
-         public void addedRow(int rows) throws IOException
-         {
-             rowsAddedSinceCheck += rows;
-             if (rowsAddedSinceCheck > rowsBetweenChecks) {
-                 boundCallback.checkMemory(1);
-                 rowsAddedSinceCheck = 0;
-             }
-         }
-     }
- }
data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java DELETED
@@ -1,111 +0,0 @@
- package org.embulk.output.orc;
-
- import com.amazonaws.auth.profile.ProfileCredentialsProvider;
- import com.amazonaws.services.s3.AmazonS3;
- import com.amazonaws.services.s3.AmazonS3Client;
- import com.amazonaws.services.s3.model.DeleteObjectRequest;
- import com.google.common.base.Joiner;
- import com.google.common.base.Splitter;
- import com.google.common.base.Throwables;
-
- import java.io.IOException;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
- import java.util.Arrays;
- import java.util.List;
-
- class OrcOutputPluginHelper
- {
-     protected OrcOutputPluginHelper()
-     {
-         throw new UnsupportedOperationException();
-     }
-
-     static void removeOldFile(String fpath, PluginTask task)
-     {
-         // NOTE: Delete a file if local-filesystem, not HDFS or S3.
-         String schema = getSchema(fpath);
-         if (isDeleteTarget(schema)) {
-             switch (schema) {
-                 case "file":
-                     try {
-                         Files.deleteIfExists(Paths.get(fpath));
-                     }
-                     catch (IOException e) {
-                         Throwables.propagate(e);
-                     }
-                     break;
-                 case "s3":
-                 case "s3n":
-                 case "s3a":
-                     AmazonS3URILikeObject s3Url = parseS3Url(fpath);
-                     AmazonS3 s3client = new AmazonS3Client(new ProfileCredentialsProvider());
-                     if (task.getEndpoint().isPresent()) {
-                         s3client.setEndpoint(task.getEndpoint().get());
-                     }
-                     s3client.deleteObject(new DeleteObjectRequest(s3Url.getBucket(), s3Url.getKey()));
-                 default:
-                     // TODO: Unsupported
-             }
-         }
-     }
-
-     public static boolean isDeleteTarget(String schema)
-     {
-         switch (schema) {
-             case "file":
-                 return true;
-             case "s3":
-             case "s3a":
-             case "s3n":
-                 return true;
-             default:
-                 return false;
-         }
-     }
-
-     static String getSchema(String fpath)
-     {
-         String schema = Splitter.on("://")
-                 .splitToList(fpath).get(0);
-         if (schema.equals("s3a") || schema.equals("s3n") || schema.equals("s3")) {
-             return schema;
-         }
-         else {
-             Path path = Paths.get(fpath);
-             return path.getFileSystem().provider().getScheme();
-         }
-     }
-
-     static AmazonS3URILikeObject parseS3Url(String s3url)
-     {
-         List<String> parts = Arrays.asList(
-                 s3url.split("(://|/)"));
-         String bucket = parts.get(1);
-         String key = Joiner.on("/").join(parts.subList(2, parts.size()));
-         return new AmazonS3URILikeObject(bucket, key);
-     }
-
-     static class AmazonS3URILikeObject
-     {
-         String bucket;
-         String key;
-
-         public AmazonS3URILikeObject(String bucket, String key)
-         {
-             this.bucket = bucket;
-             this.key = key;
-         }
-
-         public String getBucket()
-         {
-             return bucket;
-         }
-
-         public String getKey()
-         {
-             return key;
-         }
-     }
- }
data/src/main/java/org/embulk/output/orc/PluginTask.java DELETED
@@ -1,60 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.embulk.util.aws.credentials.AwsCredentialsTask;
- import org.joda.time.DateTimeZone;
-
- import java.util.Map;
-
- public interface PluginTask
-         extends Task, TimestampFormatter.Task, AwsCredentialsTask
- {
-     @Config("path_prefix")
-     String getPathPrefix();
-
-     @Config("file_ext")
-     @ConfigDefault("\".orc\"")
-     String getFileNameExtension();
-
-     @Config("column_options")
-     @ConfigDefault("{}")
-     Map<String, TimestampColumnOption> getColumnOptions();
-
-     @Config("sequence_format")
-     @ConfigDefault("\".%03d\"")
-     String getSequenceFormat();
-
-     // see: https://orc.apache.org/docs/hive-config.html
-     // ORC File options
-     @Config("strip_size")
-     @ConfigDefault("67108864") // 64MB
-     Integer getStripSize();
-
-     @Config("buffer_size")
-     @ConfigDefault("262144") // 256KB
-     Integer getBufferSize();
-
-     @Config("block_size")
-     @ConfigDefault("268435456") // 256MB
-     Integer getBlockSize();
-
-     @Config("compression_kind")
-     @ConfigDefault("ZLIB")
-     public String getCompressionKind();
-
-     @Config("overwrite")
-     @ConfigDefault("false")
-     boolean getOverwrite();
-
-     @Config("default_from_timezone")
-     @ConfigDefault("\"UTC\"")
-     DateTimeZone getDefaultFromTimeZone();
-
-     @Config("endpoint")
-     @ConfigDefault("null")
-     Optional<String> getEndpoint();
- }
data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java DELETED
@@ -1,22 +0,0 @@
- package org.embulk.output.orc;
-
- import com.google.common.base.Optional;
- import org.embulk.config.Config;
- import org.embulk.config.ConfigDefault;
- import org.embulk.config.Task;
- import org.embulk.spi.time.TimestampFormatter;
- import org.joda.time.DateTimeZone;
-
- import java.util.List;
-
- public interface TimestampColumnOption
-         extends Task, TimestampFormatter.TimestampColumnOption
- {
-     @Config("from_timezone")
-     @ConfigDefault("null")
-     Optional<DateTimeZone> getFromTimeZone();
-
-     @Config("from_format")
-     @ConfigDefault("null")
-     Optional<List<String>> getFromFormat();
- }