embulk-output-orc 0.3.4 → 0.3.5
- checksums.yaml +4 -4
- data/.github/workflows/gradle.yml +25 -0
- data/README.md +1 -1
- data/build.gradle +19 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -2
- data/gradlew +17 -1
- data/gradlew.bat +17 -1
- data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala +42 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala +156 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala +57 -0
- data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala +52 -0
- data/src/main/scala/org/embulk/output/orc/PluginTask.scala +56 -0
- data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala +32 -0
- metadata +11 -14
- data/.travis.yml +0 -14
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +0 -101
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +0 -298
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +0 -111
- data/src/main/java/org/embulk/output/orc/PluginTask.java +0 -60
- data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6ecb39bb650455937f641f073e9e0b13338f268b
+  data.tar.gz: 393ef796dfdf47239a11186b33988466432b0d02
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee733e3cca10bfff236c7ff24d3249a1f8a30629ba314a9b840a2de4d6b552412fa1a6cb9555979dbad499259152b5a24439586105ce0417f629079b26775e9b
+  data.tar.gz: 41f8059f0af1f7eb1accccb18e33c111f7e75b7a69ffba97b26cd74de7349922ebdd852b87ea31f40c257cc57543b88cada9e32aea828f7f8d852adf20d3f328
data/.github/workflows/gradle.yml
ADDED
@@ -0,0 +1,25 @@
+name: Java CI
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Build with Gradle
+      run:
+        ./gradlew build
+    - name: Checkstyle & static check
+      run: |
+        ./gradlew --info checkstyle
+        ./gradlew --info check
+    # - name: Spotbugs
+    #   run: |
+    #     ./gradlew spotbugsMain spotbugsTest
data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Orc output plugin for Embulk
 
-[![Build Status](https://
+[![Build Status](https://github.com/yuokada/embulk-output-orc/workflows/Java%20CI/badge.svg)](https://github.com/yuokada/embulk-output-orc/actions)
 [![Gem Version](https://badge.fury.io/rb/embulk-output-orc.svg)](https://badge.fury.io/rb/embulk-output-orc)
 
 ## Overview
data/build.gradle
CHANGED
@@ -2,7 +2,9 @@ plugins {
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
     id "java"
+    id "scala"
     id "checkstyle"
+    // id "com.github.spotbugs" version "3.0.1"
     id "org.sonarqube" version "2.5"
 }
 import com.github.jrubygradle.JRubyExec
@@ -18,14 +20,15 @@ configurations {
     runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
 }
 
-version = "0.3.
+version = "0.3.5"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.
-    provided "org.embulk:embulk-core:0.
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
+    compile "org.scala-lang:scala-library:2.12.+"
 
     compile "org.apache.orc:orc:1.5.4"
     compile "org.apache.orc:orc-core:1.5.4"
@@ -34,14 +37,24 @@ dependencies {
     compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
     compile "com.amazonaws:aws-java-sdk-s3:1.10.33"
     compile "org.apache.hadoop:hadoop-aws:2.7.5"
-    compile 'com.google.guava:guava:24.1-jre'
 
     testCompile 'org.jmockit:jmockit:1.38'
     // testCompile "junit:junit:4.+"
     testCompile 'org.hamcrest:hamcrest-core:1.3'
     testCompile 'org.testng:testng:6.14.2'
-    testCompile "org.embulk:embulk-core:0.8.
-    testCompile "org.embulk:embulk-standards:0.8.
+    testCompile "org.embulk:embulk-core:0.8.39:tests"
+    testCompile "org.embulk:embulk-standards:0.8.39"
+}
+
+sourceSets {
+    main {
+        scala {
+            srcDirs = ['src/main/scala', 'src/main/java']
+        }
+        java {
+            srcDirs = []
+        }
+    }
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,6 +1,5 @@
-#Wed Jan 09 23:08:09 JST 2019
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-5.6.4-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.10-all.zip
data/gradlew
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
data/gradlew.bat
CHANGED
@@ -1,3 +1,19 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem      https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
 @if "%DEBUG%" == "" @echo off
 @rem ##########################################################################
 @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
 @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
 
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
ADDED
@@ -0,0 +1,42 @@
+package org.embulk.output.orc
+
+import java.nio.charset.StandardCharsets
+
+import org.apache.hadoop.hive.ql.exec.vector._
+import org.embulk.spi.{Column, ColumnVisitor, PageReader}
+
+class OrcColumnVisitor(val reader: PageReader, val batch: VectorizedRowBatch, val i: Integer) extends ColumnVisitor {
+  override def booleanColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else if (reader.getBoolean(column)) batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 1
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 0
+
+  override def longColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = reader.getLong(column)
+
+  override def doubleColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[DoubleColumnVector].vector(i) = reader.getDouble(column)
+
+  override def stringColumn(column: Column): Unit = if (!reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[BytesColumnVector].setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8))
+  else {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+
+  override def timestampColumn(column: Column): Unit = if (reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].setNullValue(i)
+  else {
+    val timestamp = reader.getTimestamp(column)
+    val ts = new java.sql.Timestamp(timestamp.getEpochSecond * 1000)
+    batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].set(i, ts)
+  }
+
+  override def jsonColumn(column: Column) = throw new UnsupportedOperationException("orc output plugin does not support json type")
+}
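One detail worth noting in the timestampColumn branch above: the ORC TimestampColumnVector is populated from whole epoch seconds only, so any sub-second part of the Embulk timestamp is dropped. A minimal sketch of that arithmetic (the input value is hypothetical):

    // Hypothetical Embulk timestamp 2020-08-01T00:00:00.123456789Z -> epoch second 1596240000
    val epochSecond = 1596240000L
    val ts = new java.sql.Timestamp(epochSecond * 1000) // milliseconds; the .123456789 fraction is not carried over
    println(ts.getTime) // 1596240000000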
data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
ADDED
@@ -0,0 +1,156 @@
+package org.embulk.output.orc
+
+import java.io.IOException
+import java.util
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.util.VersionInfo
+import org.apache.orc.{CompressionKind, MemoryManager, OrcFile, TypeDescription, Writer}
+import org.embulk.config.{ConfigSource, TaskReport, TaskSource}
+import org.embulk.spi.util.Timestamps
+import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema}
+import org.embulk.util.aws.credentials.AwsCredentials
+
+object OrcOutputPlugin {
+  private[orc] def getSchema(schema: Schema) = {
+    val oschema = TypeDescription.createStruct
+    for (i <- 0 until schema.size) {
+      val column = schema.getColumn(i)
+      val `type` = column.getType
+      `type`.getName match {
+        case "long" =>
+          oschema.addField(column.getName, TypeDescription.createLong)
+        case "double" =>
+          oschema.addField(column.getName, TypeDescription.createDouble)
+        case "boolean" =>
+          oschema.addField(column.getName, TypeDescription.createBoolean)
+        case "string" =>
+          oschema.addField(column.getName, TypeDescription.createString)
+        case "timestamp" =>
+          oschema.addField(column.getName, TypeDescription.createTimestamp)
+        case _ =>
+          System.out.println("Unsupported type")
+      }
+    }
+    oschema
+  }
+
+  // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
+  // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
+  // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
+  // notifies checkMemory() only to that instance.
+  private class WriterLocalMemoryManager extends MemoryManager {
+    final private[orc] val rowsBetweenChecks = 10000
+    private var rowsAddedSinceCheck = 0
+    private[orc] var boundCallback: MemoryManager.Callback = _
+
+    @throws[IOException]
+    override def addWriter(path: Path, requestedAllocation: Long, callback: MemoryManager.Callback): Unit = {
+      if (boundCallback != null) {
+        throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.")
+      } else {
+        boundCallback = callback
+      }
+    }
+
+    @throws[IOException]
+    override def removeWriter(path: Path): Unit = boundCallback = null
+
+    @throws[IOException]
+    override def addedRow(rows: Int): Unit = {
+      rowsAddedSinceCheck += rows
+      if (rowsAddedSinceCheck > rowsBetweenChecks) {
+        boundCallback.checkMemory(1)
+        rowsAddedSinceCheck = 0
+      }
+    }
+  }
+
+}
+
+class OrcOutputPlugin extends OutputPlugin {
+  override def transaction(config: ConfigSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = {
+    val task = config.loadConfig(classOf[PluginTask])
+    // retryable (idempotent) output:
+    // return resume(task.dump(), schema, taskCount, control);
+    // non-retryable (non-idempotent) output:
+    control.run(task.dump)
+    Exec.newConfigDiff
+  }
+
+  override def resume(taskSource: TaskSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = throw new UnsupportedOperationException("orc output plugin does not support resuming")
+
+  override def cleanup(taskSource: TaskSource, schema: Schema, taskCount: Int, successTaskReports: util.List[TaskReport]): Unit = {
+  }
+
+  override def open(taskSource: TaskSource, schema: Schema, taskIndex: Int) = {
+    val task = taskSource.loadTask(classOf[PluginTask])
+    if (task.getOverwrite) {
+      val credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials
+      OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task)
+    }
+    val reader = new PageReader(schema)
+    val writer = createWriter(task, schema, taskIndex)
+    new OrcTransactionalPageOutput(reader, writer, task)
+  }
+
+  private def buildPath(task: PluginTask, processorIndex: Int): String = {
+    val pathPrefix = task.getPathPrefix
+    val pathSuffix = task.getFileNameExtension
+    val sequenceFormat = task.getSequenceFormat
+    val fmt = java.lang.String.format(sequenceFormat, processorIndex.asInstanceOf[AnyRef])
+    pathPrefix + fmt + pathSuffix
+  }
+
+  private def getHadoopConfiguration(task: PluginTask) = {
+    val conf = new Configuration
+    // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
+    conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+    conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+    // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
+    AwsCredentials.getAWSCredentialsProvider(task)
+    if (task.getAccessKeyId.isPresent) {
+      conf.set("fs.s3a.access.key", task.getAccessKeyId.get)
+      conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId.get)
+    }
+    if (task.getSecretAccessKey.isPresent) {
+      conf.set("fs.s3a.secret.key", task.getSecretAccessKey.get)
+      conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey.get)
+    }
+    if (task.getEndpoint.isPresent) {
+      conf.set("fs.s3a.endpoint", task.getEndpoint.get)
+      conf.set("fs.s3n.endpoint", task.getEndpoint.get)
+    }
+    conf
+  }
+
+  private def createWriter(task: PluginTask, schema: Schema, processorIndex: Int): Writer = {
+    val timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+    val conf = getHadoopConfiguration(task)
+    val oschema = OrcOutputPlugin.getSchema(schema)
+    // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
+    Thread.currentThread.setContextClassLoader(classOf[VersionInfo].getClassLoader)
+
+    var writer: Writer = null
+    try { // Make writerOptions
+      val writerOptions = createWriterOptions(task, conf)
+      // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
+      // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
+      writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)), writerOptions.setSchema(oschema).memory(new OrcOutputPlugin.WriterLocalMemoryManager).version(OrcFile.Version.V_0_12))
+    } catch {
+      case e: IOException => throw e
+    }
+    writer
+  }
+
+  private def createWriterOptions(task: PluginTask, conf: Configuration) = {
+    val bufferSize = task.getBufferSize
+    val stripSize = task.getStripSize
+    val blockSize = task.getBlockSize
+    val kindString = task.getCompressionKind
+    val kind = CompressionKind.valueOf(kindString)
+    OrcFile.writerOptions(conf).bufferSize(bufferSize).blockSize(blockSize.toLong).stripeSize(stripSize.toLong).compress(kind)
+  }
+}
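As a quick illustration of how buildPath above combines the path_prefix, sequence_format, and file_ext settings from PluginTask (a sketch with hypothetical values; ".%03d" and ".orc" are the defaults declared in PluginTask.scala below):

    val pathPrefix = "/tmp/embulk-orc/out"   // path_prefix (hypothetical)
    val sequenceFormat = ".%03d"             // sequence_format default
    val fileExt = ".orc"                     // file_ext default
    val taskIndex = 7
    val path = pathPrefix + java.lang.String.format(sequenceFormat, Int.box(taskIndex)) + fileExt
    println(path) // /tmp/embulk-orc/out.007.orc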
data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala
ADDED
@@ -0,0 +1,57 @@
+package org.embulk.output.orc
+
+import java.io.IOException
+import java.nio.file.{Files, Paths}
+
+import com.amazonaws.auth.profile.ProfileCredentialsProvider
+import com.amazonaws.services.s3.AmazonS3Client
+import com.amazonaws.services.s3.model.DeleteObjectRequest
+
+import scala.beans.BeanProperty
+
+object OrcOutputPluginHelper {
+  def removeOldFile(fpath: String, task: PluginTask): Unit = {
+    // NOTE: Delete a file if local-filesystem, not HDFS or S3.
+    val schema = getSchema(fpath)
+    if (isDeleteTarget(schema)) schema match {
+      case "file" =>
+        try Files.deleteIfExists(Paths.get(fpath))
+        catch {
+          case e: IOException => throw e
+        }
+      case "s3" | "s3n" | "s3a" =>
+        val s3Url = parseS3Url(fpath)
+        val s3client = new AmazonS3Client(new ProfileCredentialsProvider)
+        if (task.getEndpoint.isPresent) s3client.setEndpoint(task.getEndpoint.get)
+        s3client.deleteObject(new DeleteObjectRequest(s3Url.bucket, s3Url.key))
+      case _ =>
+    }
+  }
+
+  def isDeleteTarget(schema: String): Boolean = schema match {
+    case "file" => true
+    case "s3" | "s3n" | "s3a" => true
+    case _ => false
+  }
+
+  def getSchema(fpath: String): String = {
+    val schema = fpath.split("://").toList.head
+    schema match {
+      case "s3" | "s3a" | "s3n" => schema
+      case _ => {
+        val path = Paths.get(fpath)
+        path.getFileSystem.provider.getScheme
+      }
+    }
+  }
+
+  def parseS3Url(s3url: String): AmazonS3URILikeObject = {
+    val parts = s3url.split("(://|/)").toList
+    val bucket = parts.apply(1)
+    val key = parts.slice(2, parts.size).mkString("/")
+    OrcOutputPluginHelper.AmazonS3URILikeObject(bucket, key)
+  }
+
+  case class AmazonS3URILikeObject(@BeanProperty bucket: String, @BeanProperty key: String)
+
+}
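To make the helper's URL handling above concrete, here is a small usage sketch (bucket and key are hypothetical; the expected values follow directly from the split("(://|/)") logic):

    val s3Url = OrcOutputPluginHelper.parseS3Url("s3://my-bucket/warehouse/out.000.orc")
    println(s3Url.bucket) // my-bucket
    println(s3Url.key)    // warehouse/out.000.orc

    // Non-S3 paths fall through to the filesystem provider, so a local path reports the "file" scheme.
    println(OrcOutputPluginHelper.getSchema("/tmp/out.000.orc")) // file
    println(OrcOutputPluginHelper.isDeleteTarget("hdfs"))        // false -> HDFS files are left untouched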
data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala
ADDED
@@ -0,0 +1,52 @@
+package org.embulk.output.orc
+
+import java.io.IOException
+
+import org.apache.orc.Writer
+import org.embulk.config.TaskReport
+import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
+
+class OrcTransactionalPageOutput(val reader: PageReader, val writer: Writer, val task: PluginTask) extends TransactionalPageOutput {
+  override def add(page: Page): Unit = synchronized {
+    try {
+      // int size = page.getStringReferences().size();
+      val schema = OrcOutputPlugin.getSchema(reader.getSchema)
+      val batch = schema.createRowBatch
+      // batch.size = size;
+      reader.setPage(page)
+      while ( {
+        reader.nextRecord
+      }) {
+        val row = {
+          batch.size += 1;
+          batch.size - 1
+        }
+        reader.getSchema.visitColumns(new OrcColumnVisitor(reader, batch, row))
+        if (batch.size >= batch.getMaxSize) {
+          writer.addRowBatch(batch)
+          batch.reset()
+        }
+      }
+      if (batch.size != 0) {
+        writer.addRowBatch(batch)
+        batch.reset()
+      }
+    } catch {
+      case e: IOException =>
+        e.printStackTrace()
+    }
+  }
+
+  override def finish(): Unit = {
+    try writer.close()
+    catch {
+      case e: IOException => throw e
+    }
+  }
+
+  override def close(): Unit = {}
+
+  override def abort(): Unit = {}
+
+  override def commit: TaskReport = Exec.newTaskReport
+}
data/src/main/scala/org/embulk/output/orc/PluginTask.scala
ADDED
@@ -0,0 +1,56 @@
+package org.embulk.output.orc
+
+import java.util
+
+import com.google.common.base.Optional
+import org.embulk.config.{Config, ConfigDefault, Task}
+import org.embulk.spi.time.TimestampFormatter
+import org.embulk.util.aws.credentials.AwsCredentialsTask
+import org.joda.time.DateTimeZone
+
+trait PluginTask extends Task with TimestampFormatter.Task with AwsCredentialsTask {
+  @Config("path_prefix")
+  def getPathPrefix: String
+
+  @Config("file_ext")
+  @ConfigDefault("\".orc\"")
+  def getFileNameExtension: String
+
+  @Config("column_options")
+  @ConfigDefault("{}")
+  def getColumnOptions: util.Map[String, TimestampColumnOption]
+
+  @Config("sequence_format")
+  @ConfigDefault("\".%03d\"")
+  def getSequenceFormat: String
+
+  // see: https://orc.apache.org/docs/hive-config.html
+  // ORC File options
+  @Config("strip_size")
+  @ConfigDefault("67108864") // 64MB
+  def getStripSize: Integer
+
+  @Config("buffer_size")
+  @ConfigDefault("262144") // 256KB
+  def getBufferSize: Integer
+
+  @Config("block_size")
+  @ConfigDefault("268435456") // 256MB
+  def getBlockSize: Integer
+
+  @Config("compression_kind")
+  @ConfigDefault("ZLIB")
+  def getCompressionKind: String
+
+  @Config("overwrite")
+  @ConfigDefault("false")
+  def getOverwrite: Boolean
+
+  @Config("default_from_timezone")
+  @ConfigDefault("\"UTC\"")
+  def getDefaultFromTimeZone: DateTimeZone
+
+  @Config("endpoint")
+  @ConfigDefault("null")
+  def getEndpoint: Optional[String]
+}
data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala
ADDED
@@ -0,0 +1,32 @@
+package org.embulk.output.orc
+
+import java.util
+
+import com.google.common.base.Optional
+import org.embulk.config.{Config, ConfigDefault, Task}
+import org.embulk.spi.time.TimestampFormatter
+import org.joda.time.DateTimeZone
+
+/*
+public interface TimestampColumnOption
+        extends Task, TimestampFormatter.TimestampColumnOption
+{
+    @Config("from_timezone")
+    @ConfigDefault("null")
+    Optional<DateTimeZone> getFromTimeZone();
+
+    @Config("from_format")
+    @ConfigDefault("null")
+    Optional<List<String>> getFromFormat();
+}
+ */
+
+trait TimestampColumnOption extends Task with TimestampFormatter.TimestampColumnOption {
+  @Config("from_timezone")
+  @ConfigDefault("null")
+  def getFromTimeZone: Optional[DateTimeZone]
+
+  @Config("from_format")
+  @ConfigDefault("null")
+  def getFromFormat: Optional[util.List[String]]
+}
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-orc
 version: !ruby/object:Gem::Version
-  version: 0.3.
+  version: 0.3.5
 platform: ruby
 authors:
 - yuokada
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2020-08-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -45,13 +45,12 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".github/workflows/gradle.yml"
 - ".gitignore"
-- ".travis.yml"
 - LICENSE.txt
 - README.md
 - build.gradle
 - classpath/aircompressor-0.10.jar
-- classpath/animal-sniffer-annotations-1.14.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
 - classpath/api-asn1-api-1.0.0-M20.jar
@@ -62,7 +61,6 @@ files:
 - classpath/aws-java-sdk-core-1.10.33.jar
 - classpath/aws-java-sdk-kms-1.10.33.jar
 - classpath/aws-java-sdk-s3-1.10.33.jar
-- classpath/checker-compat-qual-2.0.0.jar
 - classpath/commons-beanutils-1.7.0.jar
 - classpath/commons-cli-1.2.jar
 - classpath/commons-codec-1.6.jar
@@ -80,11 +78,9 @@ files:
 - classpath/curator-client-2.7.1.jar
 - classpath/curator-framework-2.7.1.jar
 - classpath/curator-recipes-2.7.1.jar
-- classpath/embulk-output-orc-0.3.
+- classpath/embulk-output-orc-0.3.5.jar
 - classpath/embulk-util-aws-credentials-0.2.8.jar
-- classpath/error_prone_annotations-2.1.3.jar
 - classpath/gson-2.2.4.jar
-- classpath/guava-24.1-jre.jar
 - classpath/hadoop-annotations-2.7.5.jar
 - classpath/hadoop-auth-2.7.5.jar
 - classpath/hadoop-aws-2.7.5.jar
@@ -94,7 +90,6 @@ files:
 - classpath/htrace-core-3.1.0-incubating.jar
 - classpath/httpclient-4.3.6.jar
 - classpath/httpcore-4.3.3.jar
-- classpath/j2objc-annotations-1.1.jar
 - classpath/jackson-core-asl-1.9.13.jar
 - classpath/jackson-jaxrs-1.8.3.jar
 - classpath/jackson-mapper-asl-1.9.13.jar
@@ -123,6 +118,7 @@ files:
 - classpath/orc-shims-1.5.4.jar
 - classpath/paranamer-2.3.jar
 - classpath/protobuf-java-2.5.0.jar
+- classpath/scala-library-2.12.12.jar
 - classpath/servlet-api-2.5-20081211.jar
 - classpath/servlet-api-2.5.jar
 - classpath/snappy-java-1.0.4.1.jar
@@ -139,11 +135,12 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/output/orc.rb
-- src/main/
-- src/main/
-- src/main/
-- src/main/
-- src/main/
+- src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
+- src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
+- src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala
+- src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala
+- src/main/scala/org/embulk/output/orc/PluginTask.scala
+- src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala
 - src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java
 - src/test/resources/example-null.yml
 - src/test/resources/example.yml
data/.travis.yml
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
language: java
|
2
|
-
jdk:
|
3
|
-
- oraclejdk8
|
4
|
-
|
5
|
-
cache:
|
6
|
-
directories: # run "travis cache --delete" to delete caches
|
7
|
-
- $HOME/.gradle
|
8
|
-
|
9
|
-
sudo: false
|
10
|
-
script:
|
11
|
-
- ./gradlew --info checkstyle
|
12
|
-
- ./gradlew --info check
|
13
|
-
|
14
|
-
after_success: ./gradlew sonarqube -Dsonar.organization=yuokada-github -Dsonar.host.url=https://sonarcloud.io -Dsonar.login=${SONAR_TOKEN}
|
data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
DELETED
@@ -1,101 +0,0 @@
-package org.embulk.output.orc;
-
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.embulk.spi.Column;
-import org.embulk.spi.ColumnVisitor;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.time.Timestamp;
-
-import java.nio.charset.StandardCharsets;
-
-public class OrcColumnVisitor
-        implements ColumnVisitor
-{
-    private final PageReader reader;
-    private final VectorizedRowBatch batch;
-    private final Integer i;
-
-    public OrcColumnVisitor(PageReader pageReader, VectorizedRowBatch rowBatch, Integer i)
-    {
-        this.reader = pageReader;
-        this.batch = rowBatch;
-        this.i = i;
-    }
-
-    @Override
-    public void booleanColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            batch.cols[column.getIndex()].noNulls = false;
-            batch.cols[column.getIndex()].isNull[i] = true;
-        }
-        else {
-            if (reader.getBoolean(column)) {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 1;
-            }
-            else {
-                ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = 0;
-            }
-        }
-    }
-
-    @Override
-    public void longColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            batch.cols[column.getIndex()].noNulls = false;
-            batch.cols[column.getIndex()].isNull[i] = true;
-        }
-        else {
-            ((LongColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getLong(column);
-        }
-    }
-
-    @Override
-    public void doubleColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            batch.cols[column.getIndex()].noNulls = false;
-            batch.cols[column.getIndex()].isNull[i] = true;
-        }
-        else {
-            ((DoubleColumnVector) batch.cols[column.getIndex()]).vector[i] = reader.getDouble(column);
-        }
-    }
-
-    @Override
-    public void stringColumn(Column column)
-    {
-        if (!reader.isNull(column)) {
-            ((BytesColumnVector) batch.cols[column.getIndex()])
-                    .setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8));
-        }
-        else {
-            batch.cols[column.getIndex()].noNulls = false;
-            batch.cols[column.getIndex()].isNull[i] = true;
-        }
-    }
-
-    @Override
-    public void timestampColumn(Column column)
-    {
-        if (reader.isNull(column)) {
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).setNullValue(i);
-        }
-        else {
-            Timestamp timestamp = reader.getTimestamp(column);
-            java.sql.Timestamp ts = new java.sql.Timestamp(timestamp.getEpochSecond() * 1000);
-            ((TimestampColumnVector) batch.cols[column.getIndex()]).set(i, ts);
-        }
-    }
-
-    @Override
-    public void jsonColumn(Column column)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support json type");
-    }
-}
data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
DELETED
@@ -1,298 +0,0 @@
-package org.embulk.output.orc;
-
-import com.amazonaws.auth.AWSCredentials;
-import com.google.common.base.Throwables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.util.VersionInfo;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.MemoryManager;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.embulk.config.ConfigDiff;
-import org.embulk.config.ConfigSource;
-import org.embulk.config.TaskReport;
-import org.embulk.config.TaskSource;
-import org.embulk.spi.Column;
-import org.embulk.spi.Exec;
-import org.embulk.spi.OutputPlugin;
-import org.embulk.spi.Page;
-import org.embulk.spi.PageReader;
-import org.embulk.spi.Schema;
-import org.embulk.spi.TransactionalPageOutput;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.spi.type.Type;
-import org.embulk.spi.util.Timestamps;
-import org.embulk.util.aws.credentials.AwsCredentials;
-
-import java.io.IOException;
-import java.util.List;
-
-public class OrcOutputPlugin
-        implements OutputPlugin
-{
-    @Override
-    public ConfigDiff transaction(ConfigSource config,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // retryable (idempotent) output:
-        // return resume(task.dump(), schema, taskCount, control);
-
-        // non-retryable (non-idempotent) output:
-        control.run(task.dump());
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-            Schema schema, int taskCount,
-            OutputPlugin.Control control)
-    {
-        throw new UnsupportedOperationException("orc output plugin does not support resuming");
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-            Schema schema, int taskCount,
-            List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalPageOutput open(TaskSource taskSource, Schema schema, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        if (task.getOverwrite()) {
-            AWSCredentials credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials();
-            OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task);
-        }
-
-        final PageReader reader = new PageReader(schema);
-        Writer writer = createWriter(task, schema, taskIndex);
-
-        return new OrcTransactionalPageOutput(reader, writer, task);
-    }
-
-    private String buildPath(PluginTask task, int processorIndex)
-    {
-        final String pathPrefix = task.getPathPrefix();
-        final String pathSuffix = task.getFileNameExtension();
-        final String sequenceFormat = task.getSequenceFormat();
-        return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
-    }
-
-    private TypeDescription getSchema(Schema schema)
-    {
-        TypeDescription oschema = TypeDescription.createStruct();
-        for (int i = 0; i < schema.size(); i++) {
-            Column column = schema.getColumn(i);
-            Type type = column.getType();
-            switch (type.getName()) {
-                case "long":
-                    oschema.addField(column.getName(), TypeDescription.createLong());
-                    break;
-                case "double":
-                    oschema.addField(column.getName(), TypeDescription.createDouble());
-                    break;
-                case "boolean":
-                    oschema.addField(column.getName(), TypeDescription.createBoolean());
-                    break;
-                case "string":
-                    oschema.addField(column.getName(), TypeDescription.createString());
-                    break;
-                case "timestamp":
-                    oschema.addField(column.getName(), TypeDescription.createTimestamp());
-                    break;
-                default:
-                    System.out.println("Unsupported type");
-                    break;
-            }
-        }
-        return oschema;
-    }
-
-    private Configuration getHadoopConfiguration(PluginTask task)
-    {
-        Configuration conf = new Configuration();
-
-        // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
-        conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
-        conf.set("fs.file.impl", LocalFileSystem.class.getName());
-        // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
-
-        AwsCredentials.getAWSCredentialsProvider(task);
-        if (task.getAccessKeyId().isPresent()) {
-            conf.set("fs.s3a.access.key", task.getAccessKeyId().get());
-            conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId().get());
-        }
-        if (task.getSecretAccessKey().isPresent()) {
-            conf.set("fs.s3a.secret.key", task.getSecretAccessKey().get());
-            conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey().get());
-        }
-        if (task.getEndpoint().isPresent()) {
-            conf.set("fs.s3a.endpoint", task.getEndpoint().get());
-            conf.set("fs.s3n.endpoint", task.getEndpoint().get());
-        }
-        return conf;
-    }
-
-    private Writer createWriter(PluginTask task, Schema schema, int processorIndex)
-    {
-        final TimestampFormatter[] timestampFormatters = Timestamps
-                .newTimestampColumnFormatters(task, schema, task.getColumnOptions());
-
-        Configuration conf = getHadoopConfiguration(task);
-        TypeDescription oschema = getSchema(schema);
-
-        // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
-        Thread.currentThread().setContextClassLoader(VersionInfo.class.getClassLoader());
-
-        Writer writer = null;
-        try {
-            // Make writerOptions
-            OrcFile.WriterOptions writerOptions = createWriterOptions(task, conf);
-            // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
-            // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
-            writer = OrcFile.createWriter(
-                    new Path(buildPath(task, processorIndex)),
-                    writerOptions.setSchema(oschema)
-                            .memory(new WriterLocalMemoryManager())
-                            .version(OrcFile.Version.V_0_12)
-            );
-        }
-        catch (IOException e) {
-            Throwables.propagate(e);
-        }
-        return writer;
-    }
-
-    private OrcFile.WriterOptions createWriterOptions(PluginTask task, Configuration conf)
-    {
-        final Integer bufferSize = task.getBufferSize();
-        final Integer stripSize = task.getStripSize();
-        final Integer blockSize = task.getBlockSize();
-        final String kindString = task.getCompressionKind();
-        CompressionKind kind = CompressionKind.valueOf(kindString);
-        return OrcFile.writerOptions(conf)
-                .bufferSize(bufferSize)
-                .blockSize(blockSize)
-                .stripeSize(stripSize)
-                .compress(kind);
-    }
-
-    class OrcTransactionalPageOutput
-            implements TransactionalPageOutput
-    {
-        private final PageReader reader;
-        private final Writer writer;
-
-        public OrcTransactionalPageOutput(PageReader reader, Writer writer, PluginTask task)
-        {
-            this.reader = reader;
-            this.writer = writer;
-        }
-
-        @Override
-        public void add(Page page)
-        {
-            try {
-                // int size = page.getStringReferences().size();
-                final TypeDescription schema = getSchema(reader.getSchema());
-                final VectorizedRowBatch batch = schema.createRowBatch();
-                // batch.size = size;
-
-                reader.setPage(page);
-                while (reader.nextRecord()) {
-                    final int row = batch.size++;
-                    reader.getSchema().visitColumns(
-                            new OrcColumnVisitor(reader, batch, row)
-                    );
-                    if (batch.size >= batch.getMaxSize()) {
-                        writer.addRowBatch(batch);
-                        batch.reset();
-                    }
-                }
-                if (batch.size != 0) {
-                    writer.addRowBatch(batch);
-                    batch.reset();
-                }
-            }
-            catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-
-        @Override
-        public void finish()
-        {
-            try {
-                writer.close();
-            }
-            catch (IOException e) {
-                Throwables.propagate(e);
-            }
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-
-    // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
-    // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
-    // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
-    // notifies checkMemory() only to that instance.
-    private static class WriterLocalMemoryManager implements MemoryManager
-    {
-        final long rowsBetweenChecks = 10000;
-
-        private int rowsAddedSinceCheck = 0;
-        Callback boundCallback = null;
-
-        @Override
-        public void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException
-        {
-            if (boundCallback != null) {
-                throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.");
-            }
-
-            boundCallback = callback;
-        }
-
-        @Override
-        public void removeWriter(Path path) throws IOException
-        {
-            boundCallback = null;
-        }
-
-        @Override
-        public void addedRow(int rows) throws IOException
-        {
-            rowsAddedSinceCheck += rows;
-            if (rowsAddedSinceCheck > rowsBetweenChecks) {
-                boundCallback.checkMemory(1);
-                rowsAddedSinceCheck = 0;
-            }
-        }
-    }
-}
data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
DELETED
@@ -1,111 +0,0 @@
-package org.embulk.output.orc;
-
-import com.amazonaws.auth.profile.ProfileCredentialsProvider;
-import com.amazonaws.services.s3.AmazonS3;
-import com.amazonaws.services.s3.AmazonS3Client;
-import com.amazonaws.services.s3.model.DeleteObjectRequest;
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.base.Throwables;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Arrays;
-import java.util.List;
-
-class OrcOutputPluginHelper
-{
-    protected OrcOutputPluginHelper()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    static void removeOldFile(String fpath, PluginTask task)
-    {
-        // NOTE: Delete a file if local-filesystem, not HDFS or S3.
-        String schema = getSchema(fpath);
-        if (isDeleteTarget(schema)) {
-            switch (schema) {
-                case "file":
-                    try {
-                        Files.deleteIfExists(Paths.get(fpath));
-                    }
-                    catch (IOException e) {
-                        Throwables.propagate(e);
-                    }
-                    break;
-                case "s3":
-                case "s3n":
-                case "s3a":
-                    AmazonS3URILikeObject s3Url = parseS3Url(fpath);
-                    AmazonS3 s3client = new AmazonS3Client(new ProfileCredentialsProvider());
-                    if (task.getEndpoint().isPresent()) {
-                        s3client.setEndpoint(task.getEndpoint().get());
-                    }
-                    s3client.deleteObject(new DeleteObjectRequest(s3Url.getBucket(), s3Url.getKey()));
-                default:
-                    // TODO: Unsupported
-            }
-        }
-    }
-
-    public static boolean isDeleteTarget(String schema)
-    {
-        switch (schema) {
-            case "file":
-                return true;
-            case "s3":
-            case "s3a":
-            case "s3n":
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    static String getSchema(String fpath)
-    {
-        String schema = Splitter.on("://")
-                .splitToList(fpath).get(0);
-        if (schema.equals("s3a") || schema.equals("s3n") || schema.equals("s3")) {
-            return schema;
-        }
-        else {
-            Path path = Paths.get(fpath);
-            return path.getFileSystem().provider().getScheme();
-        }
-    }
-
-    static AmazonS3URILikeObject parseS3Url(String s3url)
-    {
-        List<String> parts = Arrays.asList(
-                s3url.split("(://|/)"));
-        String bucket = parts.get(1);
-        String key = Joiner.on("/").join(parts.subList(2, parts.size()));
-        return new AmazonS3URILikeObject(bucket, key);
-    }
-
-    static class AmazonS3URILikeObject
-    {
-        String bucket;
-        String key;
-
-        public AmazonS3URILikeObject(String bucket, String key)
-        {
-            this.bucket = bucket;
-            this.key = key;
-        }
-
-        public String getBucket()
-        {
-            return bucket;
-        }
-
-        public String getKey()
-        {
-            return key;
-        }
-    }
-}
data/src/main/java/org/embulk/output/orc/PluginTask.java
DELETED
@@ -1,60 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.embulk.util.aws.credentials.AwsCredentialsTask;
-import org.joda.time.DateTimeZone;
-
-import java.util.Map;
-
-public interface PluginTask
-        extends Task, TimestampFormatter.Task, AwsCredentialsTask
-{
-    @Config("path_prefix")
-    String getPathPrefix();
-
-    @Config("file_ext")
-    @ConfigDefault("\".orc\"")
-    String getFileNameExtension();
-
-    @Config("column_options")
-    @ConfigDefault("{}")
-    Map<String, TimestampColumnOption> getColumnOptions();
-
-    @Config("sequence_format")
-    @ConfigDefault("\".%03d\"")
-    String getSequenceFormat();
-
-    // see: https://orc.apache.org/docs/hive-config.html
-    // ORC File options
-    @Config("strip_size")
-    @ConfigDefault("67108864") // 64MB
-    Integer getStripSize();
-
-    @Config("buffer_size")
-    @ConfigDefault("262144") // 256KB
-    Integer getBufferSize();
-
-    @Config("block_size")
-    @ConfigDefault("268435456") // 256MB
-    Integer getBlockSize();
-
-    @Config("compression_kind")
-    @ConfigDefault("ZLIB")
-    public String getCompressionKind();
-
-    @Config("overwrite")
-    @ConfigDefault("false")
-    boolean getOverwrite();
-
-    @Config("default_from_timezone")
-    @ConfigDefault("\"UTC\"")
-    DateTimeZone getDefaultFromTimeZone();
-
-    @Config("endpoint")
-    @ConfigDefault("null")
-    Optional<String> getEndpoint();
-}
data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java
DELETED
@@ -1,22 +0,0 @@
-package org.embulk.output.orc;
-
-import com.google.common.base.Optional;
-import org.embulk.config.Config;
-import org.embulk.config.ConfigDefault;
-import org.embulk.config.Task;
-import org.embulk.spi.time.TimestampFormatter;
-import org.joda.time.DateTimeZone;
-
-import java.util.List;
-
-public interface TimestampColumnOption
-        extends Task, TimestampFormatter.TimestampColumnOption
-{
-    @Config("from_timezone")
-    @ConfigDefault("null")
-    Optional<DateTimeZone> getFromTimeZone();
-
-    @Config("from_format")
-    @ConfigDefault("null")
-    Optional<List<String>> getFromFormat();
-}