embulk-output-orc 0.3.0 → 0.3.5
- checksums.yaml +4 -4
- data/.github/workflows/gradle.yml +25 -0
- data/README.md +30 -9
- data/build.gradle +34 -15
- data/example/example.yml +4 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +17 -1
- data/gradlew.bat +17 -1
- data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala +42 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala +156 -0
- data/src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala +57 -0
- data/src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala +52 -0
- data/src/main/scala/org/embulk/output/orc/PluginTask.scala +56 -0
- data/src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala +32 -0
- data/src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java +71 -0
- data/src/test/resources/example-null.yml +25 -0
- data/src/test/resources/example.yml +25 -0
- metadata +45 -42
- data/.travis.yml +0 -14
- data/src/main/java/org/embulk/output/orc/OrcColumnVisitor.java +0 -82
- data/src/main/java/org/embulk/output/orc/OrcOutputPlugin.java +0 -249
- data/src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java +0 -28
- data/src/main/java/org/embulk/output/orc/PluginTask.java +0 -60
- data/src/main/java/org/embulk/output/orc/TimestampColumnOption.java +0 -22
- data/src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6ecb39bb650455937f641f073e9e0b13338f268b
+  data.tar.gz: 393ef796dfdf47239a11186b33988466432b0d02
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee733e3cca10bfff236c7ff24d3249a1f8a30629ba314a9b840a2de4d6b552412fa1a6cb9555979dbad499259152b5a24439586105ce0417f629079b26775e9b
+  data.tar.gz: 41f8059f0af1f7eb1accccb18e33c111f7e75b7a69ffba97b26cd74de7349922ebdd852b87ea31f40c257cc57543b88cada9e32aea828f7f8d852adf20d3f328
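Note: the pre-0.3.5 digest values were truncated by the diff viewer and are left blank above. To check a downloaded release against the new digests, here is a minimal sketch in Scala (the file names are assumptions; inside a `.gem` archive the checksummed members are `metadata.gz` and `data.tar.gz`):

```scala
import java.nio.file.{Files, Paths}
import java.security.MessageDigest

// Recompute a SHA-512 digest and compare it with the value in checksums.yaml.
object ChecksumCheck extends App {
  val path = Paths.get("data.tar.gz") // hypothetical: extracted from the .gem archive
  val digest = MessageDigest.getInstance("SHA-512").digest(Files.readAllBytes(path))
  println(digest.map("%02x".format(_)).mkString) // expect the data.tar.gz SHA512 above
}
```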
data/.github/workflows/gradle.yml
ADDED
@@ -0,0 +1,25 @@
+name: Java CI
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Build with Gradle
+      run:
+        ./gradlew build
+    - name: Checkstyle & static check
+      run: |
+        ./gradlew --info checkstyle
+        ./gradlew --info check
+#    - name: Spotbugs
+#      run: |
+#        ./gradlew spotbugsMain spotbugsTest
data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Orc output plugin for Embulk
 
-[![Build Status](https://
+[![Build Status](https://github.com/yuokada/embulk-output-orc/workflows/Java%20CI/badge.svg)](https://github.com/yuokada/embulk-output-orc/actions)
 [![Gem Version](https://badge.fury.io/rb/embulk-output-orc.svg)](https://badge.fury.io/rb/embulk-output-orc)
 
 ## Overview
@@ -13,15 +13,16 @@
 ## Configuration
 
 - **path_prefix**: A prefix of output path. (string, required)
-  - support: `file`, `s3n` and `s3a`.
+  - support: `file`, `s3`, `s3n` and `s3a`.
 - **file_ext**: An extension of output file. (string, default: `.orc`)
 - **sequence_format**: (string, default: `.%03d`)
-- **buffer_size**: Set the ORC buffer size (integer, default: `262144`)
-- **strip_size**: Set the ORC strip size (integer, default: `67108864`)
-- **block_size**: Set the ORC block size (integer, default: `268435456`)
+- **buffer_size**: Set the ORC buffer size (integer, default: `262144` (256KB))
+- **strip_size**: Set the ORC strip size (integer, default: `67108864` (64MB))
+- **block_size**: Set the ORC block size (integer, default: `268435456` (256MB))
 - **compression_kind**: Compression codec for ORC files (string, default: `'ZLIB'`)
-  - `NONE`, `ZLIB`, `SNAPPY`
-- **overwrite**:
+  - `NONE`, `ZLIB`, `SNAPPY`, `LZO`, `LZ4`
+- **overwrite**: Overwrite if output files already exist. (boolean, default: `false`)
+  - Support: `LocalFileSystem`, `S3 (s3, s3a, s3n)`
 - **default_from_timezone**: Time zone of timestamp columns. This can be overwritten for each column using column_options (DateTimeZone, default: `UTC`)
 
 - **auth_method**: name of mechanism to authenticate requests (basic, env, instance, profile, properties, anonymous, or session. default: basic)
@@ -36,14 +37,34 @@
 out:
   type: orc
   path_prefix: "/tmp/output"
-  buffer_size: 8000
-  strip_size: 90000
   compression_kind: ZLIB
   overwrite: true
 ```
 
 ## ChangeLog
 
+### ver 0.3.4
+
+- Bump `orc` library to `1.5.4`
+- bugfix
+  - https://github.com/yuokada/embulk-output-orc/pull/17
+
+### ver 0.3.3
+
+- bugfix
+- Bump `orc` library to `1.4.4`
+
+### ver 0.3.2
+
+- Update `orc` libraries to `1.4.3`
+
+### ver 0.3.0
+
+- Change default values: (block_size, buffer_size, strip_size)
+
+  - default value is Hive's default value
+    (see: https://orc.apache.org/docs/hive-config.html)
+
 ### ver 0.2.0
 
 - support: output to s3
data/build.gradle
CHANGED
@@ -1,8 +1,10 @@
 plugins {
     id "com.jfrog.bintray" version "1.1"
-    id "com.github.jruby-gradle.base" version "
+    id "com.github.jruby-gradle.base" version "1.5.0"
     id "java"
+    id "scala"
     id "checkstyle"
+    // id "com.github.spotbugs" version "3.0.1"
     id "org.sonarqube" version "2.5"
 }
 import com.github.jrubygradle.JRubyExec
@@ -18,26 +20,41 @@ configurations {
     runtime.exclude group: "org.slf4j", module: "slf4j-log4j12"
 }
 
-version = "0.3.
+version = "0.3.5"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.
-    provided "org.embulk:embulk-core:0.
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
+    compile "org.scala-lang:scala-library:2.12.+"
 
-    compile "org.apache.orc:orc:1.4
-    compile "org.apache.orc:orc-core:1.4
-    compile "org.apache.hadoop:hadoop-hdfs:2.
+    compile "org.apache.orc:orc:1.5.4"
+    compile "org.apache.orc:orc-core:1.5.4"
+    compile "org.apache.hadoop:hadoop-hdfs:2.7.5"
 
     compile 'org.embulk.input.s3:embulk-util-aws-credentials:0.2.8'
     compile "com.amazonaws:aws-java-sdk-s3:1.10.33"
-    compile "org.apache.hadoop:hadoop-aws:2.7.
+    compile "org.apache.hadoop:hadoop-aws:2.7.5"
 
-    testCompile
-    testCompile "
-    testCompile
+    testCompile 'org.jmockit:jmockit:1.38'
+    // testCompile "junit:junit:4.+"
+    testCompile 'org.hamcrest:hamcrest-core:1.3'
+    testCompile 'org.testng:testng:6.14.2'
+    testCompile "org.embulk:embulk-core:0.8.39:tests"
+    testCompile "org.embulk:embulk-standards:0.8.39"
+}
+
+sourceSets {
+    main {
+        scala {
+            srcDirs = ['src/main/scala', 'src/main/java']
+        }
+        java {
+            srcDirs = []
+        }
+    }
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -65,14 +82,16 @@ task checkstyle(type: Checkstyle) {
 }
 
 task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
-    jrubyArgs "-
-    script "
+    jrubyArgs "-S"
+    script "gem"
+    scriptArgs "build", "${project.name}.gemspec"
    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
 }
 
 task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
-    jrubyArgs "-
-    script "
+    jrubyArgs "-S"
+    script "gem"
+    scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
 }
 
 task "package"(dependsOn: ["gemspec", "classpath"]) {
data/example/example.yml
CHANGED
@@ -1,7 +1,7 @@
 ---
 in:
   type: randomj
-  rows:
+  rows: 1024000
   threads: 1
   # default_timezone: Asia/Tokyo
   primary_key: myid
@@ -14,14 +14,12 @@ in:
   - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
   - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
 
-
-
-
+exec:
+  max_threads: 2      # run at most 2 tasks concurrently
+  min_output_tasks: 1 # disable page scattering
 
 out:
   type: orc
   overwrite: true
   path_prefix: "/tmp/output"
-  buffer_size: 8000
-  strip_size: 90000
   compression_kind: ZLIB
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-5.6.4-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.2.1-bin.zip
data/gradlew
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
data/gradlew.bat
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
@rem
|
2
|
+
@rem Copyright 2015 the original author or authors.
|
3
|
+
@rem
|
4
|
+
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
@rem you may not use this file except in compliance with the License.
|
6
|
+
@rem You may obtain a copy of the License at
|
7
|
+
@rem
|
8
|
+
@rem https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
@rem
|
10
|
+
@rem Unless required by applicable law or agreed to in writing, software
|
11
|
+
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
@rem See the License for the specific language governing permissions and
|
14
|
+
@rem limitations under the License.
|
15
|
+
@rem
|
16
|
+
|
1
17
|
@if "%DEBUG%" == "" @echo off
|
2
18
|
@rem ##########################################################################
|
3
19
|
@rem
|
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
|
|
14
30
|
set APP_HOME=%DIRNAME%
|
15
31
|
|
16
32
|
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
-
set DEFAULT_JVM_OPTS=
|
33
|
+
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
|
18
34
|
|
19
35
|
@rem Find java.exe
|
20
36
|
if defined JAVA_HOME goto findJavaFromJavaHome
|
data/src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
ADDED
@@ -0,0 +1,42 @@
+package org.embulk.output.orc
+
+import java.nio.charset.StandardCharsets
+
+import org.apache.hadoop.hive.ql.exec.vector._
+import org.embulk.spi.{Column, ColumnVisitor, PageReader}
+
+class OrcColumnVisitor(val reader: PageReader, val batch: VectorizedRowBatch, val i: Integer) extends ColumnVisitor {
+  override def booleanColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else if (reader.getBoolean(column)) batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 1
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = 0
+
+  override def longColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[LongColumnVector].vector(i) = reader.getLong(column)
+
+  override def doubleColumn(column: Column): Unit = if (reader.isNull(column)) {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+  else batch.cols(column.getIndex).asInstanceOf[DoubleColumnVector].vector(i) = reader.getDouble(column)
+
+  override def stringColumn(column: Column): Unit = if (!reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[BytesColumnVector].setVal(i, reader.getString(column).getBytes(StandardCharsets.UTF_8))
+  else {
+    batch.cols(column.getIndex).noNulls = false
+    batch.cols(column.getIndex).isNull(i) = true
+  }
+
+  override def timestampColumn(column: Column): Unit = if (reader.isNull(column)) batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].setNullValue(i)
+  else {
+    val timestamp = reader.getTimestamp(column)
+    val ts = new java.sql.Timestamp(timestamp.getEpochSecond * 1000)
+    batch.cols(column.getIndex).asInstanceOf[TimestampColumnVector].set(i, ts)
+  }
+
+  override def jsonColumn(column: Column) = throw new UnsupportedOperationException("orc output plugin does not support json type")
+}
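The visitor above fills exactly one row (`i`) of a `VectorizedRowBatch`, one column per callback. How it is driven is not shown in this diff (that lives in `OrcTransactionalPageOutput.scala`), but a minimal sketch of the expected loop, assuming the standard Embulk `PageReader`/`Schema` API and ORC's batch writer, looks like this:

```scala
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch
import org.apache.orc.Writer
import org.embulk.spi.{PageReader, Schema}

// Hypothetical driver loop (same package assumed): one OrcColumnVisitor per
// row, flushing the batch to the ORC writer whenever it fills up.
def writeRecords(reader: PageReader, writer: Writer, schema: Schema, batch: VectorizedRowBatch): Unit = {
  while (reader.nextRecord()) {
    schema.visitColumns(new OrcColumnVisitor(reader, batch, batch.size)) // fill row batch.size
    batch.size += 1
    if (batch.size == batch.getMaxSize) { // batch is full; hand it off
      writer.addRowBatch(batch)
      batch.reset()
    }
  }
}
```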
data/src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
ADDED
@@ -0,0 +1,156 @@
+package org.embulk.output.orc
+
+import java.io.IOException
+import java.util
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.util.VersionInfo
+import org.apache.orc.{CompressionKind, MemoryManager, OrcFile, TypeDescription, Writer}
+import org.embulk.config.{ConfigSource, TaskReport, TaskSource}
+import org.embulk.spi.util.Timestamps
+import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema}
+import org.embulk.util.aws.credentials.AwsCredentials
+
+object OrcOutputPlugin {
+  private[orc] def getSchema(schema: Schema) = {
+    val oschema = TypeDescription.createStruct
+    for (i <- 0 until schema.size) {
+      val column = schema.getColumn(i)
+      val `type` = column.getType
+      `type`.getName match {
+        case "long" =>
+          oschema.addField(column.getName, TypeDescription.createLong)
+        case "double" =>
+          oschema.addField(column.getName, TypeDescription.createDouble)
+        case "boolean" =>
+          oschema.addField(column.getName, TypeDescription.createBoolean)
+        case "string" =>
+          oschema.addField(column.getName, TypeDescription.createString)
+        case "timestamp" =>
+          oschema.addField(column.getName, TypeDescription.createTimestamp)
+        case _ =>
+          System.out.println("Unsupported type")
+      }
+    }
+    oschema
+  }
+
+  // We avoid using orc.MemoryManagerImpl since it is not threadsafe, but embulk is multi-threaded.
+  // Embulk creates and uses multiple instances of TransactionalPageOutput in worker threads.
+  // As a workaround, WriterLocalMemoryManager is bound to a single orc.Writer instance, and
+  // notifies checkMemory() only to that instance.
+  private class WriterLocalMemoryManager extends MemoryManager {
+    final private[orc] val rowsBetweenChecks = 10000
+    private var rowsAddedSinceCheck = 0
+    private[orc] var boundCallback: MemoryManager.Callback = _
+
+    @throws[IOException]
+    override def addWriter(path: Path, requestedAllocation: Long, callback: MemoryManager.Callback): Unit = {
+      if (boundCallback != null) {
+        throw new IllegalStateException("WriterLocalMemoryManager should be bound to a single orc.Writer instance.")
+      } else {
+        boundCallback = callback
+      }
+    }
+
+    @throws[IOException]
+    override def removeWriter(path: Path): Unit = boundCallback = null
+
+    @throws[IOException]
+    override def addedRow(rows: Int): Unit = {
+      rowsAddedSinceCheck += rows
+      if (rowsAddedSinceCheck > rowsBetweenChecks) {
+        boundCallback.checkMemory(1)
+        rowsAddedSinceCheck = 0
+      }
+    }
+  }
+
+}
+
+class OrcOutputPlugin extends OutputPlugin {
+  override def transaction(config: ConfigSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = {
+    val task = config.loadConfig(classOf[PluginTask])
+    // retryable (idempotent) output:
+    // return resume(task.dump(), schema, taskCount, control);
+    // non-retryable (non-idempotent) output:
+    control.run(task.dump)
+    Exec.newConfigDiff
+  }
+
+  override def resume(taskSource: TaskSource, schema: Schema, taskCount: Int, control: OutputPlugin.Control) = throw new UnsupportedOperationException("orc output plugin does not support resuming")
+
+  override def cleanup(taskSource: TaskSource, schema: Schema, taskCount: Int, successTaskReports: util.List[TaskReport]): Unit = {
+  }
+
+  override def open(taskSource: TaskSource, schema: Schema, taskIndex: Int) = {
+    val task = taskSource.loadTask(classOf[PluginTask])
+    if (task.getOverwrite) {
+      val credentials = AwsCredentials.getAWSCredentialsProvider(task).getCredentials
+      OrcOutputPluginHelper.removeOldFile(buildPath(task, taskIndex), task)
+    }
+    val reader = new PageReader(schema)
+    val writer = createWriter(task, schema, taskIndex)
+    new OrcTransactionalPageOutput(reader, writer, task)
+  }
+
+  private def buildPath(task: PluginTask, processorIndex: Int): String = {
+    val pathPrefix = task.getPathPrefix
+    val pathSuffix = task.getFileNameExtension
+    val sequenceFormat = task.getSequenceFormat
+    val fmt = java.lang.String.format(sequenceFormat, processorIndex.asInstanceOf[AnyRef])
+    pathPrefix + fmt + pathSuffix
+  }
+
+  private def getHadoopConfiguration(task: PluginTask) = {
+    val conf = new Configuration
+    // see: https://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
+    conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+    conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+    // see: https://stackoverflow.com/questions/20833444/how-to-set-objects-in-hadoop-configuration
+    AwsCredentials.getAWSCredentialsProvider(task)
+    if (task.getAccessKeyId.isPresent) {
+      conf.set("fs.s3a.access.key", task.getAccessKeyId.get)
+      conf.set("fs.s3n.awsAccessKeyId", task.getAccessKeyId.get)
+    }
+    if (task.getSecretAccessKey.isPresent) {
+      conf.set("fs.s3a.secret.key", task.getSecretAccessKey.get)
+      conf.set("fs.s3n.awsSecretAccessKey", task.getSecretAccessKey.get)
+    }
+    if (task.getEndpoint.isPresent) {
+      conf.set("fs.s3a.endpoint", task.getEndpoint.get)
+      conf.set("fs.s3n.endpoint", task.getEndpoint.get)
+    }
+    conf
+  }
+
+  private def createWriter(task: PluginTask, schema: Schema, processorIndex: Int): Writer = {
+    val timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+    val conf = getHadoopConfiguration(task)
+    val oschema = OrcOutputPlugin.getSchema(schema)
+    // see: https://groups.google.com/forum/#!topic/vertx/lLb-slzpWVg
+    Thread.currentThread.setContextClassLoader(classOf[VersionInfo].getClassLoader)
+
+    var writer: Writer = null
+    try { // Make writerOptions
+      val writerOptions = createWriterOptions(task, conf)
+      // see: https://stackoverflow.com/questions/9256733/how-to-connect-hive-in-ireport
+      // see: https://community.hortonworks.com/content/kbentry/73458/connecting-dbvisualizer-and-datagrip-to-hive-with.html
+      writer = OrcFile.createWriter(new Path(buildPath(task, processorIndex)), writerOptions.setSchema(oschema).memory(new OrcOutputPlugin.WriterLocalMemoryManager).version(OrcFile.Version.V_0_12))
+    } catch {
+      case e: IOException => throw e
+    }
+    writer
+  }
+
+  private def createWriterOptions(task: PluginTask, conf: Configuration) = {
+    val bufferSize = task.getBufferSize
+    val stripSize = task.getStripSize
+    val blockSize = task.getBlockSize
+    val kindString = task.getCompressionKind
+    val kind = CompressionKind.valueOf(kindString)
+    OrcFile.writerOptions(conf).bufferSize(bufferSize).blockSize(blockSize.toLong).stripeSize(stripSize.toLong).compress(kind)
+  }
+}