embulk-output-s3_parquet 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 69eeaa8791df4a9dce1d4746d881805e7f8c2ea4
+  data.tar.gz: 322f28022072631766fb7f862b4465f04f8f0745
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6cfbe96838e1960f5097ee9c33f78a2d02f111c9b06014954f18b7cebf97b89d265b22affd755bd1318b4a6a9e9953599aeaa013cde8bc3a7e5d91264abeed71
+  data.tar.gz: e5eac48dd2822412acff3d0612cff714d77fa9a15d8fa33a27b8d3c668f226eb1fbbdbfd4dbf6649ebc6667fcc01273d4da7a896987537d0e2fa0ca654dbbaed
data/.github/workflows/release.yml
CHANGED
@@ -25,6 +25,9 @@ jobs:
       uses: actions/setup-java@v1
       with:
         java-version: 1.8
+    - name: scalafmt
+      if: github.event.pull_request.merged == true
+      run: ./gradlew spotlessCheck
     - name: Test with Gradle
       if: github.event.pull_request.merged == true
       run: ./gradlew test
data/.github/workflows/test.yml
CHANGED
data/.scalafmt.conf
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+0.2.0 (2020-03-10)
+==================
+
+* [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader
+* [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0
+    * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md)
+    * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110)
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer}
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts.
+
 0.1.0 (2019-11-17)
 ==================
 
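The 0.2.0 entry above references PR #23, which limits where the plugin swaps the thread's ContextClassLoader; the new helper, data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala, is collapsed in this diff. For orientation only, here is a minimal, self-contained sketch of the swap-and-restore pattern such a helper typically wraps around Hadoop/Parquet calls. This is an assumed illustration, not the plugin's actual code.

```scala
// Assumed sketch of the ContextClassLoader-swapping pattern (not the plugin's code).
object ContextClassLoaderSwapperSketch {

  // Runs `f` with the thread's context ClassLoader temporarily replaced by the
  // ClassLoader that loaded `runtimeClass`, restoring the original afterwards.
  def usingContextClassLoaderOf[A](runtimeClass: Class[_])(f: => A): A = {
    val thread = Thread.currentThread()
    val original = thread.getContextClassLoader
    thread.setContextClassLoader(runtimeClass.getClassLoader)
    try f
    finally thread.setContextClassLoader(original)
  }
}
```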
data/README.md
CHANGED
@@ -131,6 +131,8 @@ out:
 ### Run example:
 
 ```shell
+$ ./run_s3_local.sh
+$ ./example/prepare_s3_bucket.sh
 $ ./gradlew classpath
 $ embulk run example/config.yml -Ilib
 ```
@@ -138,8 +140,7 @@ $ embulk run example/config.yml -Ilib
 ### Run test:
 
 ```shell
-
-$ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
+$ ./run_s3_local.sh
 $ ./gradlew test
 ```
 
data/build.gradle
CHANGED
@@ -3,6 +3,7 @@ plugins {
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
+    id "com.diffplug.gradle.spotless" version "3.27.1"
 }
 import com.github.jrubygradle.JRubyExec
 repositories {
@@ -13,29 +14,32 @@ configurations {
     provided
 }
 
-version = "0.
+version = "0.2.0"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
 
     compile 'org.scala-lang:scala-library:2.13.1'
     ['glue', 's3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.739"
     }
-    ['column', 'common', 'encoding', '
-        compile "org.apache.parquet:parquet-${v}:1.
+    ['column', 'common', 'encoding', 'hadoop', 'jackson'].each { v ->
+        compile "org.apache.parquet:parquet-${v}:1.11.0"
     }
+    // ref. https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/pom.xml#L85
+    compile 'org.apache.parquet:parquet-format:2.7.0'
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
     compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
+    ['test', 'standards', 'deps-buffer', 'deps-config'].each { v ->
+        testCompile "org.embulk:embulk-${v}:0.9.23"
+    }
     testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-    testCompile 'org.
-    testCompile 'org.embulk:embulk-standards:0.9.20'
-    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
+    testCompile 'org.apache.parquet:parquet-tools:1.11.0'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
 
@@ -43,6 +47,12 @@ testlogger {
     theme "mocha"
 }
 
+spotless {
+    scala {
+        scalafmt('2.3.2').configFile('.scalafmt.conf')
+    }
+}
+
 task classpath(type: Copy, dependsOn: ["jar"]) {
     doFirst { file("classpath").deleteDir() }
     from (configurations.runtime - configurations.provided + files(jar.archivePath))
data/example/config.yml
CHANGED
data/example/with_catalog.yml
CHANGED
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-6.1-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
@@ -109,8 +125,8 @@ if $darwin; then
     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
 fi
 
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
     JAVACMD=`cygpath --unix "$JAVACMD"`
@@ -138,19 +154,19 @@ if $cygwin ; then
         else
             eval `echo args$i`="\"$arg\""
         fi
-        i
+        i=`expr $i + 1`
     done
     case $i in
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
 fi
 
@@ -159,14 +175,9 @@ save () {
     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
     echo " "
 }
-APP_ARGS
+APP_ARGS=`save "$@"`
 
 # Collect all arguments for the java command, following the shell quoting and substitution rules
 eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
 
-# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
-if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
-  cd "$(dirname "$0")"
-fi
-
 exec "$JAVACMD" "$@"
data/gradlew.bat
CHANGED
@@ -1,3 +1,19 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem      https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
 @if "%DEBUG%" == "" @echo off
 @rem ##########################################################################
 @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
 @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
 
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
data/run_s3_local.sh
ADDED
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
CHANGED
@@ -1,202 +1,250 @@
 package org.embulk.output.s3_parquet
 
 import java.util.{Optional, Map => JMap}
 
-import com.amazonaws.services.glue.model.{
+import com.amazonaws.services.glue.model.{
+  Column,
+  CreateTableRequest,
+  DeleteTableRequest,
+  GetTableRequest,
+  SerDeInfo,
+  StorageDescriptor,
+  TableInput
+}
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.embulk.config.{Config, ConfigDefault, ConfigException}
 import org.embulk.output.s3_parquet.aws.Aws
 import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
 import org.embulk.spi.Schema
-import org.embulk.spi.`type`.{
+import org.embulk.spi.`type`.{
+  BooleanType,
+  DoubleType,
+  JsonType,
+  LongType,
+  StringType,
+  TimestampType,
+  Type
+}
 import org.slf4j.{Logger, LoggerFactory}
 
 import scala.jdk.CollectionConverters._
 import scala.util.Try
 
-{
+object CatalogRegistrator {
+
+  trait Task extends org.embulk.config.Task {
+
+    @Config("catalog_id")
+    @ConfigDefault("null")
+    def getCatalogId: Optional[String]
+
+    @Config("database")
+    def getDatabase: String
+
+    @Config("table")
+    def getTable: String
+
+    @Config("column_options")
+    @ConfigDefault("{}")
+    def getColumnOptions: JMap[String, ColumnOptions]
+
+    @Config("operation_if_exists")
+    @ConfigDefault("\"delete\"")
+    def getOperationIfExists: String
+  }
+
+  trait ColumnOptions {
+
+    @Config("type")
+    def getType: String
+  }
+
+  def apply(
+      aws: Aws,
+      task: Task,
+      schema: Schema,
+      location: String,
+      compressionCodec: CompressionCodecName,
+      loggerOption: Option[Logger] = None,
+      parquetColumnLogicalTypes: Map[String, String] = Map.empty
+  ): CatalogRegistrator = {
+    new CatalogRegistrator(
+      aws,
+      task,
+      schema,
+      location,
+      compressionCodec,
+      loggerOption,
+      parquetColumnLogicalTypes
+    )
+  }
 }
 
-class CatalogRegistrator(
+class CatalogRegistrator(
+    aws: Aws,
+    task: CatalogRegistrator.Task,
+    schema: Schema,
+    location: String,
+    compressionCodec: CompressionCodecName,
+    loggerOption: Option[Logger] = None,
+    parquetColumnLogicalTypes: Map[String, String] = Map.empty
+) {
+
+  val logger: Logger =
+    loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+  def run(): Unit = {
+    if (doesTableExists()) {
+      task.getOperationIfExists match {
+        case "skip" =>
+          logger.info(
+            s"Skip to register the table: ${task.getDatabase}.${task.getTable}"
+          )
+          return
+
+        case "delete" =>
+          logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+          deleteTable()
+
+        case unknown =>
+          throw new ConfigException(s"Unsupported operation: $unknown")
+      }
     }
+    registerNewParquetTable()
+    showNewTableInfo()
+  }
+
+  def showNewTableInfo(): Unit = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    val t = aws.withGlue(_.getTable(req)).getTable
+    logger.info(s"Created a table: ${t.toString}")
+  }
+
+  def doesTableExists(): Boolean = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    Try(aws.withGlue(_.getTable(req))).isSuccess
+  }
+
+  def deleteTable(): Unit = {
+    val req = new DeleteTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+    aws.withGlue(_.deleteTable(req))
+  }
+
+  def registerNewParquetTable(): Unit = {
+    logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+    val req = new CreateTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setTableInput(
+      new TableInput()
+        .withName(task.getTable)
+        .withDescription("Created by embulk-output-s3_parquet")
+        .withTableType("EXTERNAL_TABLE")
+        .withParameters(
+          Map(
+            "EXTERNAL" -> "TRUE",
+            "classification" -> "parquet",
+            "parquet.compression" -> compressionCodec.name()
+          ).asJava
+        )
+        .withStorageDescriptor(
+          new StorageDescriptor()
+            .withColumns(getGlueSchema: _*)
+            .withLocation(location)
+            .withCompressed(isCompressed)
+            .withInputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+            )
+            .withOutputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+            )
+            .withSerdeInfo(
+              new SerDeInfo()
+                .withSerializationLibrary(
+                  "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+                )
+                .withParameters(Map("serialization.format" -> "1").asJava)
+            )
+        )
+    )
+    aws.withGlue(_.createTable(req))
+  }
+
+  private def getGlueSchema: Seq[Column] = {
+    val columnOptions: Map[String, ColumnOptions] =
+      task.getColumnOptions.asScala.toMap
+    schema.getColumns.asScala.toSeq.map { c =>
+      val cType: String =
+        if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+        else if (parquetColumnLogicalTypes.contains(c.getName))
+          convertParquetLogicalTypeToGlueType(
+            parquetColumnLogicalTypes(c.getName)
+          )
+        else convertEmbulkTypeToGlueType(c.getType)
+      new Column()
+        .withName(c.getName)
+        .withType(cType)
     }
-        .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
-        .withSerdeInfo(new SerDeInfo()
-          .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
-          .withParameters(Map("serialization.format" -> "1").asJava)
-        )
-      )
-    )
-    aws.withGlue(_.createTable(req))
+  }
+
+  private def convertParquetLogicalTypeToGlueType(t: String): String = {
+    t match {
+      case "timestamp-millis" => "timestamp"
+      case "timestamp-micros" =>
+        "bigint" // Glue cannot recognize timestamp-micros.
+      case "int8" => "tinyint"
+      case "int16" => "smallint"
+      case "int32" => "int"
+      case "int64" => "bigint"
+      case "uint8" =>
+        "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+      case "uint16" =>
+        "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+      case "uint32" =>
+        "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
+      case "uint64" =>
+        throw new ConfigException(
+          "Cannot convert uint64 to Glue data types automatically" +
+            " because the Glue bigint supports a 64-bit signed integer." +
+            " Please use `catalog.column_options` to define the type."
+        )
+      case "json" => "string"
+      case _ =>
+        throw new ConfigException(
+          s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type."
+        )
     }
 
+  }
+
+  private def convertEmbulkTypeToGlueType(t: Type): String = {
+    t match {
+      case _: BooleanType => "boolean"
+      case _: LongType => "bigint"
+      case _: DoubleType => "double"
+      case _: StringType => "string"
+      case _: TimestampType => "string"
+      case _: JsonType => "string"
+      case unknown =>
+        throw new ConfigException(
+          s"Unsupported embulk type: ${unknown.getName}"
+        )
     }
+  }
 
-      case "timestamp-millis" => "timestamp"
-      case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
-      case "int8" => "tinyint"
-      case "int16" => "smallint"
-      case "int32" => "int"
-      case "int64" => "bigint"
-      case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
-      case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
-      case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
-      case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
-        " because the Glue bigint supports a 64-bit signed integer." +
-        " Please use `catalog.column_options` to define the type.")
-      case "json" => "string"
-      case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
-    }
-
-  }
-
-  private def convertEmbulkTypeToGlueType(t: Type): String =
-  {
-    t match {
-      case _: BooleanType => "boolean"
-      case _: LongType => "bigint"
-      case _: DoubleType => "double"
-      case _: StringType => "string"
-      case _: TimestampType => "string"
-      case _: JsonType => "string"
-      case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
-    }
-  }
-
-  private def isCompressed: Boolean =
-  {
-    !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
-  }
+  private def isCompressed: Boolean = {
+    !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+  }
 
 }