embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 69eeaa8791df4a9dce1d4746d881805e7f8c2ea4
+  data.tar.gz: 322f28022072631766fb7f862b4465f04f8f0745
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6cfbe96838e1960f5097ee9c33f78a2d02f111c9b06014954f18b7cebf97b89d265b22affd755bd1318b4a6a9e9953599aeaa013cde8bc3a7e5d91264abeed71
+  data.tar.gz: e5eac48dd2822412acff3d0612cff714d77fa9a15d8fa33a27b8d3c668f226eb1fbbdbfd4dbf6649ebc6667fcc01273d4da7a896987537d0e2fa0ca654dbbaed
data/.github/workflows/release.yml
CHANGED
@@ -25,6 +25,9 @@ jobs:
         uses: actions/setup-java@v1
         with:
           java-version: 1.8
+      - name: scalafmt
+        if: github.event.pull_request.merged == true
+        run: ./gradlew spotlessCheck
       - name: Test with Gradle
         if: github.event.pull_request.merged == true
         run: ./gradlew test
data/.github/workflows/test.yml
CHANGED
data/.scalafmt.conf
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+0.2.0 (2020-03-10)
+==================
+
+* [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader
+* [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0
+  * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md)
+  * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110)
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer}
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts.
+
 0.1.0 (2019-11-17)
 ==================
 
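The 0.2.0 release adds a new 18-line helper, data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala, referenced by the changelog entry above about limiting ContextClassLoader swapping. Its contents are not shown in this diff; the snippet below is only a minimal sketch, assuming the helper follows the common loan pattern of temporarily replacing the current thread's context ClassLoader and restoring it afterwards. The object and method names here are illustrative, not the plugin's actual API.

```scala
// Hypothetical sketch of a context-classloader swapping helper (not the
// plugin's actual code): run a block with the given ClassLoader installed
// as the thread's context ClassLoader, then always restore the original.
object ContextClassLoaderSwapperSketch {

  def usingContextClassLoader[A](classLoader: ClassLoader)(f: => A): A = {
    val thread = Thread.currentThread()
    val original = thread.getContextClassLoader
    thread.setContextClassLoader(classLoader)
    try f
    finally thread.setContextClassLoader(original)
  }

  def main(args: Array[String]): Unit = {
    // Example: evaluate a block with this object's ClassLoader as the context ClassLoader.
    val loaderName = usingContextClassLoader(getClass.getClassLoader) {
      Thread.currentThread().getContextClassLoader.toString
    }
    println(loaderName)
  }
}
```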
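The changelog also lists a bug fix (PR #24, "Use basic credentials correctly") touching AwsCredentials.scala, whose diff body is not reproduced in this section. As background only, the snippet below sketches how AWS SDK for Java v1 "basic" credentials are normally wired up, a static provider built from a configured key pair rather than the default provider chain; it is a generic illustration, not the plugin's actual implementation.

```scala
// Generic AWS SDK v1 sketch (not the plugin's code): build a credentials
// provider directly from a configured access key / secret key pair.
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}

object BasicCredentialsSketch {

  def basicProvider(
      accessKeyId: String,
      secretAccessKey: String
  ): AWSStaticCredentialsProvider =
    new AWSStaticCredentialsProvider(
      new BasicAWSCredentials(accessKeyId, secretAccessKey)
    )

  def main(args: Array[String]): Unit = {
    // Placeholder key pair for illustration only.
    val provider = basicProvider("EXAMPLE_ACCESS_KEY_ID", "EXAMPLE_SECRET_ACCESS_KEY")
    println(provider.getCredentials.getAWSAccessKeyId)
  }
}
```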
data/README.md
CHANGED
@@ -131,6 +131,8 @@ out:
 ### Run example:
 
 ```shell
+$ ./run_s3_local.sh
+$ ./example/prepare_s3_bucket.sh
 $ ./gradlew classpath
 $ embulk run example/config.yml -Ilib
 ```
@@ -138,8 +140,7 @@ $ embulk run example/config.yml -Ilib
 ### Run test:
 
 ```shell
-
-$ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
+$ ./run_s3_local.sh
 $ ./gradlew test
 ```
 
data/build.gradle
CHANGED
@@ -3,6 +3,7 @@ plugins {
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
+    id "com.diffplug.gradle.spotless" version "3.27.1"
 }
 import com.github.jrubygradle.JRubyExec
 repositories {
@@ -13,29 +14,32 @@ configurations {
     provided
 }
 
-version = "0.
+version = "0.2.0"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
 
     compile 'org.scala-lang:scala-library:2.13.1'
     ['glue', 's3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.739"
     }
-    ['column', 'common', 'encoding', '
-        compile "org.apache.parquet:parquet-${v}:1.
+    ['column', 'common', 'encoding', 'hadoop', 'jackson'].each { v ->
+        compile "org.apache.parquet:parquet-${v}:1.11.0"
    }
+    // ref. https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/pom.xml#L85
+    compile 'org.apache.parquet:parquet-format:2.7.0'
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
     compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
+    ['test', 'standards', 'deps-buffer', 'deps-config'].each { v ->
+        testCompile "org.embulk:embulk-${v}:0.9.23"
+    }
     testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-    testCompile 'org.
-    testCompile 'org.embulk:embulk-standards:0.9.20'
-    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
+    testCompile 'org.apache.parquet:parquet-tools:1.11.0'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
 
@@ -43,6 +47,12 @@ testlogger {
     theme "mocha"
 }
 
+spotless {
+    scala {
+        scalafmt('2.3.2').configFile('.scalafmt.conf')
+    }
+}
+
 task classpath(type: Copy, dependsOn: ["jar"]) {
     doFirst { file("classpath").deleteDir() }
     from (configurations.runtime - configurations.provided + files(jar.archivePath))
data/example/config.yml
CHANGED
data/example/with_catalog.yml
CHANGED
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-6.1-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
@@ -109,8 +125,8 @@ if $darwin; then
     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
 fi
 
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
     JAVACMD=`cygpath --unix "$JAVACMD"`
@@ -138,19 +154,19 @@ if $cygwin ; then
         else
             eval `echo args$i`="\"$arg\""
         fi
-        i
+        i=`expr $i + 1`
     done
     case $i in
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
     esac
 fi
 
@@ -159,14 +175,9 @@ save () {
     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
     echo " "
 }
-APP_ARGS
+APP_ARGS=`save "$@"`
 
 # Collect all arguments for the java command, following the shell quoting and substitution rules
 eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
 
-# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
-if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
-  cd "$(dirname "$0")"
-fi
-
 exec "$JAVACMD" "$@"
data/gradlew.bat
CHANGED
@@ -1,3 +1,19 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem      https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
 @if "%DEBUG%" == "" @echo off
 @rem ##########################################################################
 @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
 @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
 
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
data/run_s3_local.sh
ADDED
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
CHANGED
@@ -1,202 +1,250 @@
 package org.embulk.output.s3_parquet
 
 import java.util.{Optional, Map => JMap}
 
-import com.amazonaws.services.glue.model.{
+import com.amazonaws.services.glue.model.{
+  Column,
+  CreateTableRequest,
+  DeleteTableRequest,
+  GetTableRequest,
+  SerDeInfo,
+  StorageDescriptor,
+  TableInput
+}
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.embulk.config.{Config, ConfigDefault, ConfigException}
 import org.embulk.output.s3_parquet.aws.Aws
 import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
 import org.embulk.spi.Schema
-import org.embulk.spi.`type`.{
+import org.embulk.spi.`type`.{
+  BooleanType,
+  DoubleType,
+  JsonType,
+  LongType,
+  StringType,
+  TimestampType,
+  Type
+}
 import org.slf4j.{Logger, LoggerFactory}
 
 import scala.jdk.CollectionConverters._
 import scala.util.Try
 
-{
+object CatalogRegistrator {
+
+  trait Task extends org.embulk.config.Task {
+
+    @Config("catalog_id")
+    @ConfigDefault("null")
+    def getCatalogId: Optional[String]
+
+    @Config("database")
+    def getDatabase: String
+
+    @Config("table")
+    def getTable: String
+
+    @Config("column_options")
+    @ConfigDefault("{}")
+    def getColumnOptions: JMap[String, ColumnOptions]
+
+    @Config("operation_if_exists")
+    @ConfigDefault("\"delete\"")
+    def getOperationIfExists: String
+  }
+
+  trait ColumnOptions {
+
+    @Config("type")
+    def getType: String
+  }
+
+  def apply(
+      aws: Aws,
+      task: Task,
+      schema: Schema,
+      location: String,
+      compressionCodec: CompressionCodecName,
+      loggerOption: Option[Logger] = None,
+      parquetColumnLogicalTypes: Map[String, String] = Map.empty
+  ): CatalogRegistrator = {
+    new CatalogRegistrator(
+      aws,
+      task,
+      schema,
+      location,
+      compressionCodec,
+      loggerOption,
+      parquetColumnLogicalTypes
+    )
+  }
 }
 
-class CatalogRegistrator(
+class CatalogRegistrator(
+    aws: Aws,
+    task: CatalogRegistrator.Task,
+    schema: Schema,
+    location: String,
+    compressionCodec: CompressionCodecName,
+    loggerOption: Option[Logger] = None,
+    parquetColumnLogicalTypes: Map[String, String] = Map.empty
+) {
+
+  val logger: Logger =
+    loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+  def run(): Unit = {
+    if (doesTableExists()) {
+      task.getOperationIfExists match {
+        case "skip" =>
+          logger.info(
+            s"Skip to register the table: ${task.getDatabase}.${task.getTable}"
+          )
+          return
+
+        case "delete" =>
+          logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+          deleteTable()
+
+        case unknown =>
+          throw new ConfigException(s"Unsupported operation: $unknown")
+      }
     }
+    registerNewParquetTable()
+    showNewTableInfo()
+  }
+
+  def showNewTableInfo(): Unit = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    val t = aws.withGlue(_.getTable(req)).getTable
+    logger.info(s"Created a table: ${t.toString}")
+  }
+
+  def doesTableExists(): Boolean = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    Try(aws.withGlue(_.getTable(req))).isSuccess
+  }
+
+  def deleteTable(): Unit = {
+    val req = new DeleteTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+    aws.withGlue(_.deleteTable(req))
+  }
+
+  def registerNewParquetTable(): Unit = {
+    logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+    val req = new CreateTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setTableInput(
+      new TableInput()
+        .withName(task.getTable)
+        .withDescription("Created by embulk-output-s3_parquet")
+        .withTableType("EXTERNAL_TABLE")
+        .withParameters(
+          Map(
+            "EXTERNAL" -> "TRUE",
+            "classification" -> "parquet",
+            "parquet.compression" -> compressionCodec.name()
+          ).asJava
+        )
+        .withStorageDescriptor(
+          new StorageDescriptor()
+            .withColumns(getGlueSchema: _*)
+            .withLocation(location)
+            .withCompressed(isCompressed)
+            .withInputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+            )
+            .withOutputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+            )
+            .withSerdeInfo(
+              new SerDeInfo()
+                .withSerializationLibrary(
+                  "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+                )
+                .withParameters(Map("serialization.format" -> "1").asJava)
+            )
+        )
+    )
+    aws.withGlue(_.createTable(req))
+  }
+
+  private def getGlueSchema: Seq[Column] = {
+    val columnOptions: Map[String, ColumnOptions] =
+      task.getColumnOptions.asScala.toMap
+    schema.getColumns.asScala.toSeq.map { c =>
+      val cType: String =
+        if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+        else if (parquetColumnLogicalTypes.contains(c.getName))
+          convertParquetLogicalTypeToGlueType(
+            parquetColumnLogicalTypes(c.getName)
+          )
+        else convertEmbulkTypeToGlueType(c.getType)
+      new Column()
+        .withName(c.getName)
+        .withType(cType)
     }
-          .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
-          .withSerdeInfo(new SerDeInfo()
-            .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
-            .withParameters(Map("serialization.format" -> "1").asJava)
-          )
-        )
-      )
-    aws.withGlue(_.createTable(req))
+  }
+
+  private def convertParquetLogicalTypeToGlueType(t: String): String = {
+    t match {
+      case "timestamp-millis" => "timestamp"
+      case "timestamp-micros" =>
+        "bigint" // Glue cannot recognize timestamp-micros.
+      case "int8" => "tinyint"
+      case "int16" => "smallint"
+      case "int32" => "int"
+      case "int64" => "bigint"
+      case "uint8" =>
+        "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+      case "uint16" =>
+        "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+      case "uint32" =>
+        "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
+      case "uint64" =>
+        throw new ConfigException(
+          "Cannot convert uint64 to Glue data types automatically" +
+            " because the Glue bigint supports a 64-bit signed integer." +
+            " Please use `catalog.column_options` to define the type."
+        )
+      case "json" => "string"
+      case _ =>
+        throw new ConfigException(
+          s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type."
+        )
     }
 
+  }
+
+  private def convertEmbulkTypeToGlueType(t: Type): String = {
+    t match {
+      case _: BooleanType => "boolean"
+      case _: LongType => "bigint"
+      case _: DoubleType => "double"
+      case _: StringType => "string"
+      case _: TimestampType => "string"
+      case _: JsonType => "string"
+      case unknown =>
+        throw new ConfigException(
+          s"Unsupported embulk type: ${unknown.getName}"
+        )
     }
+  }
 
-      case "timestamp-millis" => "timestamp"
-      case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
-      case "int8" => "tinyint"
-      case "int16" => "smallint"
-      case "int32" => "int"
-      case "int64" => "bigint"
-      case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
-      case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
-      case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
-      case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
-        " because the Glue bigint supports a 64-bit signed integer." +
-        " Please use `catalog.column_options` to define the type.")
-      case "json" => "string"
-      case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
-    }
-
-  }
-
-  private def convertEmbulkTypeToGlueType(t: Type): String =
-  {
-    t match {
-      case _: BooleanType => "boolean"
-      case _: LongType => "bigint"
-      case _: DoubleType => "double"
-      case _: StringType => "string"
-      case _: TimestampType => "string"
-      case _: JsonType => "string"
-      case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
-    }
-  }
-
-  private def isCompressed: Boolean =
-  {
-    !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
-  }
+  private def isCompressed: Boolean = {
+    !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+  }
 
 }