embulk-output-s3_parquet 0.1.0 → 0.2.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
- data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
+ metadata.gz: 69eeaa8791df4a9dce1d4746d881805e7f8c2ea4
+ data.tar.gz: 322f28022072631766fb7f862b4465f04f8f0745
  SHA512:
- metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
- data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
+ metadata.gz: 6cfbe96838e1960f5097ee9c33f78a2d02f111c9b06014954f18b7cebf97b89d265b22affd755bd1318b4a6a9e9953599aeaa013cde8bc3a7e5d91264abeed71
+ data.tar.gz: e5eac48dd2822412acff3d0612cff714d77fa9a15d8fa33a27b8d3c668f226eb1fbbdbfd4dbf6649ebc6667fcc01273d4da7a896987537d0e2fa0ca654dbbaed
data/.github/workflows/release.yml CHANGED
@@ -25,6 +25,9 @@ jobs:
  uses: actions/setup-java@v1
  with:
  java-version: 1.8
+ - name: scalafmt
+ if: github.event.pull_request.merged == true
+ run: ./gradlew spotlessCheck
  - name: Test with Gradle
  if: github.event.pull_request.merged == true
  run: ./gradlew test
data/.github/workflows/test.yml CHANGED
@@ -21,6 +21,8 @@ jobs:
  uses: actions/setup-java@v1
  with:
  java-version: 1.8
+ - name: scalafmt
+ run: ./gradlew spotlessCheck
  - name: Test with Gradle
  run: ./gradlew test

data/.scalafmt.conf CHANGED
@@ -0,0 +1,5 @@
+ # https://scalameta.org/scalafmt/#Configuration
+
+ version = "2.3.2"
+ newlines.alwaysBeforeElseAfterCurlyIf = true
+ newlines.alwaysBeforeTopLevelStatements = true
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+ 0.2.0 (2020-03-10)
+ ==================
+
+ * [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader
+ * [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly
+ * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1
+ * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0
+ * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md)
+ * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110)
+ * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739
+ * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer}
+ * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter.
+ * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI.
+ * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts.
+
  0.1.0 (2019-11-17)
  ==================

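The `[BugFix] #24 Use basic credentials correctly` entry above concerns the static-key authentication path handled by `AwsCredentials.scala` (also touched in this release). As a minimal, hedged sketch only, assuming the plugin exposes `auth_method`, `access_key_id`, and `secret_access_key` options as documented in its README (those option names are not part of this diff), a configuration exercising that path might look like:

```yaml
out:
  type: s3_parquet
  bucket: example
  path_prefix: path/to/my-obj.
  file_ext: snappy.parquet
  compression_codec: snappy
  # Assumed option names; the #24 fix is about this "basic" credentials path.
  auth_method: basic
  access_key_id: YOUR_ACCESS_KEY_ID
  secret_access_key: YOUR_SECRET_ACCESS_KEY
```

According to the changelog entry, 0.1.0 did not use basic credentials correctly; 0.2.0 fixes that.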
data/README.md CHANGED
@@ -131,6 +131,8 @@ out:
  ### Run example:

  ```shell
+ $ ./run_s3_local.sh
+ $ ./example/prepare_s3_bucket.sh
  $ ./gradlew classpath
  $ embulk run example/config.yml -Ilib
  ```
@@ -138,8 +140,7 @@ $ embulk run example/config.yml -Ilib
  ### Run test:

  ```shell
- ## Run fake S3 with localstack
- $ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
+ $ ./run_s3_local.sh
  $ ./gradlew test
  ```

data/build.gradle CHANGED
@@ -3,6 +3,7 @@ plugins {
  id "com.jfrog.bintray" version "1.1"
  id "com.github.jruby-gradle.base" version "1.5.0"
  id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
+ id "com.diffplug.gradle.spotless" version "3.27.1"
  }
  import com.github.jrubygradle.JRubyExec
  repositories {
@@ -13,29 +14,32 @@ configurations {
  provided
  }

- version = "0.1.0"
+ version = "0.2.0"

  sourceCompatibility = 1.8
  targetCompatibility = 1.8

  dependencies {
- compile "org.embulk:embulk-core:0.9.20"
- provided "org.embulk:embulk-core:0.9.20"
+ compile "org.embulk:embulk-core:0.9.23"
+ provided "org.embulk:embulk-core:0.9.23"

  compile 'org.scala-lang:scala-library:2.13.1'
  ['glue', 's3', 'sts'].each { v ->
- compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
+ compile "com.amazonaws:aws-java-sdk-${v}:1.11.739"
  }
- ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
- compile "org.apache.parquet:parquet-${v}:1.10.1"
+ ['column', 'common', 'encoding', 'hadoop', 'jackson'].each { v ->
+ compile "org.apache.parquet:parquet-${v}:1.11.0"
  }
+ // ref. https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/pom.xml#L85
+ compile 'org.apache.parquet:parquet-format:2.7.0'
  compile 'org.apache.hadoop:hadoop-common:2.9.2'
  compile 'org.xerial.snappy:snappy-java:1.1.7.3'

+ ['test', 'standards', 'deps-buffer', 'deps-config'].each { v ->
+ testCompile "org.embulk:embulk-${v}:0.9.23"
+ }
  testCompile 'org.scalatest:scalatest_2.13:3.0.8'
- testCompile 'org.embulk:embulk-test:0.9.20'
- testCompile 'org.embulk:embulk-standards:0.9.20'
- testCompile 'org.apache.parquet:parquet-tools:1.10.1'
+ testCompile 'org.apache.parquet:parquet-tools:1.11.0'
  testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
  }

@@ -43,6 +47,12 @@ testlogger {
  theme "mocha"
  }

+ spotless {
+ scala {
+ scalafmt('2.3.2').configFile('.scalafmt.conf')
+ }
+ }
+
  task classpath(type: Copy, dependsOn: ["jar"]) {
  doFirst { file("classpath").deleteDir() }
  from (configurations.runtime - configurations.provided + files(jar.archivePath))
data/example/config.yml CHANGED
@@ -17,7 +17,9 @@ in:

  out:
  type: s3_parquet
- bucket: my-bucket
+ bucket: example
+ region: us-east-1
+ endpoint: http://127.0.0.1:4572
  path_prefix: path/to/my-obj.
  file_ext: snappy.parquet
  compression_codec: snappy
data/example/prepare_s3_bucket.sh CHANGED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env bash
+
+ aws s3 mb s3://example \
+ --endpoint-url http://localhost:4572 \
+ --region us-east-1
+
data/example/with_catalog.yml CHANGED
@@ -17,7 +17,9 @@ in:

  out:
  type: s3_parquet
- bucket: dev-baikal-workspace
+ bucket: example
+ region: us-east-1
+ endpoint: http://127.0.0.1:4572
  path_prefix: path/to/my-obj-2.
  file_ext: snappy.parquet
  compression_codec: snappy
data/example/with_logicaltypes.yml CHANGED
@@ -17,7 +17,9 @@ in:

  out:
  type: s3_parquet
- bucket: my-bucket
+ bucket: example
+ region: us-east-1
+ endpoint: http://127.0.0.1:4572
  path_prefix: path/to/my-obj-2.
  file_ext: snappy.parquet
  compression_codec: snappy
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,5 +1,5 @@
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
+ distributionUrl=https\://services.gradle.org/distributions/gradle-6.1-bin.zip
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew CHANGED
@@ -1,5 +1,21 @@
  #!/usr/bin/env sh

+ #
+ # Copyright 2015 the original author or authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
  ##############################################################################
  ##
  ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
  APP_BASE_NAME=`basename "$0"`

  # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- DEFAULT_JVM_OPTS=""
+ DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'

  # Use the maximum available, or set MAX_FD != -1 to use that value.
  MAX_FD="maximum"
@@ -109,8 +125,8 @@ if $darwin; then
  GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
  fi

- # For Cygwin, switch paths to Windows format before running java
- if $cygwin ; then
+ # For Cygwin or MSYS, switch paths to Windows format before running java
+ if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
  APP_HOME=`cygpath --path --mixed "$APP_HOME"`
  CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
  JAVACMD=`cygpath --unix "$JAVACMD"`
@@ -138,19 +154,19 @@ if $cygwin ; then
  else
  eval `echo args$i`="\"$arg\""
  fi
- i=$((i+1))
+ i=`expr $i + 1`
  done
  case $i in
- (0) set -- ;;
- (1) set -- "$args0" ;;
- (2) set -- "$args0" "$args1" ;;
- (3) set -- "$args0" "$args1" "$args2" ;;
- (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+ 0) set -- ;;
+ 1) set -- "$args0" ;;
+ 2) set -- "$args0" "$args1" ;;
+ 3) set -- "$args0" "$args1" "$args2" ;;
+ 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+ 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+ 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+ 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+ 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+ 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
  esac
  fi

@@ -159,14 +175,9 @@ save () {
  for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
  echo " "
  }
- APP_ARGS=$(save "$@")
+ APP_ARGS=`save "$@"`

  # Collect all arguments for the java command, following the shell quoting and substitution rules
  eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"

- # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
- if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
- cd "$(dirname "$0")"
- fi
-
  exec "$JAVACMD" "$@"
data/gradlew.bat CHANGED
@@ -1,3 +1,19 @@
+ @rem
+ @rem Copyright 2015 the original author or authors.
+ @rem
+ @rem Licensed under the Apache License, Version 2.0 (the "License");
+ @rem you may not use this file except in compliance with the License.
+ @rem You may obtain a copy of the License at
+ @rem
+ @rem https://www.apache.org/licenses/LICENSE-2.0
+ @rem
+ @rem Unless required by applicable law or agreed to in writing, software
+ @rem distributed under the License is distributed on an "AS IS" BASIS,
+ @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ @rem See the License for the specific language governing permissions and
+ @rem limitations under the License.
+ @rem
+
  @if "%DEBUG%" == "" @echo off
  @rem ##########################################################################
  @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
  set APP_HOME=%DIRNAME%

  @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- set DEFAULT_JVM_OPTS=
+ set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"

  @rem Find java.exe
  if defined JAVA_HOME goto findJavaFromJavaHome
data/run_s3_local.sh CHANGED
@@ -0,0 +1,7 @@
+ #!/bin/sh
+
+ docker run -it -d --rm \
+ -p 4572:4572 \
+ -e SERVICES=s3 \
+ localstack/localstack
+
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala CHANGED
@@ -1,202 +1,250 @@
  package org.embulk.output.s3_parquet

-
  import java.util.{Optional, Map => JMap}

- import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+ import com.amazonaws.services.glue.model.{
+ Column,
+ CreateTableRequest,
+ DeleteTableRequest,
+ GetTableRequest,
+ SerDeInfo,
+ StorageDescriptor,
+ TableInput
+ }
  import org.apache.parquet.hadoop.metadata.CompressionCodecName
  import org.embulk.config.{Config, ConfigDefault, ConfigException}
  import org.embulk.output.s3_parquet.aws.Aws
  import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
  import org.embulk.spi.Schema
- import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+ import org.embulk.spi.`type`.{
+ BooleanType,
+ DoubleType,
+ JsonType,
+ LongType,
+ StringType,
+ TimestampType,
+ Type
+ }
  import org.slf4j.{Logger, LoggerFactory}

  import scala.jdk.CollectionConverters._
  import scala.util.Try

-
- object CatalogRegistrator
- {
- trait Task
- extends org.embulk.config.Task
- {
- @Config("catalog_id")
- @ConfigDefault("null")
- def getCatalogId: Optional[String]
-
- @Config("database")
- def getDatabase: String
-
- @Config("table")
- def getTable: String
-
- @Config("column_options")
- @ConfigDefault("{}")
- def getColumnOptions: JMap[String, ColumnOptions]
-
- @Config("operation_if_exists")
- @ConfigDefault("\"delete\"")
- def getOperationIfExists: String
- }
-
- trait ColumnOptions
- {
- @Config("type")
- def getType: String
- }
-
- def apply(aws: Aws,
- task: Task,
- schema: Schema,
- location: String,
- compressionCodec: CompressionCodecName,
- loggerOption: Option[Logger] = None,
- parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
- {
- new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
- }
+ object CatalogRegistrator {
+
+ trait Task extends org.embulk.config.Task {
+
+ @Config("catalog_id")
+ @ConfigDefault("null")
+ def getCatalogId: Optional[String]
+
+ @Config("database")
+ def getDatabase: String
+
+ @Config("table")
+ def getTable: String
+
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, ColumnOptions]
+
+ @Config("operation_if_exists")
+ @ConfigDefault("\"delete\"")
+ def getOperationIfExists: String
+ }
+
+ trait ColumnOptions {
+
+ @Config("type")
+ def getType: String
+ }
+
+ def apply(
+ aws: Aws,
+ task: Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None,
+ parquetColumnLogicalTypes: Map[String, String] = Map.empty
+ ): CatalogRegistrator = {
+ new CatalogRegistrator(
+ aws,
+ task,
+ schema,
+ location,
+ compressionCodec,
+ loggerOption,
+ parquetColumnLogicalTypes
+ )
+ }
  }

- class CatalogRegistrator(aws: Aws,
- task: CatalogRegistrator.Task,
- schema: Schema,
- location: String,
- compressionCodec: CompressionCodecName,
- loggerOption: Option[Logger] = None,
- parquetColumnLogicalTypes: Map[String, String] = Map.empty)
- {
- val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
-
- def run(): Unit =
- {
- if (doesTableExists()) {
- task.getOperationIfExists match {
- case "skip" =>
- logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
- return
-
- case "delete" =>
- logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
- deleteTable()
-
- case unknown =>
- throw new ConfigException(s"Unsupported operation: $unknown")
- }
- }
- registerNewParquetTable()
- showNewTableInfo()
+ class CatalogRegistrator(
+ aws: Aws,
+ task: CatalogRegistrator.Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None,
+ parquetColumnLogicalTypes: Map[String, String] = Map.empty
+ ) {
+
+ val logger: Logger =
+ loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+ def run(): Unit = {
+ if (doesTableExists()) {
+ task.getOperationIfExists match {
+ case "skip" =>
+ logger.info(
+ s"Skip to register the table: ${task.getDatabase}.${task.getTable}"
+ )
+ return
+
+ case "delete" =>
+ logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+ deleteTable()
+
+ case unknown =>
+ throw new ConfigException(s"Unsupported operation: $unknown")
+ }
  }
-
- def showNewTableInfo(): Unit =
- {
- val req = new GetTableRequest()
- task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
- req.setDatabaseName(task.getDatabase)
- req.setName(task.getTable)
-
- val t = aws.withGlue(_.getTable(req)).getTable
- logger.info(s"Created a table: ${t.toString}")
- }
-
- def doesTableExists(): Boolean =
- {
- val req = new GetTableRequest()
- task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
- req.setDatabaseName(task.getDatabase)
- req.setName(task.getTable)
-
- Try(aws.withGlue(_.getTable(req))).isSuccess
+ registerNewParquetTable()
+ showNewTableInfo()
+ }
+
+ def showNewTableInfo(): Unit = {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ val t = aws.withGlue(_.getTable(req)).getTable
+ logger.info(s"Created a table: ${t.toString}")
+ }
+
+ def doesTableExists(): Boolean = {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ Try(aws.withGlue(_.getTable(req))).isSuccess
+ }
+
+ def deleteTable(): Unit = {
+ val req = new DeleteTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+ aws.withGlue(_.deleteTable(req))
+ }
+
+ def registerNewParquetTable(): Unit = {
+ logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+ val req = new CreateTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setTableInput(
+ new TableInput()
+ .withName(task.getTable)
+ .withDescription("Created by embulk-output-s3_parquet")
+ .withTableType("EXTERNAL_TABLE")
+ .withParameters(
+ Map(
+ "EXTERNAL" -> "TRUE",
+ "classification" -> "parquet",
+ "parquet.compression" -> compressionCodec.name()
+ ).asJava
+ )
+ .withStorageDescriptor(
+ new StorageDescriptor()
+ .withColumns(getGlueSchema: _*)
+ .withLocation(location)
+ .withCompressed(isCompressed)
+ .withInputFormat(
+ "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+ )
+ .withOutputFormat(
+ "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+ )
+ .withSerdeInfo(
+ new SerDeInfo()
+ .withSerializationLibrary(
+ "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+ )
+ .withParameters(Map("serialization.format" -> "1").asJava)
+ )
+ )
+ )
+ aws.withGlue(_.createTable(req))
+ }
+
+ private def getGlueSchema: Seq[Column] = {
+ val columnOptions: Map[String, ColumnOptions] =
+ task.getColumnOptions.asScala.toMap
+ schema.getColumns.asScala.toSeq.map { c =>
+ val cType: String =
+ if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+ else if (parquetColumnLogicalTypes.contains(c.getName))
+ convertParquetLogicalTypeToGlueType(
+ parquetColumnLogicalTypes(c.getName)
+ )
+ else convertEmbulkTypeToGlueType(c.getType)
+ new Column()
+ .withName(c.getName)
+ .withType(cType)
  }
-
- def deleteTable(): Unit =
- {
- val req = new DeleteTableRequest()
- task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
- req.setDatabaseName(task.getDatabase)
- req.setName(task.getTable)
- aws.withGlue(_.deleteTable(req))
- }
-
- def registerNewParquetTable(): Unit =
- {
- logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
- val req = new CreateTableRequest()
- task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
- req.setDatabaseName(task.getDatabase)
- req.setTableInput(new TableInput()
- .withName(task.getTable)
- .withDescription("Created by embulk-output-s3_parquet")
- .withTableType("EXTERNAL_TABLE")
- .withParameters(Map("EXTERNAL" -> "TRUE",
- "classification" -> "parquet",
- "parquet.compression" -> compressionCodec.name()).asJava)
- .withStorageDescriptor(new StorageDescriptor()
- .withColumns(getGlueSchema: _*)
- .withLocation(location)
- .withCompressed(isCompressed)
- .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
- .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
- .withSerdeInfo(new SerDeInfo()
- .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
- .withParameters(Map("serialization.format" -> "1").asJava)
- )
- )
- )
- aws.withGlue(_.createTable(req))
+ }
+
+ private def convertParquetLogicalTypeToGlueType(t: String): String = {
+ t match {
+ case "timestamp-millis" => "timestamp"
+ case "timestamp-micros" =>
+ "bigint" // Glue cannot recognize timestamp-micros.
+ case "int8" => "tinyint"
+ case "int16" => "smallint"
+ case "int32" => "int"
+ case "int64" => "bigint"
+ case "uint8" =>
+ "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+ case "uint16" =>
+ "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+ case "uint32" =>
+ "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
+ case "uint64" =>
+ throw new ConfigException(
+ "Cannot convert uint64 to Glue data types automatically" +
+ " because the Glue bigint supports a 64-bit signed integer." +
+ " Please use `catalog.column_options` to define the type."
+ )
+ case "json" => "string"
+ case _ =>
+ throw new ConfigException(
+ s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type."
+ )
  }

- private def getGlueSchema: Seq[Column] =
- {
- val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
- schema.getColumns.asScala.toSeq.map { c =>
- val cType: String =
- if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
- else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
- else convertEmbulkTypeToGlueType(c.getType)
- new Column()
- .withName(c.getName)
- .withType(cType)
- }
+ }
+
+ private def convertEmbulkTypeToGlueType(t: Type): String = {
+ t match {
+ case _: BooleanType => "boolean"
+ case _: LongType => "bigint"
+ case _: DoubleType => "double"
+ case _: StringType => "string"
+ case _: TimestampType => "string"
+ case _: JsonType => "string"
+ case unknown =>
+ throw new ConfigException(
+ s"Unsupported embulk type: ${unknown.getName}"
+ )
  }
+ }

- private def convertParquetLogicalTypeToGlueType(t: String): String =
- {
- t match {
- case "timestamp-millis" => "timestamp"
- case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
- case "int8" => "tinyint"
- case "int16" => "smallint"
- case "int32" => "int"
- case "int64" => "bigint"
- case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
- case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
- case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
- case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
- " because the Glue bigint supports a 64-bit signed integer." +
- " Please use `catalog.column_options` to define the type.")
- case "json" => "string"
- case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
- }
-
- }
-
- private def convertEmbulkTypeToGlueType(t: Type): String =
- {
- t match {
- case _: BooleanType => "boolean"
- case _: LongType => "bigint"
- case _: DoubleType => "double"
- case _: StringType => "string"
- case _: TimestampType => "string"
- case _: JsonType => "string"
- case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
- }
- }
-
- private def isCompressed: Boolean =
- {
- !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
- }
+ private def isCompressed: Boolean = {
+ !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+ }

  }
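The error messages above direct users to `catalog.column_options` when a Glue type cannot be derived automatically (for example for `uint64`). As a hedged sketch only, assuming the output config nests the `CatalogRegistrator.Task` settings shown above (`database`, `table`, `column_options`, `operation_if_exists`) under a `catalog` block as described in the plugin README, an explicit type override might look like:

```yaml
out:
  type: s3_parquet
  # ... bucket, path_prefix, and other output settings ...
  catalog:
    database: my_database
    table: my_table
    # "my_uint64_column" is a hypothetical column name; the type string is
    # passed to Glue as-is instead of the automatic conversion above.
    column_options:
      my_uint64_column: {type: "decimal(20,0)"}
```

Per the `run()` logic above, `operation_if_exists` (default `"delete"`) decides whether an already existing table is skipped or deleted before the new Parquet table is registered.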