embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
-  data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
+  metadata.gz: 69eeaa8791df4a9dce1d4746d881805e7f8c2ea4
+  data.tar.gz: 322f28022072631766fb7f862b4465f04f8f0745
 SHA512:
-  metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
-  data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
+  metadata.gz: 6cfbe96838e1960f5097ee9c33f78a2d02f111c9b06014954f18b7cebf97b89d265b22affd755bd1318b4a6a9e9953599aeaa013cde8bc3a7e5d91264abeed71
+  data.tar.gz: e5eac48dd2822412acff3d0612cff714d77fa9a15d8fa33a27b8d3c668f226eb1fbbdbfd4dbf6649ebc6667fcc01273d4da7a896987537d0e2fa0ca654dbbaed
data/.github/workflows/release.yml CHANGED
@@ -25,6 +25,9 @@ jobs:
       uses: actions/setup-java@v1
       with:
         java-version: 1.8
+    - name: scalafmt
+      if: github.event.pull_request.merged == true
+      run: ./gradlew spotlessCheck
     - name: Test with Gradle
       if: github.event.pull_request.merged == true
      run: ./gradlew test
data/.github/workflows/test.yml CHANGED
@@ -21,6 +21,8 @@ jobs:
       uses: actions/setup-java@v1
       with:
         java-version: 1.8
+    - name: scalafmt
+      run: ./gradlew spotlessCheck
     - name: Test with Gradle
       run: ./gradlew test
 
data/.scalafmt.conf ADDED
@@ -0,0 +1,5 @@
+# https://scalameta.org/scalafmt/#Configuration
+
+version = "2.3.2"
+newlines.alwaysBeforeElseAfterCurlyIf = true
+newlines.alwaysBeforeTopLevelStatements = true
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+0.2.0 (2020-03-10)
+==================
+
+* [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader
+* [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0
+  * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md)
+  * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110)
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739
+* [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer}
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI.
+* [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts.
+
 0.1.0 (2019-11-17)
 ==================
 
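The entry for [#23] refers to the new `data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala` listed above (+18 lines), whose body is not included in this excerpt. As a rough, hypothetical sketch of the pattern such a helper usually implements — swap the thread's context class loader only around the calls that need it, then always restore the previous one — it might look like the following; the object and method names here are illustrative, not the plugin's actual API:

```scala
// Hypothetical sketch only: the real ContextClassLoaderSwapper.scala is not
// shown in this diff. The common pattern is to run a block with the class
// loader that loaded a given class, restoring the previous loader afterwards.
object ContextClassLoaderSwapperSketch {

  def usingClassLoaderOf[A](clazz: Class[_])(f: => A): A = {
    val thread = Thread.currentThread()
    val original = thread.getContextClassLoader
    thread.setContextClassLoader(clazz.getClassLoader)
    try f
    finally thread.setContextClassLoader(original)
  }
}
```

Scoping the swap to a block like this, rather than leaving a different class loader installed for the whole run, appears to be what "limit the usage of swapping ContextClassLoader" means here.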
data/README.md CHANGED
@@ -131,6 +131,8 @@ out:
 ### Run example:
 
 ```shell
+$ ./run_s3_local.sh
+$ ./example/prepare_s3_bucket.sh
 $ ./gradlew classpath
 $ embulk run example/config.yml -Ilib
 ```
@@ -138,8 +140,7 @@ $ embulk run example/config.yml -Ilib
 ### Run test:
 
 ```shell
-## Run fake S3 with localstack
-$ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
+$ ./run_s3_local.sh
 $ ./gradlew test
 ```
 
data/build.gradle CHANGED
@@ -3,6 +3,7 @@ plugins {
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
+    id "com.diffplug.gradle.spotless" version "3.27.1"
 }
 import com.github.jrubygradle.JRubyExec
 repositories {
@@ -13,29 +14,32 @@ configurations {
     provided
 }
 
-version = "0.1.0"
+version = "0.2.0"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.20"
-    provided "org.embulk:embulk-core:0.9.20"
+    compile "org.embulk:embulk-core:0.9.23"
+    provided "org.embulk:embulk-core:0.9.23"
 
     compile 'org.scala-lang:scala-library:2.13.1'
     ['glue', 's3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.739"
     }
-    ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
-        compile "org.apache.parquet:parquet-${v}:1.10.1"
+    ['column', 'common', 'encoding', 'hadoop', 'jackson'].each { v ->
+        compile "org.apache.parquet:parquet-${v}:1.11.0"
     }
+    // ref. https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/pom.xml#L85
+    compile 'org.apache.parquet:parquet-format:2.7.0'
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
     compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
+    ['test', 'standards', 'deps-buffer', 'deps-config'].each { v ->
+        testCompile "org.embulk:embulk-${v}:0.9.23"
+    }
     testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-    testCompile 'org.embulk:embulk-test:0.9.20'
-    testCompile 'org.embulk:embulk-standards:0.9.20'
-    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
+    testCompile 'org.apache.parquet:parquet-tools:1.11.0'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
 
@@ -43,6 +47,12 @@ testlogger {
     theme "mocha"
 }
 
+spotless {
+    scala {
+        scalafmt('2.3.2').configFile('.scalafmt.conf')
+    }
+}
+
 task classpath(type: Copy, dependsOn: ["jar"]) {
     doFirst { file("classpath").deleteDir() }
     from (configurations.runtime - configurations.provided + files(jar.archivePath))
data/example/config.yml CHANGED
@@ -17,7 +17,9 @@ in:
 
 out:
   type: s3_parquet
-  bucket: my-bucket
+  bucket: example
+  region: us-east-1
+  endpoint: http://127.0.0.1:4572
   path_prefix: path/to/my-obj.
   file_ext: snappy.parquet
   compression_codec: snappy
data/example/prepare_s3_bucket.sh ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+aws s3 mb s3://example \
+    --endpoint-url http://localhost:4572 \
+    --region us-east-1
+
data/example/with_catalog.yml CHANGED
@@ -17,7 +17,9 @@ in:
 
 out:
   type: s3_parquet
-  bucket: dev-baikal-workspace
+  bucket: example
+  region: us-east-1
+  endpoint: http://127.0.0.1:4572
   path_prefix: path/to/my-obj-2.
   file_ext: snappy.parquet
   compression_codec: snappy
data/example/with_logicaltypes.yml CHANGED
@@ -17,7 +17,9 @@ in:
 
 out:
   type: s3_parquet
-  bucket: my-bucket
+  bucket: example
+  region: us-east-1
+  endpoint: http://127.0.0.1:4572
   path_prefix: path/to/my-obj-2.
   file_ext: snappy.parquet
   compression_codec: snappy
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-6.1-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env sh
 
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 ##############################################################################
 ##
 ## Gradle start up script for UN*X
@@ -28,7 +44,7 @@ APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
 # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS=""
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
@@ -109,8 +125,8 @@ if $darwin; then
     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
 fi
 
-# For Cygwin, switch paths to Windows format before running java
-if $cygwin ; then
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
     JAVACMD=`cygpath --unix "$JAVACMD"`
@@ -138,19 +154,19 @@ if $cygwin ; then
         else
             eval `echo args$i`="\"$arg\""
         fi
-        i=$((i+1))
+        i=`expr $i + 1`
     done
     case $i in
-        (0) set -- ;;
-        (1) set -- "$args0" ;;
-        (2) set -- "$args0" "$args1" ;;
-        (3) set -- "$args0" "$args1" "$args2" ;;
-        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
-        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
-        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
-        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
-        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
-        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
     esac
 fi
 
@@ -159,14 +175,9 @@ save () {
     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
     echo " "
 }
-APP_ARGS=$(save "$@")
+APP_ARGS=`save "$@"`
 
 # Collect all arguments for the java command, following the shell quoting and substitution rules
 eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
 
-# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
-if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
-  cd "$(dirname "$0")"
-fi
-
 exec "$JAVACMD" "$@"
data/gradlew.bat CHANGED
@@ -1,3 +1,19 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
 @if "%DEBUG%" == "" @echo off
 @rem ##########################################################################
 @rem
@@ -14,7 +30,7 @@ set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
 @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
 
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
data/run_s3_local.sh ADDED
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+docker run -it -d --rm \
+    -p 4572:4572 \
+    -e SERVICES=s3 \
+    localstack/localstack
+
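The example configs in this diff point the output at this local endpoint (`http://127.0.0.1:4572`, region `us-east-1`, bucket `example`). As an optional sanity check that the localstack container started by `run_s3_local.sh` is reachable — this snippet is illustrative and not part of the repository — a minimal client built on the `aws-java-sdk-s3` dependency from `build.gradle` could look like:

```scala
// Illustrative only: verify the localstack S3 endpoint used by the examples.
// localstack accepts arbitrary credentials, so dummy values are fine here.
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
import com.amazonaws.services.s3.AmazonS3ClientBuilder

object CheckLocalS3 extends App {
  val s3 = AmazonS3ClientBuilder
    .standard()
    .withEndpointConfiguration(
      new EndpointConfiguration("http://127.0.0.1:4572", "us-east-1")
    )
    .withCredentials(
      new AWSStaticCredentialsProvider(new BasicAWSCredentials("dummy", "dummy"))
    )
    .enablePathStyleAccess()
    .build()

  // After ./example/prepare_s3_bucket.sh, the "example" bucket should be listed.
  s3.listBuckets().forEach(b => println(b.getName))
}
```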
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala CHANGED
@@ -1,202 +1,250 @@
 package org.embulk.output.s3_parquet
 
-
 import java.util.{Optional, Map => JMap}
 
-import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+import com.amazonaws.services.glue.model.{
+  Column,
+  CreateTableRequest,
+  DeleteTableRequest,
+  GetTableRequest,
+  SerDeInfo,
+  StorageDescriptor,
+  TableInput
+}
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.embulk.config.{Config, ConfigDefault, ConfigException}
 import org.embulk.output.s3_parquet.aws.Aws
 import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
 import org.embulk.spi.Schema
-import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+import org.embulk.spi.`type`.{
+  BooleanType,
+  DoubleType,
+  JsonType,
+  LongType,
+  StringType,
+  TimestampType,
+  Type
+}
 import org.slf4j.{Logger, LoggerFactory}
 
 import scala.jdk.CollectionConverters._
 import scala.util.Try
 
-
-object CatalogRegistrator
-{
-    trait Task
-        extends org.embulk.config.Task
-    {
-        @Config("catalog_id")
-        @ConfigDefault("null")
-        def getCatalogId: Optional[String]
-
-        @Config("database")
-        def getDatabase: String
-
-        @Config("table")
-        def getTable: String
-
-        @Config("column_options")
-        @ConfigDefault("{}")
-        def getColumnOptions: JMap[String, ColumnOptions]
-
-        @Config("operation_if_exists")
-        @ConfigDefault("\"delete\"")
-        def getOperationIfExists: String
-    }
-
-    trait ColumnOptions
-    {
-        @Config("type")
-        def getType: String
-    }
-
-    def apply(aws: Aws,
-              task: Task,
-              schema: Schema,
-              location: String,
-              compressionCodec: CompressionCodecName,
-              loggerOption: Option[Logger] = None,
-              parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
-    {
-        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
-    }
+object CatalogRegistrator {
+
+  trait Task extends org.embulk.config.Task {
+
+    @Config("catalog_id")
+    @ConfigDefault("null")
+    def getCatalogId: Optional[String]
+
+    @Config("database")
+    def getDatabase: String
+
+    @Config("table")
+    def getTable: String
+
+    @Config("column_options")
+    @ConfigDefault("{}")
+    def getColumnOptions: JMap[String, ColumnOptions]
+
+    @Config("operation_if_exists")
+    @ConfigDefault("\"delete\"")
+    def getOperationIfExists: String
+  }
+
+  trait ColumnOptions {
+
+    @Config("type")
+    def getType: String
+  }
+
+  def apply(
+      aws: Aws,
+      task: Task,
+      schema: Schema,
+      location: String,
+      compressionCodec: CompressionCodecName,
+      loggerOption: Option[Logger] = None,
+      parquetColumnLogicalTypes: Map[String, String] = Map.empty
+  ): CatalogRegistrator = {
+    new CatalogRegistrator(
+      aws,
+      task,
+      schema,
+      location,
+      compressionCodec,
+      loggerOption,
+      parquetColumnLogicalTypes
+    )
+  }
 }
 
-class CatalogRegistrator(aws: Aws,
-                         task: CatalogRegistrator.Task,
-                         schema: Schema,
-                         location: String,
-                         compressionCodec: CompressionCodecName,
-                         loggerOption: Option[Logger] = None,
-                         parquetColumnLogicalTypes: Map[String, String] = Map.empty)
-{
-    val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
-
-    def run(): Unit =
-    {
-        if (doesTableExists()) {
-            task.getOperationIfExists match {
-                case "skip" =>
-                    logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
-                    return
-
-                case "delete" =>
-                    logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
-                    deleteTable()
-
-                case unknown =>
-                    throw new ConfigException(s"Unsupported operation: $unknown")
-            }
-        }
-        registerNewParquetTable()
-        showNewTableInfo()
+class CatalogRegistrator(
+    aws: Aws,
+    task: CatalogRegistrator.Task,
+    schema: Schema,
+    location: String,
+    compressionCodec: CompressionCodecName,
+    loggerOption: Option[Logger] = None,
+    parquetColumnLogicalTypes: Map[String, String] = Map.empty
+) {
+
+  val logger: Logger =
+    loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+  def run(): Unit = {
+    if (doesTableExists()) {
+      task.getOperationIfExists match {
+        case "skip" =>
+          logger.info(
+            s"Skip to register the table: ${task.getDatabase}.${task.getTable}"
+          )
+          return
+
+        case "delete" =>
+          logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+          deleteTable()
+
+        case unknown =>
+          throw new ConfigException(s"Unsupported operation: $unknown")
+      }
     }
-
-    def showNewTableInfo(): Unit =
-    {
-        val req = new GetTableRequest()
-        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
-        req.setDatabaseName(task.getDatabase)
-        req.setName(task.getTable)
-
-        val t = aws.withGlue(_.getTable(req)).getTable
-        logger.info(s"Created a table: ${t.toString}")
-    }
-
-    def doesTableExists(): Boolean =
-    {
-        val req = new GetTableRequest()
-        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
-        req.setDatabaseName(task.getDatabase)
-        req.setName(task.getTable)
-
-        Try(aws.withGlue(_.getTable(req))).isSuccess
+    registerNewParquetTable()
+    showNewTableInfo()
+  }
+
+  def showNewTableInfo(): Unit = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    val t = aws.withGlue(_.getTable(req)).getTable
+    logger.info(s"Created a table: ${t.toString}")
+  }
+
+  def doesTableExists(): Boolean = {
+    val req = new GetTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+
+    Try(aws.withGlue(_.getTable(req))).isSuccess
+  }
+
+  def deleteTable(): Unit = {
+    val req = new DeleteTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setName(task.getTable)
+    aws.withGlue(_.deleteTable(req))
+  }
+
+  def registerNewParquetTable(): Unit = {
+    logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+    val req = new CreateTableRequest()
+    task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+    req.setDatabaseName(task.getDatabase)
+    req.setTableInput(
+      new TableInput()
+        .withName(task.getTable)
+        .withDescription("Created by embulk-output-s3_parquet")
+        .withTableType("EXTERNAL_TABLE")
+        .withParameters(
+          Map(
+            "EXTERNAL" -> "TRUE",
+            "classification" -> "parquet",
+            "parquet.compression" -> compressionCodec.name()
+          ).asJava
+        )
+        .withStorageDescriptor(
+          new StorageDescriptor()
+            .withColumns(getGlueSchema: _*)
+            .withLocation(location)
+            .withCompressed(isCompressed)
+            .withInputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+            )
+            .withOutputFormat(
+              "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+            )
+            .withSerdeInfo(
+              new SerDeInfo()
+                .withSerializationLibrary(
+                  "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+                )
+                .withParameters(Map("serialization.format" -> "1").asJava)
+            )
+        )
+    )
+    aws.withGlue(_.createTable(req))
+  }
+
+  private def getGlueSchema: Seq[Column] = {
+    val columnOptions: Map[String, ColumnOptions] =
+      task.getColumnOptions.asScala.toMap
+    schema.getColumns.asScala.toSeq.map { c =>
+      val cType: String =
+        if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+        else if (parquetColumnLogicalTypes.contains(c.getName))
+          convertParquetLogicalTypeToGlueType(
+            parquetColumnLogicalTypes(c.getName)
+          )
+        else convertEmbulkTypeToGlueType(c.getType)
+      new Column()
+        .withName(c.getName)
+        .withType(cType)
     }
-
-    def deleteTable(): Unit =
-    {
-        val req = new DeleteTableRequest()
-        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
-        req.setDatabaseName(task.getDatabase)
-        req.setName(task.getTable)
-        aws.withGlue(_.deleteTable(req))
-    }
-
-    def registerNewParquetTable(): Unit =
-    {
-        logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
-        val req = new CreateTableRequest()
-        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
-        req.setDatabaseName(task.getDatabase)
-        req.setTableInput(new TableInput()
-            .withName(task.getTable)
-            .withDescription("Created by embulk-output-s3_parquet")
-            .withTableType("EXTERNAL_TABLE")
-            .withParameters(Map("EXTERNAL" -> "TRUE",
-                "classification" -> "parquet",
-                "parquet.compression" -> compressionCodec.name()).asJava)
-            .withStorageDescriptor(new StorageDescriptor()
-                .withColumns(getGlueSchema: _*)
-                .withLocation(location)
-                .withCompressed(isCompressed)
-                .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
-                .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
-                .withSerdeInfo(new SerDeInfo()
-                    .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
-                    .withParameters(Map("serialization.format" -> "1").asJava)
-                )
-            )
-        )
-        aws.withGlue(_.createTable(req))
+  }
+
+  private def convertParquetLogicalTypeToGlueType(t: String): String = {
+    t match {
+      case "timestamp-millis" => "timestamp"
+      case "timestamp-micros" =>
+        "bigint" // Glue cannot recognize timestamp-micros.
+      case "int8" => "tinyint"
+      case "int16" => "smallint"
+      case "int32" => "int"
+      case "int64" => "bigint"
+      case "uint8" =>
+        "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+      case "uint16" =>
+        "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+      case "uint32" =>
+        "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
+      case "uint64" =>
+        throw new ConfigException(
+          "Cannot convert uint64 to Glue data types automatically" +
+            " because the Glue bigint supports a 64-bit signed integer." +
+            " Please use `catalog.column_options` to define the type."
+        )
+      case "json" => "string"
+      case _ =>
+        throw new ConfigException(
+          s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type."
+        )
     }
 
-    private def getGlueSchema: Seq[Column] =
-    {
-        val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
-        schema.getColumns.asScala.toSeq.map { c =>
-            val cType: String =
-                if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
-                else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
-                else convertEmbulkTypeToGlueType(c.getType)
-            new Column()
-                .withName(c.getName)
-                .withType(cType)
-        }
+  }
+
+  private def convertEmbulkTypeToGlueType(t: Type): String = {
+    t match {
+      case _: BooleanType => "boolean"
+      case _: LongType => "bigint"
+      case _: DoubleType => "double"
+      case _: StringType => "string"
+      case _: TimestampType => "string"
+      case _: JsonType => "string"
+      case unknown =>
+        throw new ConfigException(
+          s"Unsupported embulk type: ${unknown.getName}"
+        )
     }
+  }
 
-    private def convertParquetLogicalTypeToGlueType(t: String): String =
-    {
-        t match {
-            case "timestamp-millis" => "timestamp"
-            case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
-            case "int8" => "tinyint"
-            case "int16" => "smallint"
-            case "int32" => "int"
-            case "int64" => "bigint"
-            case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
-            case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
-            case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
-            case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
-                " because the Glue bigint supports a 64-bit signed integer." +
-                " Please use `catalog.column_options` to define the type.")
-            case "json" => "string"
-            case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
-        }
-
-    }
-
-    private def convertEmbulkTypeToGlueType(t: Type): String =
-    {
-        t match {
-            case _: BooleanType => "boolean"
-            case _: LongType => "bigint"
-            case _: DoubleType => "double"
-            case _: StringType => "string"
-            case _: TimestampType => "string"
-            case _: JsonType => "string"
-            case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
-        }
-    }
-
-    private def isCompressed: Boolean =
-    {
-        !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
-    }
+  private def isCompressed: Boolean = {
+    !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+  }
 
 }
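For reference, the reformatted `apply` above keeps the same call shape as before: the registrator is built from the plugin's `Aws` wrapper, the catalog task, the Embulk `Schema`, the table location, and the compression codec, and then `run()` is invoked. A hedged usage sketch based only on that signature — the `aws`, `task`, and `schema` values are placeholders the plugin would normally supply (presumably from `S3ParquetOutputPlugin`), and the location is assumed from the example bucket/prefix:

```scala
// Usage sketch based on the apply() signature shown in this diff; not a
// verbatim excerpt from the plugin. aws, task, and schema are placeholders.
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.embulk.output.s3_parquet.CatalogRegistrator
import org.embulk.output.s3_parquet.aws.Aws
import org.embulk.spi.Schema

object CatalogRegistrationSketch {

  def registerGlueTable(aws: Aws, task: CatalogRegistrator.Task, schema: Schema): Unit =
    CatalogRegistrator(
      aws,
      task,
      schema,
      location = "s3://example/path/to/", // assumed from the example configs
      compressionCodec = CompressionCodecName.SNAPPY
    ).run()
}
```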