embulk-output-s3_parquet 0.0.2
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.scalafmt.conf +9 -0
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +21 -0
- data/README.md +122 -0
- data/build.gradle +101 -0
- data/example/config.yml +25 -0
- data/example/data.tsv +5 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +5 -0
- data/gradlew +172 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/output/s3_parquet.rb +3 -0
- data/settings.gradle +1 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +199 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +65 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +45 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +34 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +128 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +49 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +56 -0
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +56 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +59 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +33 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +125 -0
- data/src/test/resources/org/embulk/output/s3_parquet/in1.csv +6 -0
- data/src/test/resources/org/embulk/output/s3_parquet/out1.tsv +5 -0
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +140 -0
- metadata +184 -0
data/gradlew.bat
ADDED
@@ -0,0 +1,84 @@
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windows variants

if not "%OS%" == "Windows_NT" goto win9xME_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2

:win9xME_args_slurp
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
data/settings.gradle
ADDED
@@ -0,0 +1 @@
rootProject.name = 'embulk-output-s3_parquet'
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
ADDED
@@ -0,0 +1,199 @@
package org.embulk.output.s3_parquet


import java.nio.file.{Files, Paths}
import java.util.{IllegalFormatException, Locale, Optional, List => JList, Map => JMap}

import com.amazonaws.services.s3.model.CannedAccessControlList
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.PluginTask
import org.embulk.output.s3_parquet.aws.Aws
import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
import org.embulk.spi.time.TimestampFormatter
import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
import org.embulk.spi.util.Timestamps
import org.slf4j.Logger

object S3ParquetOutputPlugin {

  trait PluginTask
    extends Task
      with TimestampFormatter.Task
      with Aws.Task {

    @Config("bucket")
    def getBucket: String

    @Config("path_prefix")
    @ConfigDefault("\"\"")
    def getPathPrefix: String

    @Config("sequence_format")
    @ConfigDefault("\"%03d.%02d.\"")
    def getSequenceFormat: String

    @Config("file_ext")
    @ConfigDefault("\"parquet\"")
    def getFileExt: String

    @Config("compression_codec")
    @ConfigDefault("\"uncompressed\"")
    def getCompressionCodecString: String

    def setCompressionCodec(v: CompressionCodecName): Unit

    def getCompressionCodec: CompressionCodecName

    @Config("column_options")
    @ConfigDefault("{}")
    def getColumnOptions: JMap[String, TimestampColumnOption]

    @Config("canned_acl")
    @ConfigDefault("\"private\"")
    def getCannedAclString: String

    def setCannedAcl(v: CannedAccessControlList): Unit

    def getCannedAcl: CannedAccessControlList

    @Config("block_size")
    @ConfigDefault("null")
    def getBlockSize: Optional[Int]

    @Config("page_size")
    @ConfigDefault("null")
    def getPageSize: Optional[Int]

    @Config("max_padding_size")
    @ConfigDefault("null")
    def getMaxPaddingSize: Optional[Int]

    @Config("enable_dictionary_encoding")
    @ConfigDefault("null")
    def getEnableDictionaryEncoding: Optional[Boolean]

    @Config("buffer_dir")
    @ConfigDefault("null")
    def getBufferDir: Optional[String]

  }

}

class S3ParquetOutputPlugin
  extends OutputPlugin {

  val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])

  private def withPluginContextClassLoader[A](f: => A): A = {
    val original: ClassLoader = Thread.currentThread.getContextClassLoader
    Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
    try f
    finally Thread.currentThread.setContextClassLoader(original)
  }

  override def transaction(config: ConfigSource,
                           schema: Schema,
                           taskCount: Int,
                           control: OutputPlugin.Control): ConfigDiff = {
    val task: PluginTask = config.loadConfig(classOf[PluginTask])

    withPluginContextClassLoader {
      configure(task, schema)
      control.run(task.dump)
    }

    Exec.newConfigDiff
  }

  private def configure(task: PluginTask,
                        schema: Schema): Unit = {
    // sequence_format
    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
    catch {
      case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
    }

    // compression_codec
    CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
      case Some(v) => task.setCompressionCodec(v)
      case None =>
        val unsupported: String = task.getCompressionCodecString
        val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
        throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
    }

    // column_options
    task.getColumnOptions.forEach { (k: String, _) =>
      val c = schema.lookupColumn(k)
      if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
    }

    // canned_acl
    CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
      case Some(v) => task.setCannedAcl(v)
      case None =>
        val unsupported: String = task.getCannedAclString
        val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
        throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
    }
  }

  override def resume(taskSource: TaskSource,
                      schema: Schema,
                      taskCount: Int,
                      control: OutputPlugin.Control): ConfigDiff = {
    throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
  }

  override def cleanup(taskSource: TaskSource,
                       schema: Schema,
                       taskCount: Int,
                       successTaskReports: JList[TaskReport]): Unit = {
    successTaskReports.forEach { tr =>
      logger.info(
        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
          + s"etag: ${tr.get(classOf[String], "etag", null)}")
    }
  }

  override def open(taskSource: TaskSource,
                    schema: Schema,
                    taskIndex: Int): TransactionalPageOutput = {
    val task = taskSource.loadTask(classOf[PluginTask])
    val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
    val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
    val destS3bucket: String = task.getBucket
    val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt

    val pageReader: PageReader = new PageReader(schema)
    val aws: Aws = Aws(task)
    val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
    val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
      .withPath(bufferFile)
      .withSchema(schema)
      .withTimestampFormatters(timestampFormatters)
      .withCompressionCodec(task.getCompressionCodec)
      .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
      .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
      .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
      .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
      .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
      .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
      .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
      .build()

    logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")

    S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
  }

}
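For reference, a minimal sketch of how `open` above assembles the destination S3 key from `path_prefix`, the default `sequence_format`, and `file_ext`; the prefix value here is hypothetical:

// Mirrors the destS3Key expression above, with task index 1 and default settings.
val pathPrefix: String = "logs/" // hypothetical path_prefix value
val destS3Key: String = pathPrefix + String.format("%03d.%02d.", 1: Integer, 0: Integer) + "parquet"
// destS3Key == "logs/001.00.parquet"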
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala
ADDED
@@ -0,0 +1,65 @@
package org.embulk.output.s3_parquet


import java.io.File
import java.nio.file.{Files, Paths}

import com.amazonaws.services.s3.transfer.{TransferManager, Upload}
import com.amazonaws.services.s3.transfer.model.UploadResult
import org.apache.parquet.hadoop.ParquetWriter
import org.embulk.config.TaskReport
import org.embulk.output.s3_parquet.aws.Aws
import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}

case class S3ParquetPageOutput(outputLocalFile: String,
                               reader: PageReader,
                               writer: ParquetWriter[PageReader],
                               aws: Aws,
                               destBucket: String,
                               destKey: String)
  extends TransactionalPageOutput {

  private var isClosed: Boolean = false

  override def add(page: Page): Unit = {
    reader.setPage(page)
    while (reader.nextRecord()) {
      writer.write(reader)
    }
  }

  override def finish(): Unit = {
  }

  override def close(): Unit = {
    synchronized {
      if (!isClosed) {
        writer.close()
        isClosed = true
      }
    }
  }

  override def abort(): Unit = {
    close()
    cleanup()
  }

  override def commit(): TaskReport = {
    close()
    val result: UploadResult = aws.withTransferManager { xfer: TransferManager =>
      val upload: Upload = xfer.upload(destBucket, destKey, new File(outputLocalFile))
      upload.waitForUploadResult()
    }
    cleanup()
    Exec.newTaskReport()
      .set("bucket", result.getBucketName)
      .set("key", result.getKey)
      .set("etag", result.getETag)
      .set("version_id", result.getVersionId)
  }

  private def cleanup(): Unit = {
    Files.delete(Paths.get(outputLocalFile))
  }
}
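A hedged sketch of the lifecycle Embulk drives on this page output; the `drive` helper and `pages` value are illustrative and not part of the plugin. Records are buffered into the local Parquet file by `add`, and `commit` closes the writer, uploads the buffer to S3, deletes it, and reports the upload metadata:

import org.embulk.config.TaskReport
import org.embulk.spi.{Page, TransactionalPageOutput}

// Illustrative driver only: Embulk's executor performs these calls itself.
def drive(output: TransactionalPageOutput, pages: Seq[Page]): TaskReport = {
  try {
    pages.foreach(output.add) // each page's records are written into the local buffer file
    output.finish()
    output.commit()           // close the writer, upload, return bucket/key/etag/version_id
  } finally {
    output.close()            // calling close() again is safe: it is guarded by isClosed
  }
}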
data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala
ADDED
@@ -0,0 +1,45 @@
package org.embulk.output.s3_parquet.aws


import com.amazonaws.client.builder.AwsClientBuilder
import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}
import com.amazonaws.services.s3.transfer.{TransferManager, TransferManagerBuilder}

object Aws {

  trait Task
    extends AwsCredentials.Task
      with AwsEndpointConfiguration.Task
      with AwsClientConfiguration.Task
      with AwsS3Configuration.Task

  def apply(task: Task): Aws = new Aws(task)

}

class Aws(task: Aws.Task) {

  def withS3[A](f: AmazonS3 => A): A = {
    val builder: AmazonS3ClientBuilder = AmazonS3ClientBuilder.standard()
    AwsS3Configuration(task).configureAmazonS3ClientBuilder(builder)
    val svc = createService(builder)
    try f(svc)
    finally svc.shutdown()
  }

  def withTransferManager[A](f: TransferManager => A): A = {
    withS3 { s3 =>
      val svc = TransferManagerBuilder.standard().withS3Client(s3).build()
      try f(svc)
      finally svc.shutdownNow(false)
    }
  }

  def createService[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): T = {
    AwsEndpointConfiguration(task).configureAwsClientBuilder(builder)
    AwsClientConfiguration(task).configureAwsClientBuilder(builder)
    builder.setCredentials(AwsCredentials(task).createAwsCredentialsProvider)

    builder.build()
  }
}
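`withS3` and `withTransferManager` lend a configured client to the given block and shut it down afterwards. A usage sketch, assuming a `task` implementing `Aws.Task` and a hypothetical bucket name:

import com.amazonaws.services.s3.AmazonS3

val aws: Aws = Aws(task) // `task` is assumed to implement Aws.Task
val exists: Boolean = aws.withS3 { s3: AmazonS3 =>
  s3.doesBucketExistV2("example-bucket") // the client is shut down once the block returns
}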
data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala
ADDED
@@ -0,0 +1,34 @@
package org.embulk.output.s3_parquet.aws


import java.util.Optional

import com.amazonaws.ClientConfiguration
import com.amazonaws.client.builder.AwsClientBuilder
import org.embulk.config.{Config, ConfigDefault}
import org.embulk.output.s3_parquet.aws.AwsClientConfiguration.Task

object AwsClientConfiguration {

  trait Task {

    @Config("http_proxy")
    @ConfigDefault("null")
    def getHttpProxy: Optional[HttpProxy.Task]

  }

  def apply(task: Task): AwsClientConfiguration = new AwsClientConfiguration(task)
}

class AwsClientConfiguration(task: Task) {

  def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit = {
    task.getHttpProxy.ifPresent { v =>
      val cc = new ClientConfiguration
      HttpProxy(v).configureClientConfiguration(cc)
      builder.setClientConfiguration(cc)
    }
  }

}
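HttpProxy.scala appears in the file list above but its body is not shown in this excerpt. As a rough sketch under that caveat, proxy settings on the SDK's `ClientConfiguration` are typically applied with its standard setters; the host and port here are illustrative:

import com.amazonaws.ClientConfiguration

val cc = new ClientConfiguration
cc.setProxyHost("proxy.example.com") // hypothetical host
cc.setProxyPort(8080)                // hypothetical port
// AwsClientConfiguration then passes `cc` to builder.setClientConfiguration(cc)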
data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala
ADDED
@@ -0,0 +1,128 @@
package org.embulk.output.s3_parquet.aws


import java.util.Optional

import com.amazonaws.auth.{AnonymousAWSCredentials, AWSCredentialsProvider, AWSStaticCredentialsProvider, BasicAWSCredentials, BasicSessionCredentials, DefaultAWSCredentialsProviderChain, EC2ContainerCredentialsProviderWrapper, EnvironmentVariableCredentialsProvider, STSAssumeRoleSessionCredentialsProvider, SystemPropertiesCredentialsProvider}
import com.amazonaws.auth.profile.{ProfileCredentialsProvider, ProfilesConfigFile}
import org.embulk.config.{Config, ConfigDefault, ConfigException}
import org.embulk.output.s3_parquet.aws.AwsCredentials.Task
import org.embulk.spi.unit.LocalFile

object AwsCredentials {

  trait Task {

    @Config("auth_method")
    @ConfigDefault("\"default\"")
    def getAuthMethod: String

    @Config("access_key_id")
    @ConfigDefault("null")
    def getAccessKeyId: Optional[String]

    @Config("secret_access_key")
    @ConfigDefault("null")
    def getSecretAccessKey: Optional[String]

    @Config("session_token")
    @ConfigDefault("null")
    def getSessionToken: Optional[String]

    @Config("profile_file")
    @ConfigDefault("null")
    def getProfileFile: Optional[LocalFile]

    @Config("profile_name")
    @ConfigDefault("\"default\"")
    def getProfileName: String

    @Config("role_arn")
    @ConfigDefault("null")
    def getRoleArn: Optional[String]

    @Config("role_session_name")
    @ConfigDefault("null")
    def getRoleSessionName: Optional[String]

    @Config("role_external_id")
    @ConfigDefault("null")
    def getRoleExternalId: Optional[String]

    @Config("role_session_duration_seconds")
    @ConfigDefault("null")
    def getRoleSessionDurationSeconds: Optional[Int]

    @Config("scope_down_policy")
    @ConfigDefault("null")
    def getScopeDownPolicy: Optional[String]

  }

  def apply(task: Task): AwsCredentials = new AwsCredentials(task)
}

class AwsCredentials(task: Task) {

  def createAwsCredentialsProvider: AWSCredentialsProvider = {
    task.getAuthMethod match {
      case "basic" =>
        new AWSStaticCredentialsProvider(new BasicAWSCredentials(
          getRequiredOption(task.getAccessKeyId, "access_key_id"),
          getRequiredOption(task.getSecretAccessKey, "secret_access_key")
        ))

      case "env" =>
        new EnvironmentVariableCredentialsProvider

      case "instance" =>
        // NOTE: combination of InstanceProfileCredentialsProvider and ContainerCredentialsProvider
        new EC2ContainerCredentialsProviderWrapper

      case "profile" =>
        if (task.getProfileFile.isPresent) {
          val pf: ProfilesConfigFile = new ProfilesConfigFile(task.getProfileFile.get().getFile)
          new ProfileCredentialsProvider(pf, task.getProfileName)
        }
        else new ProfileCredentialsProvider(task.getProfileName)

      case "properties" =>
        new SystemPropertiesCredentialsProvider

      case "anonymous" =>
        new AWSStaticCredentialsProvider(new AnonymousAWSCredentials)

      case "session" =>
        new AWSStaticCredentialsProvider(new BasicSessionCredentials(
          getRequiredOption(task.getAccessKeyId, "access_key_id"),
          getRequiredOption(task.getSecretAccessKey, "secret_access_key"),
          getRequiredOption(task.getSessionToken, "session_token")
        ))

      case "assume_role" =>
        // NOTE: Are http_proxy, endpoint, region required when assuming role?
        val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(
          getRequiredOption(task.getRoleArn, "role_arn"),
          getRequiredOption(task.getRoleSessionName, "role_session_name")
        )
        task.getRoleExternalId.ifPresent(v => builder.withExternalId(v))
        task.getRoleSessionDurationSeconds.ifPresent(v => builder.withRoleSessionDurationSeconds(v))
        task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v))

        builder.build()

      case "default" =>
        new DefaultAWSCredentialsProviderChain

      case am =>
        throw new ConfigException(s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default'].")
    }
  }

  private def getRequiredOption[A](o: Optional[A],
                                   name: String): A = {
    o.orElseThrow(() => new ConfigException(s"`$name` must be set when `auth_method` is ${task.getAuthMethod}."))
  }

}
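A usage sketch for the provider factory above, assuming a `task` whose `auth_method` is "basic"; required options fail fast while the provider is being built, not at request time:

import com.amazonaws.auth.AWSCredentialsProvider

// Throws ConfigException here if access_key_id or secret_access_key is unset.
val provider: AWSCredentialsProvider = AwsCredentials(task).createAwsCredentialsProvider
val accessKeyId: String = provider.getCredentials.getAWSAccessKeyId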