embulk-output-s3_parquet 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
@@ -11,55 +11,63 @@ import org.embulk.config.TaskReport
|
|
11
11
|
import org.embulk.output.s3_parquet.aws.Aws
|
12
12
|
import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
|
13
13
|
|
14
|
+
|
14
15
|
case class S3ParquetPageOutput(outputLocalFile: String,
|
15
16
|
reader: PageReader,
|
16
17
|
writer: ParquetWriter[PageReader],
|
17
18
|
aws: Aws,
|
18
19
|
destBucket: String,
|
19
20
|
destKey: String)
|
20
|
-
|
21
|
+
extends TransactionalPageOutput
|
22
|
+
{
|
21
23
|
|
22
|
-
|
24
|
+
private var isClosed: Boolean = false
|
23
25
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
26
|
+
/**
 * Writes every record of `page` through the Parquet writer.
 *
 * @param page the page whose records are appended to the output file
 */
override def add(page: Page): Unit =
{
  reader.setPage(page)
  // The reader is handed to the writer as the record; it is positioned on the current row.
  while (reader.nextRecord()) writer.write(reader)
}
|
29
|
-
}
|
30
33
|
|
31
|
-
|
32
|
-
|
34
|
+
/** Intentionally a no-op: this method does no work of its own. */
override def finish(): Unit = ()
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
/**
 * Closes the Parquet writer at most once.
 *
 * The check and the flag update happen under this instance's monitor, so repeated calls
 * after a successful close do nothing.
 */
override def close(): Unit = this.synchronized {
  if (!isClosed) {
    writer.close()
    // Set only after `writer.close()` returned; if it throws, the flag stays false.
    isClosed = true
  }
}
|
41
|
-
}
|
42
47
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
48
|
+
/**
 * Aborts this output: closes the writer and removes the local Parquet file.
 *
 * The cleanup runs in a `finally` so the local file is still removed when closing the
 * writer throws; the exception from `close()` is propagated afterwards.
 */
override def abort(): Unit =
{
  try close()
  finally cleanup()
}
|
47
53
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
54
|
+
/**
 * Closes the writer, uploads the local file to `destBucket`/`destKey`, deletes the local
 * file, and reports where the object was stored.
 *
 * @return a task report carrying `bucket`, `key`, `etag` and `version_id` from the upload result
 */
override def commit(): TaskReport =
{
  // Close first so the writer is done with the local file before it is uploaded.
  close()
  val uploaded: UploadResult = aws.withTransferManager { manager: TransferManager =>
    val transfer: Upload = manager.upload(destBucket, destKey, new File(outputLocalFile))
    // Wait here for the transfer's result before the transfer manager is shut down.
    transfer.waitForUploadResult()
  }
  cleanup()
  Exec.newTaskReport()
    .set("bucket", uploaded.getBucketName)
    .set("key", uploaded.getKey)
    .set("etag", uploaded.getETag)
    .set("version_id", uploaded.getVersionId)
}
|
54
|
-
cleanup()
|
55
|
-
Exec.newTaskReport()
|
56
|
-
.set("bucket", result.getBucketName)
|
57
|
-
.set("key", result.getKey)
|
58
|
-
.set("etag", result.getETag)
|
59
|
-
.set("version_id", result.getVersionId)
|
60
|
-
}
|
61
68
|
|
62
|
-
|
63
|
-
|
64
|
-
|
69
|
+
/**
 * Removes the local Parquet file at `outputLocalFile`.
 *
 * Uses `deleteIfExists` so that a repeated call does not fail with `NoSuchFileException`
 * once the file is already gone (for example if `abort()` runs after `commit()` has
 * already cleaned up). Other I/O failures still propagate.
 */
private def cleanup(): Unit =
{
  Files.deleteIfExists(Paths.get(outputLocalFile))
  ()
}
|
65
73
|
}
|
@@ -2,44 +2,62 @@ package org.embulk.output.s3_parquet.aws
|
|
2
2
|
|
3
3
|
|
4
4
|
import com.amazonaws.client.builder.AwsClientBuilder
|
5
|
+
import com.amazonaws.services.glue.{AWSGlue, AWSGlueClientBuilder}
|
5
6
|
import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}
|
6
7
|
import com.amazonaws.services.s3.transfer.{TransferManager, TransferManagerBuilder}
|
7
8
|
|
8
|
-
object Aws {
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
with AwsEndpointConfiguration.Task
|
13
|
-
with AwsClientConfiguration.Task
|
14
|
-
with AwsS3Configuration.Task
|
10
|
+
/** Companion of [[Aws]]: declares the combined task configuration and a factory. */
object Aws
{

  /** Task configuration combining the credentials, endpoint, client and S3 settings. */
  trait Task
    extends AwsCredentials.Task
      with AwsEndpointConfiguration.Task
      with AwsClientConfiguration.Task
      with AwsS3Configuration.Task

  /** Wraps `task` in a new [[Aws]] instance. */
  def apply(task: Task): Aws = new Aws(task)

}
|
19
25
|
|
20
|
-
class Aws(task: Aws.Task)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
26
|
+
class Aws(task: Aws.Task)
|
27
|
+
{
|
28
|
+
|
29
|
+
/**
 * Runs `f` with a freshly built S3 client and shuts the client down afterwards.
 *
 * @param f the work to perform with the client
 * @tparam A the result type of `f`
 * @return whatever `f` returns
 */
def withS3[A](f: AmazonS3 => A): A =
{
  val s3Builder: AmazonS3ClientBuilder = AmazonS3ClientBuilder.standard()
  // S3-specific settings first; createService then applies the common settings and builds.
  AwsS3Configuration(task).configureAmazonS3ClientBuilder(s3Builder)
  val client = createService(s3Builder)
  try {
    f(client)
  }
  finally {
    client.shutdown()
  }
}
|
37
|
+
|
38
|
+
/**
 * Runs `f` with a `TransferManager` backed by a client obtained from `withS3`.
 *
 * @param f the work to perform with the transfer manager
 * @tparam A the result type of `f`
 * @return whatever `f` returns
 */
def withTransferManager[A](f: TransferManager => A): A =
{
  withS3 { client =>
    val manager = TransferManagerBuilder.standard().withS3Client(client).build()
    try {
      f(manager)
    }
    finally {
      // `false`: do not shut down the underlying S3 client here; `withS3` does that itself.
      manager.shutdownNow(false)
    }
  }
}
|
46
|
+
|
47
|
+
/**
 * Runs `f` with a freshly built Glue client and shuts the client down afterwards.
 *
 * @param f the work to perform with the client
 * @tparam A the result type of `f`
 * @return whatever `f` returns
 */
def withGlue[A](f: AWSGlue => A): A =
{
  val glueBuilder: AWSGlueClientBuilder = AWSGlueClientBuilder.standard()
  val client = createService(glueBuilder)
  try {
    f(client)
  }
  finally {
    client.shutdown()
  }
}
|
36
|
-
}
|
37
54
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
/**
 * Applies the endpoint, client and credentials settings of `task` to `builder`, then builds
 * the service client.
 *
 * @param builder the client builder to configure; it is mutated in place
 * @tparam S the concrete builder type
 * @tparam T the client type produced by the builder
 * @return the built client
 */
def createService[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): T =
{
  AwsEndpointConfiguration(task).configureAwsClientBuilder(builder)
  AwsClientConfiguration(task).configureAwsClientBuilder(builder)

  val credentialsProvider = AwsCredentials(task).createAwsCredentialsProvider
  builder.setCredentials(credentialsProvider)

  builder.build()
}
|
45
63
|
}
|
@@ -8,27 +8,35 @@ import com.amazonaws.client.builder.AwsClientBuilder
|
|
8
8
|
import org.embulk.config.{Config, ConfigDefault}
|
9
9
|
import org.embulk.output.s3_parquet.aws.AwsClientConfiguration.Task
|
10
10
|
|
11
|
-
object AwsClientConfiguration {
|
12
11
|
|
13
|
-
|
12
|
+
object AwsClientConfiguration
|
13
|
+
{
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
def getHttpProxy: Optional[HttpProxy.Task]
|
15
|
+
/** Task options consumed by [[AwsClientConfiguration]]. */
trait Task
{

  /** Optional `http_proxy` settings; defaults to absent. */
  @Config("http_proxy")
  @ConfigDefault("null")
  def getHttpProxy: Optional[HttpProxy.Task]

}
|
23
|
+
|
24
|
+
/** Wraps `task` in a new [[AwsClientConfiguration]]. */
def apply(task: Task): AwsClientConfiguration = new AwsClientConfiguration(task)
|
22
28
|
}
|
23
29
|
|
24
|
-
class AwsClientConfiguration(task: Task)
|
30
|
+
class AwsClientConfiguration(task: Task)
|
31
|
+
{
|
25
32
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
/**
 * Installs a proxy-aware `ClientConfiguration` on `builder` when `http_proxy` is configured.
 *
 * When no proxy is configured the builder is left untouched.
 *
 * @param builder the client builder to configure; it is mutated in place
 */
def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit =
{
  task.getHttpProxy.ifPresent { proxyTask =>
    val clientConfig = new ClientConfiguration()
    HttpProxy(proxyTask).configureClientConfiguration(clientConfig)
    builder.setClientConfiguration(clientConfig)
  }
}
|
32
|
-
}
|
33
41
|
|
34
42
|
}
|
@@ -9,120 +9,129 @@ import org.embulk.config.{Config, ConfigDefault, ConfigException}
|
|
9
9
|
import org.embulk.output.s3_parquet.aws.AwsCredentials.Task
|
10
10
|
import org.embulk.spi.unit.LocalFile
|
11
11
|
|
12
|
-
object AwsCredentials {
|
13
12
|
|
14
|
-
|
13
|
+
object AwsCredentials
|
14
|
+
{
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
def getAuthMethod: String
|
16
|
+
trait Task
|
17
|
+
{
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
/** Selects how credentials are obtained; defaults to `default`. */
@Config("auth_method")
@ConfigDefault("\"default\"")
def getAuthMethod: String

/** Access key id; read when `auth_method` is `basic` or `session`. */
@Config("access_key_id")
@ConfigDefault("null")
def getAccessKeyId: Optional[String]

/** Secret access key; read when `auth_method` is `session` and named as required for `basic`. */
@Config("secret_access_key")
@ConfigDefault("null")
def getSecretAccessKey: Optional[String]

/** Session token; read when `auth_method` is `session`. */
@Config("session_token")
@ConfigDefault("null")
def getSessionToken: Optional[String]

/** Optional profiles file; read when `auth_method` is `profile`. */
@Config("profile_file")
@ConfigDefault("null")
def getProfileFile: Optional[LocalFile]

/** Profile name; read when `auth_method` is `profile`. Defaults to `default`. */
@Config("profile_name")
@ConfigDefault("\"default\"")
def getProfileName: String

/** Role ARN; required when `auth_method` is `assume_role`. */
@Config("role_arn")
@ConfigDefault("null")
def getRoleArn: Optional[String]

/** Role session name; required when `auth_method` is `assume_role`. */
@Config("role_session_name")
@ConfigDefault("null")
def getRoleSessionName: Optional[String]

/** Optional external id applied when `auth_method` is `assume_role`. */
@Config("role_external_id")
@ConfigDefault("null")
def getRoleExternalId: Optional[String]

/** Optional session duration in seconds applied when `auth_method` is `assume_role`. */
@Config("role_session_duration_seconds")
@ConfigDefault("null")
def getRoleSessionDurationSeconds: Optional[Int]

/** Optional scope-down policy applied when `auth_method` is `assume_role`. */
@Config("scope_down_policy")
@ConfigDefault("null")
def getScopeDownPolicy: Optional[String]
|
61
62
|
|
62
|
-
|
63
|
-
}
|
64
|
-
|
65
|
-
class AwsCredentials(task: Task) {
|
66
|
-
|
67
|
-
def createAwsCredentialsProvider: AWSCredentialsProvider = {
|
68
|
-
task.getAuthMethod match {
|
69
|
-
case "basic" =>
|
70
|
-
new AWSStaticCredentialsProvider(new BasicAWSCredentials(
|
71
|
-
getRequiredOption(task.getAccessKeyId, "access_key_id"),
|
72
|
-
getRequiredOption(task.getAccessKeyId, "secret_access_key")
|
73
|
-
))
|
74
|
-
|
75
|
-
case "env" =>
|
76
|
-
new EnvironmentVariableCredentialsProvider
|
63
|
+
}
|
77
64
|
|
78
|
-
|
79
|
-
|
80
|
-
new
|
65
|
+
/** Wraps `task` in a new [[AwsCredentials]]. */
def apply(task: Task): AwsCredentials = new AwsCredentials(task)
|
69
|
+
}
|
81
70
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
71
|
+
class AwsCredentials(task: Task)
|
72
|
+
{
|
73
|
+
|
74
|
+
/**
 * Builds the credentials provider selected by `auth_method`.
 *
 * @return the provider matching the configured method
 * @throws ConfigException if `auth_method` is not one of the supported values, or if an
 *                         option required by the selected method is not set
 */
def createAwsCredentialsProvider: AWSCredentialsProvider =
{
  task.getAuthMethod match {
    case "basic" =>
      new AWSStaticCredentialsProvider(new BasicAWSCredentials(
        getRequiredOption(task.getAccessKeyId, "access_key_id"),
        // The secret comes from `secret_access_key`; passing the access key id here would
        // produce credentials that can never authenticate.
        getRequiredOption(task.getSecretAccessKey, "secret_access_key")
      ))

    case "env" =>
      new EnvironmentVariableCredentialsProvider

    case "instance" =>
      // NOTE: combination of InstanceProfileCredentialsProvider and ContainerCredentialsProvider
      new EC2ContainerCredentialsProviderWrapper

    case "profile" =>
      if (task.getProfileFile.isPresent) {
        val pf: ProfilesConfigFile = new ProfilesConfigFile(task.getProfileFile.get().getFile)
        new ProfileCredentialsProvider(pf, task.getProfileName)
      }
      else new ProfileCredentialsProvider(task.getProfileName)

    case "properties" =>
      new SystemPropertiesCredentialsProvider

    case "anonymous" =>
      new AWSStaticCredentialsProvider(new AnonymousAWSCredentials)

    case "session" =>
      new AWSStaticCredentialsProvider(new BasicSessionCredentials(
        getRequiredOption(task.getAccessKeyId, "access_key_id"),
        getRequiredOption(task.getSecretAccessKey, "secret_access_key"),
        getRequiredOption(task.getSessionToken, "session_token")
      ))

    case "assume_role" =>
      // NOTE: Are http_proxy, endpoint, region required when assuming role?
      val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(
        getRequiredOption(task.getRoleArn, "role_arn"),
        getRequiredOption(task.getRoleSessionName, "role_session_name")
      )
      // The remaining role options are applied only when they are configured.
      task.getRoleExternalId.ifPresent(v => builder.withExternalId(v))
      task.getRoleSessionDurationSeconds.ifPresent(v => builder.withRoleSessionDurationSeconds(v))
      task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v))

      builder.build()

    case "default" =>
      new DefaultAWSCredentialsProviderChain

    case am =>
      throw new ConfigException(s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default'].")
  }
}
|
120
|
-
}
|
121
129
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
130
|
+
/**
 * Unwraps an option that the selected `auth_method` requires.
 *
 * @param o    the optional value read from the task
 * @param name the configuration key, used in the error message
 * @tparam A the value type
 * @return the contained value
 * @throws ConfigException if `o` is empty
 */
private def getRequiredOption[A](o: Optional[A],
                                 name: String): A =
{
  if (!o.isPresent) {
    throw new ConfigException(s"`$name` must be set when `auth_method` is ${task.getAuthMethod}.")
  }
  o.get()
}
|
126
135
|
|
127
136
|
|
128
137
|
}
|