embulk-output-s3_parquet 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
@@ -11,55 +11,63 @@ import org.embulk.config.TaskReport
|
|
11
11
|
import org.embulk.output.s3_parquet.aws.Aws
|
12
12
|
import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
|
13
13
|
|
14
|
+
|
14
15
|
case class S3ParquetPageOutput(outputLocalFile: String,
|
15
16
|
reader: PageReader,
|
16
17
|
writer: ParquetWriter[PageReader],
|
17
18
|
aws: Aws,
|
18
19
|
destBucket: String,
|
19
20
|
destKey: String)
|
20
|
-
|
21
|
+
extends TransactionalPageOutput
|
22
|
+
{
|
21
23
|
|
22
|
-
|
24
|
+
private var isClosed: Boolean = false
|
23
25
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
26
|
+
override def add(page: Page): Unit =
|
27
|
+
{
|
28
|
+
reader.setPage(page)
|
29
|
+
while (reader.nextRecord()) {
|
30
|
+
writer.write(reader)
|
31
|
+
}
|
28
32
|
}
|
29
|
-
}
|
30
33
|
|
31
|
-
|
32
|
-
|
34
|
+
override def finish(): Unit =
|
35
|
+
{
|
36
|
+
}
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
38
|
+
override def close(): Unit =
|
39
|
+
{
|
40
|
+
synchronized {
|
41
|
+
if (!isClosed) {
|
42
|
+
writer.close()
|
43
|
+
isClosed = true
|
44
|
+
}
|
45
|
+
}
|
40
46
|
}
|
41
|
-
}
|
42
47
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
48
|
+
override def abort(): Unit =
|
49
|
+
{
|
50
|
+
try close() // closing the writer may throw; the local file must still be removed
|
51
|
+
finally cleanup() // runs even when close() fails, so no temp file is left behind
|
52
|
+
}
|
47
53
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
54
|
+
override def commit(): TaskReport =
|
55
|
+
{
|
56
|
+
close()
|
57
|
+
val result: UploadResult = aws.withTransferManager { xfer: TransferManager =>
|
58
|
+
val upload: Upload = xfer.upload(destBucket, destKey, new File(outputLocalFile))
|
59
|
+
upload.waitForUploadResult()
|
60
|
+
}
|
61
|
+
cleanup()
|
62
|
+
Exec.newTaskReport()
|
63
|
+
.set("bucket", result.getBucketName)
|
64
|
+
.set("key", result.getKey)
|
65
|
+
.set("etag", result.getETag)
|
66
|
+
.set("version_id", result.getVersionId)
|
53
67
|
}
|
54
|
-
cleanup()
|
55
|
-
Exec.newTaskReport()
|
56
|
-
.set("bucket", result.getBucketName)
|
57
|
-
.set("key", result.getKey)
|
58
|
-
.set("etag", result.getETag)
|
59
|
-
.set("version_id", result.getVersionId)
|
60
|
-
}
|
61
68
|
|
62
|
-
|
63
|
-
|
64
|
-
|
69
|
+
private def cleanup(): Unit =
|
70
|
+
{
|
71
|
+
Files.deleteIfExists(Paths.get(outputLocalFile)) // tolerate an already-removed file (e.g. abort() after commit() cleaned up)
|
72
|
+
}
|
65
73
|
}
|
@@ -2,44 +2,62 @@ package org.embulk.output.s3_parquet.aws
|
|
2
2
|
|
3
3
|
|
4
4
|
import com.amazonaws.client.builder.AwsClientBuilder
|
5
|
+
import com.amazonaws.services.glue.{AWSGlue, AWSGlueClientBuilder}
|
5
6
|
import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}
|
6
7
|
import com.amazonaws.services.s3.transfer.{TransferManager, TransferManagerBuilder}
|
7
8
|
|
8
|
-
object Aws {
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
with AwsEndpointConfiguration.Task
|
13
|
-
with AwsClientConfiguration.Task
|
14
|
-
with AwsS3Configuration.Task
|
10
|
+
object Aws
|
11
|
+
{
|
15
12
|
|
16
|
-
|
13
|
+
trait Task
|
14
|
+
extends AwsCredentials.Task
|
15
|
+
with AwsEndpointConfiguration.Task
|
16
|
+
with AwsClientConfiguration.Task
|
17
|
+
with AwsS3Configuration.Task
|
18
|
+
|
19
|
+
def apply(task: Task): Aws =
|
20
|
+
{
|
21
|
+
new Aws(task)
|
22
|
+
}
|
17
23
|
|
18
24
|
}
|
19
25
|
|
20
|
-
class Aws(task: Aws.Task)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
26
|
+
class Aws(task: Aws.Task)
|
27
|
+
{
|
28
|
+
|
29
|
+
def withS3[A](f: AmazonS3 => A): A =
|
30
|
+
{
|
31
|
+
val builder: AmazonS3ClientBuilder = AmazonS3ClientBuilder.standard()
|
32
|
+
AwsS3Configuration(task).configureAmazonS3ClientBuilder(builder)
|
33
|
+
val svc = createService(builder)
|
34
|
+
try f(svc)
|
35
|
+
finally svc.shutdown()
|
36
|
+
}
|
37
|
+
|
38
|
+
def withTransferManager[A](f: TransferManager => A): A =
|
39
|
+
{
|
40
|
+
withS3 { s3 =>
|
41
|
+
val svc = TransferManagerBuilder.standard().withS3Client(s3).build()
|
42
|
+
try f(svc)
|
43
|
+
finally svc.shutdownNow(false)
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
def withGlue[A](f: AWSGlue => A): A =
|
48
|
+
{
|
49
|
+
val builder: AWSGlueClientBuilder = AWSGlueClientBuilder.standard()
|
50
|
+
val svc = createService(builder)
|
51
|
+
try f(svc)
|
52
|
+
finally svc.shutdown()
|
35
53
|
}
|
36
|
-
}
|
37
54
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
def createService[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): T =
|
56
|
+
{
|
57
|
+
AwsEndpointConfiguration(task).configureAwsClientBuilder(builder)
|
58
|
+
AwsClientConfiguration(task).configureAwsClientBuilder(builder)
|
59
|
+
builder.setCredentials(AwsCredentials(task).createAwsCredentialsProvider)
|
42
60
|
|
43
|
-
|
44
|
-
|
61
|
+
builder.build()
|
62
|
+
}
|
45
63
|
}
|
@@ -8,27 +8,35 @@ import com.amazonaws.client.builder.AwsClientBuilder
|
|
8
8
|
import org.embulk.config.{Config, ConfigDefault}
|
9
9
|
import org.embulk.output.s3_parquet.aws.AwsClientConfiguration.Task
|
10
10
|
|
11
|
-
object AwsClientConfiguration {
|
12
11
|
|
13
|
-
|
12
|
+
object AwsClientConfiguration
|
13
|
+
{
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
def getHttpProxy: Optional[HttpProxy.Task]
|
15
|
+
trait Task
|
16
|
+
{
|
18
17
|
|
19
|
-
|
18
|
+
@Config("http_proxy")
|
19
|
+
@ConfigDefault("null")
|
20
|
+
def getHttpProxy: Optional[HttpProxy.Task]
|
20
21
|
|
21
|
-
|
22
|
+
}
|
23
|
+
|
24
|
+
def apply(task: Task): AwsClientConfiguration =
|
25
|
+
{
|
26
|
+
new AwsClientConfiguration(task)
|
27
|
+
}
|
22
28
|
}
|
23
29
|
|
24
|
-
class AwsClientConfiguration(task: Task)
|
30
|
+
class AwsClientConfiguration(task: Task)
|
31
|
+
{
|
25
32
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit =
|
34
|
+
{
|
35
|
+
task.getHttpProxy.ifPresent { v =>
|
36
|
+
val cc = new ClientConfiguration
|
37
|
+
HttpProxy(v).configureClientConfiguration(cc)
|
38
|
+
builder.setClientConfiguration(cc)
|
39
|
+
}
|
31
40
|
}
|
32
|
-
}
|
33
41
|
|
34
42
|
}
|
@@ -9,120 +9,129 @@ import org.embulk.config.{Config, ConfigDefault, ConfigException}
|
|
9
9
|
import org.embulk.output.s3_parquet.aws.AwsCredentials.Task
|
10
10
|
import org.embulk.spi.unit.LocalFile
|
11
11
|
|
12
|
-
object AwsCredentials {
|
13
12
|
|
14
|
-
|
13
|
+
object AwsCredentials
|
14
|
+
{
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
def getAuthMethod: String
|
16
|
+
trait Task
|
17
|
+
{
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
@Config("auth_method")
|
20
|
+
@ConfigDefault("\"default\"")
|
21
|
+
def getAuthMethod: String
|
23
22
|
|
24
|
-
|
25
|
-
|
26
|
-
|
23
|
+
@Config("access_key_id")
|
24
|
+
@ConfigDefault("null")
|
25
|
+
def getAccessKeyId: Optional[String]
|
27
26
|
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
@Config("secret_access_key")
|
28
|
+
@ConfigDefault("null")
|
29
|
+
def getSecretAccessKey: Optional[String]
|
31
30
|
|
32
|
-
|
33
|
-
|
34
|
-
|
31
|
+
@Config("session_token")
|
32
|
+
@ConfigDefault("null")
|
33
|
+
def getSessionToken: Optional[String]
|
35
34
|
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
@Config("profile_file")
|
36
|
+
@ConfigDefault("null")
|
37
|
+
def getProfileFile: Optional[LocalFile]
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
@Config("profile_name")
|
40
|
+
@ConfigDefault("\"default\"")
|
41
|
+
def getProfileName: String
|
43
42
|
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
@Config("role_arn")
|
44
|
+
@ConfigDefault("null")
|
45
|
+
def getRoleArn: Optional[String]
|
47
46
|
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
@Config("role_session_name")
|
48
|
+
@ConfigDefault("null")
|
49
|
+
def getRoleSessionName: Optional[String]
|
51
50
|
|
52
|
-
|
53
|
-
|
54
|
-
|
51
|
+
@Config("role_external_id")
|
52
|
+
@ConfigDefault("null")
|
53
|
+
def getRoleExternalId: Optional[String]
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
55
|
+
@Config("role_session_duration_seconds")
|
56
|
+
@ConfigDefault("null")
|
57
|
+
def getRoleSessionDurationSeconds: Optional[Int]
|
59
58
|
|
60
|
-
|
59
|
+
@Config("scope_down_policy")
|
60
|
+
@ConfigDefault("null")
|
61
|
+
def getScopeDownPolicy: Optional[String]
|
61
62
|
|
62
|
-
|
63
|
-
}
|
64
|
-
|
65
|
-
class AwsCredentials(task: Task) {
|
66
|
-
|
67
|
-
def createAwsCredentialsProvider: AWSCredentialsProvider = {
|
68
|
-
task.getAuthMethod match {
|
69
|
-
case "basic" =>
|
70
|
-
new AWSStaticCredentialsProvider(new BasicAWSCredentials(
|
71
|
-
getRequiredOption(task.getAccessKeyId, "access_key_id"),
|
72
|
-
getRequiredOption(task.getAccessKeyId, "secret_access_key")
|
73
|
-
))
|
74
|
-
|
75
|
-
case "env" =>
|
76
|
-
new EnvironmentVariableCredentialsProvider
|
63
|
+
}
|
77
64
|
|
78
|
-
|
79
|
-
|
80
|
-
new
|
65
|
+
def apply(task: Task): AwsCredentials =
|
66
|
+
{
|
67
|
+
new AwsCredentials(task)
|
68
|
+
}
|
69
|
+
}
|
81
70
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
71
|
+
class AwsCredentials(task: Task)
|
72
|
+
{
|
73
|
+
|
74
|
+
def createAwsCredentialsProvider: AWSCredentialsProvider =
|
75
|
+
{
|
76
|
+
task.getAuthMethod match {
|
77
|
+
case "basic" =>
|
78
|
+
new AWSStaticCredentialsProvider(new BasicAWSCredentials(
|
79
|
+
getRequiredOption(task.getAccessKeyId, "access_key_id"),
|
80
|
+
getRequiredOption(task.getSecretAccessKey, "secret_access_key") // secret must come from `secret_access_key`, not the access key id
|
81
|
+
))
|
82
|
+
|
83
|
+
case "env" =>
|
84
|
+
new EnvironmentVariableCredentialsProvider
|
85
|
+
|
86
|
+
case "instance" =>
|
87
|
+
// NOTE: combination of InstanceProfileCredentialsProvider and ContainerCredentialsProvider
|
88
|
+
new EC2ContainerCredentialsProviderWrapper
|
89
|
+
|
90
|
+
case "profile" =>
|
91
|
+
if (task.getProfileFile.isPresent) {
|
92
|
+
val pf: ProfilesConfigFile = new ProfilesConfigFile(task.getProfileFile.get().getFile)
|
93
|
+
new ProfileCredentialsProvider(pf, task.getProfileName)
|
94
|
+
}
|
95
|
+
else new ProfileCredentialsProvider(task.getProfileName)
|
96
|
+
|
97
|
+
case "properties" =>
|
98
|
+
new SystemPropertiesCredentialsProvider
|
99
|
+
|
100
|
+
case "anonymous" =>
|
101
|
+
new AWSStaticCredentialsProvider(new AnonymousAWSCredentials)
|
102
|
+
|
103
|
+
case "session" =>
|
104
|
+
new AWSStaticCredentialsProvider(new BasicSessionCredentials(
|
105
|
+
getRequiredOption(task.getAccessKeyId, "access_key_id"),
|
106
|
+
getRequiredOption(task.getSecretAccessKey, "secret_access_key"),
|
107
|
+
getRequiredOption(task.getSessionToken, "session_token")
|
108
|
+
))
|
109
|
+
|
110
|
+
case "assume_role" =>
|
111
|
+
// NOTE: Are http_proxy, endpoint, region required when assuming role?
|
112
|
+
val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(
|
113
|
+
getRequiredOption(task.getRoleArn, "role_arn"),
|
114
|
+
getRequiredOption(task.getRoleSessionName, "role_session_name")
|
115
|
+
)
|
116
|
+
task.getRoleExternalId.ifPresent(v => builder.withExternalId(v))
|
117
|
+
task.getRoleSessionDurationSeconds.ifPresent(v => builder.withRoleSessionDurationSeconds(v))
|
118
|
+
task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v))
|
119
|
+
|
120
|
+
builder.build()
|
121
|
+
|
122
|
+
case "default" =>
|
123
|
+
new DefaultAWSCredentialsProviderChain
|
124
|
+
|
125
|
+
case am =>
|
126
|
+
throw new ConfigException(s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default'].")
|
86
127
|
}
|
87
|
-
else new ProfileCredentialsProvider(task.getProfileName)
|
88
|
-
|
89
|
-
case "properties" =>
|
90
|
-
new SystemPropertiesCredentialsProvider
|
91
|
-
|
92
|
-
case "anonymous" =>
|
93
|
-
new AWSStaticCredentialsProvider(new AnonymousAWSCredentials)
|
94
|
-
|
95
|
-
case "session" =>
|
96
|
-
new AWSStaticCredentialsProvider(new BasicSessionCredentials(
|
97
|
-
getRequiredOption(task.getAccessKeyId, "access_key_id"),
|
98
|
-
getRequiredOption(task.getSecretAccessKey, "secret_access_key"),
|
99
|
-
getRequiredOption(task.getSessionToken, "session_token")
|
100
|
-
))
|
101
|
-
|
102
|
-
case "assume_role" =>
|
103
|
-
// NOTE: Are http_proxy, endpoint, region required when assuming role?
|
104
|
-
val builder = new STSAssumeRoleSessionCredentialsProvider.Builder(
|
105
|
-
getRequiredOption(task.getRoleArn, "role_arn"),
|
106
|
-
getRequiredOption(task.getRoleSessionName, "role_session_name")
|
107
|
-
)
|
108
|
-
task.getRoleExternalId.ifPresent(v => builder.withExternalId(v))
|
109
|
-
task.getRoleSessionDurationSeconds.ifPresent(v => builder.withRoleSessionDurationSeconds(v))
|
110
|
-
task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v))
|
111
|
-
|
112
|
-
builder.build()
|
113
|
-
|
114
|
-
case "default" =>
|
115
|
-
new DefaultAWSCredentialsProviderChain
|
116
|
-
|
117
|
-
case am =>
|
118
|
-
throw new ConfigException(s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default'].")
|
119
128
|
}
|
120
|
-
}
|
121
129
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
130
|
+
private def getRequiredOption[A](o: Optional[A],
|
131
|
+
name: String): A =
|
132
|
+
{
|
133
|
+
o.orElseThrow(() => new ConfigException(s"`$name` must be set when `auth_method` is ${task.getAuthMethod}."))
|
134
|
+
}
|
126
135
|
|
127
136
|
|
128
137
|
}
|