embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.aws
 
-
 import java.util.Optional
 
 import com.amazonaws.client.builder.AwsClientBuilder
@@ -11,47 +10,45 @@ import org.embulk.output.s3_parquet.aws.AwsEndpointConfiguration.Task
 
 import scala.util.Try
 
+object AwsEndpointConfiguration {
 
-object AwsEndpointConfiguration
-{
-
-    trait Task
-    {
+  trait Task {
 
-        @Config("endpoint")
-        @ConfigDefault("null")
-        def getEndpoint: Optional[String]
+    @Config("endpoint")
+    @ConfigDefault("null")
+    def getEndpoint: Optional[String]
 
-        @Config("region")
-        @ConfigDefault("null")
-        def getRegion: Optional[String]
+    @Config("region")
+    @ConfigDefault("null")
+    def getRegion: Optional[String]
 
-    }
+  }
 
-    def apply(task: Task): AwsEndpointConfiguration =
-    {
-        new AwsEndpointConfiguration(task)
-    }
+  def apply(task: Task): AwsEndpointConfiguration = {
+    new AwsEndpointConfiguration(task)
+  }
 }
 
-class AwsEndpointConfiguration(task: Task)
-{
-
-    def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit =
-    {
-        if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
-            val ec = new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
-            builder.setEndpointConfiguration(ec)
-        }
-        else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
-            builder.setRegion(task.getRegion.get)
-        }
-        else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
-            val r: String = Try(new DefaultAwsRegionProviderChain().getRegion).getOrElse(Regions.DEFAULT_REGION.getName)
-            val e: String = task.getEndpoint.get
-            val ec = new EndpointConfiguration(e, r)
-            builder.setEndpointConfiguration(ec)
-        }
+class AwsEndpointConfiguration(task: Task) {
+
+  def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](
+      builder: AwsClientBuilder[S, T]
+  ): Unit = {
+    if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
+      val ec =
+        new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
+      builder.setEndpointConfiguration(ec)
+    }
+    else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
+      builder.setRegion(task.getRegion.get)
+    }
+    else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
+      val r: String = Try(new DefaultAwsRegionProviderChain().getRegion)
+        .getOrElse(Regions.DEFAULT_REGION.getName)
+      val e: String = task.getEndpoint.get
+      val ec = new EndpointConfiguration(e, r)
+      builder.setEndpointConfiguration(ec)
    }
+  }
 
 }
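The endpoint resolution logic is unchanged by the reformat: with both `endpoint` and `region` set, the client is pinned to that pair; `region` alone sets only the region; `endpoint` alone is paired with the region resolved from the default provider chain, falling back to the SDK's default region. A minimal sketch of the both-set case against a plain AWS SDK v1 builder (the local endpoint URL here is a made-up example):

```scala
import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
import com.amazonaws.services.s3.AmazonS3ClientBuilder

// Equivalent of setting both `endpoint` and `region` in the plugin config:
// the client talks to the explicit endpoint instead of the public AWS one.
val s3 = AmazonS3ClientBuilder
  .standard()
  .withEndpointConfiguration(
    new EndpointConfiguration("http://127.0.0.1:4572", "us-east-1")
  )
  .build()
```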
data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala

@@ -1,64 +1,68 @@
 package org.embulk.output.s3_parquet.aws
 
-
 import java.util.Optional
 
 import com.amazonaws.services.s3.AmazonS3ClientBuilder
 import org.embulk.config.{Config, ConfigDefault}
 import org.embulk.output.s3_parquet.aws.AwsS3Configuration.Task
 
-
 /*
 * These are advanced settings, so write no documentation.
 */
-object AwsS3Configuration
-{
-    trait Task
-    {
+object AwsS3Configuration {
 
-        @Config("accelerate_mode_enabled")
-        @ConfigDefault("null")
-        def getAccelerateModeEnabled: Optional[Boolean]
+  trait Task {
 
-        @Config("chunked_encoding_disabled")
-        @ConfigDefault("null")
-        def getChunkedEncodingDisabled: Optional[Boolean]
+    @Config("accelerate_mode_enabled")
+    @ConfigDefault("null")
+    def getAccelerateModeEnabled: Optional[Boolean]
 
-        @Config("dualstack_enabled")
-        @ConfigDefault("null")
-        def getDualstackEnabled: Optional[Boolean]
+    @Config("chunked_encoding_disabled")
+    @ConfigDefault("null")
+    def getChunkedEncodingDisabled: Optional[Boolean]
 
-        @Config("force_global_bucket_access_enabled")
-        @ConfigDefault("null")
-        def getForceGlobalBucketAccessEnabled: Optional[Boolean]
+    @Config("dualstack_enabled")
+    @ConfigDefault("null")
+    def getDualstackEnabled: Optional[Boolean]
 
-        @Config("path_style_access_enabled")
-        @ConfigDefault("null")
-        def getPathStyleAccessEnabled: Optional[Boolean]
+    @Config("force_global_bucket_access_enabled")
+    @ConfigDefault("null")
+    def getForceGlobalBucketAccessEnabled: Optional[Boolean]
 
-        @Config("payload_signing_enabled")
-        @ConfigDefault("null")
-        def getPayloadSigningEnabled: Optional[Boolean]
+    @Config("path_style_access_enabled")
+    @ConfigDefault("null")
+    def getPathStyleAccessEnabled: Optional[Boolean]
 
-    }
+    @Config("payload_signing_enabled")
+    @ConfigDefault("null")
+    def getPayloadSigningEnabled: Optional[Boolean]
 
-    def apply(task: Task): AwsS3Configuration =
-    {
-        new AwsS3Configuration(task)
-    }
-}
+  }
 
-class AwsS3Configuration(task: Task)
-{
+  def apply(task: Task): AwsS3Configuration = {
+    new AwsS3Configuration(task)
+  }
+}
 
-    def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit =
-    {
-        task.getAccelerateModeEnabled.ifPresent(v => builder.setAccelerateModeEnabled(v))
-        task.getChunkedEncodingDisabled.ifPresent(v => builder.setChunkedEncodingDisabled(v))
-        task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
-        task.getForceGlobalBucketAccessEnabled.ifPresent(v => builder.setForceGlobalBucketAccessEnabled(v))
-        task.getPathStyleAccessEnabled.ifPresent(v => builder.setPathStyleAccessEnabled(v))
-        task.getPayloadSigningEnabled.ifPresent(v => builder.setPayloadSigningEnabled(v))
-    }
+class AwsS3Configuration(task: Task) {
+
+  def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit = {
+    task.getAccelerateModeEnabled.ifPresent(v =>
+      builder.setAccelerateModeEnabled(v)
+    )
+    task.getChunkedEncodingDisabled.ifPresent(v =>
+      builder.setChunkedEncodingDisabled(v)
+    )
+    task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
+    task.getForceGlobalBucketAccessEnabled.ifPresent(v =>
+      builder.setForceGlobalBucketAccessEnabled(v)
+    )
+    task.getPathStyleAccessEnabled.ifPresent(v =>
+      builder.setPathStyleAccessEnabled(v)
+    )
+    task.getPayloadSigningEnabled.ifPresent(v =>
+      builder.setPayloadSigningEnabled(v)
+    )
+  }
 
 }
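Every option above defaults to `null` and maps one-to-one onto an `AmazonS3ClientBuilder` setter, applied only when explicitly configured. A minimal sketch of what a single option amounts to in plain SDK v1 terms (the `true` value is hypothetical):

```scala
import com.amazonaws.services.s3.AmazonS3ClientBuilder

val builder = AmazonS3ClientBuilder.standard()
// `path_style_access_enabled: true` in the Embulk config resolves to:
builder.setPathStyleAccessEnabled(true)
// Options left unset are never applied, so the SDK defaults stay in effect.
```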
data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala

@@ -1,64 +1,61 @@
 package org.embulk.output.s3_parquet.aws
 
-
 import java.util.Optional
 
 import com.amazonaws.{ClientConfiguration, Protocol}
 import org.embulk.config.{Config, ConfigDefault, ConfigException}
 import org.embulk.output.s3_parquet.aws.HttpProxy.Task
 
+object HttpProxy {
 
-object HttpProxy
-{
-
-    trait Task
-    {
+  trait Task {
 
-        @Config("host")
-        @ConfigDefault("null")
-        def getHost: Optional[String]
+    @Config("host")
+    @ConfigDefault("null")
+    def getHost: Optional[String]
 
-        @Config("port")
-        @ConfigDefault("null")
-        def getPort: Optional[Int]
+    @Config("port")
+    @ConfigDefault("null")
+    def getPort: Optional[Int]
 
-        @Config("protocol")
-        @ConfigDefault("\"https\"")
-        def getProtocol: String
+    @Config("protocol")
+    @ConfigDefault("\"https\"")
+    def getProtocol: String
 
-        @Config("user")
-        @ConfigDefault("null")
-        def getUser: Optional[String]
+    @Config("user")
+    @ConfigDefault("null")
+    def getUser: Optional[String]
 
-        @Config("password")
-        @ConfigDefault("null")
-        def getPassword: Optional[String]
+    @Config("password")
+    @ConfigDefault("null")
+    def getPassword: Optional[String]
 
-    }
+  }
 
-    def apply(task: Task): HttpProxy =
-    {
-        new HttpProxy(task)
-    }
+  def apply(task: Task): HttpProxy = {
+    new HttpProxy(task)
+  }
 
 }
 
-class HttpProxy(task: Task)
-{
-
-    def configureClientConfiguration(cc: ClientConfiguration): Unit =
-    {
-        task.getHost.ifPresent(v => cc.setProxyHost(v))
-        task.getPort.ifPresent(v => cc.setProxyPort(v))
-
-        Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
-            case Some(v) =>
-                cc.setProtocol(v)
-            case None =>
-                throw new ConfigException(s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values.map(v => s"'$v'").mkString(", ")}].")
-        }
-
-        task.getUser.ifPresent(v => cc.setProxyUsername(v))
-        task.getPassword.ifPresent(v => cc.setProxyPassword(v))
+class HttpProxy(task: Task) {
+
+  def configureClientConfiguration(cc: ClientConfiguration): Unit = {
+    task.getHost.ifPresent(v => cc.setProxyHost(v))
+    task.getPort.ifPresent(v => cc.setProxyPort(v))
+
+    Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
+      case Some(v) =>
+        cc.setProtocol(v)
+      case None =>
+        throw new ConfigException(
+          s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values
+            .map(v => s"'$v'")
+            .mkString(", ")}]."
+        )
     }
+
+    task.getUser.ifPresent(v => cc.setProxyUsername(v))
+    task.getPassword.ifPresent(v => cc.setProxyPassword(v))
+  }
 }
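The behavior is unchanged by the reformat: `host` and `port` are applied when present, `protocol` is validated against `Protocol.values` (anything unsupported raises a `ConfigException`), and proxy credentials are applied last. A sketch of the equivalent direct SDK v1 calls, with made-up proxy values:

```scala
import com.amazonaws.{ClientConfiguration, Protocol}

// Hypothetical proxy settings mirroring the `host`, `port`, and `protocol`
// task fields; "https" is the plugin's default protocol.
val cc = new ClientConfiguration()
cc.setProxyHost("proxy.example.com")
cc.setProxyPort(8080)
cc.setProtocol(Protocol.HTTPS)
```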
data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala

@@ -1,107 +1,153 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import com.google.common.collect.ImmutableList
-import org.apache.parquet.schema.{MessageType, OriginalType, PrimitiveType, Type}
+import org.apache.parquet.schema.{
+  MessageType,
+  OriginalType,
+  PrimitiveType,
+  Type
+}
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.embulk.spi.{Column, ColumnVisitor, Schema}
 
+object EmbulkMessageType {
+
+  def builder(): Builder = {
+    Builder()
+  }
+
+  case class Builder(
+      name: String = "embulk",
+      schema: Schema = Schema.builder().build(),
+      logicalTypeHandlers: LogicalTypeHandlerStore =
+        LogicalTypeHandlerStore.empty
+  ) {
+
+    def withName(name: String): Builder = {
+      Builder(
+        name = name,
+        schema = schema,
+        logicalTypeHandlers = logicalTypeHandlers
+      )
+    }
+
+    def withSchema(schema: Schema): Builder = {
+      Builder(
+        name = name,
+        schema = schema,
+        logicalTypeHandlers = logicalTypeHandlers
+      )
+    }
+
+    def withLogicalTypeHandlers(
+        logicalTypeHandlers: LogicalTypeHandlerStore
+    ): Builder = {
+      Builder(
+        name = name,
+        schema = schema,
+        logicalTypeHandlers = logicalTypeHandlers
+      )
+    }
+
+    def build(): MessageType = {
+      val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
+      schema.visitColumns(
+        EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers)
+      )
+      new MessageType("embulk", builder.build())
+    }
+
+  }
+
+  private case class EmbulkMessageTypeColumnVisitor(
+      builder: ImmutableList.Builder[Type],
+      logicalTypeHandlers: LogicalTypeHandlerStore =
+        LogicalTypeHandlerStore.empty
+  ) extends ColumnVisitor {
+
+    override def booleanColumn(column: Column): Unit = {
+      builder.add(
+        new PrimitiveType(
+          Type.Repetition.OPTIONAL,
+          PrimitiveTypeName.BOOLEAN,
+          column.getName
+        )
+      )
+    }
 
-object EmbulkMessageType
-{
+    override def longColumn(column: Column): Unit = {
+      val name = column.getName
+      val et = column.getType
+
+      val t = logicalTypeHandlers.get(name, et) match {
+        case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+        case _ =>
+          new PrimitiveType(
+            Type.Repetition.OPTIONAL,
+            PrimitiveTypeName.INT64,
+            column.getName
+          )
+      }
+
+      builder.add(t)
+    }
 
-    def builder(): Builder =
-    {
-        Builder()
+    override def doubleColumn(column: Column): Unit = {
+      builder.add(
+        new PrimitiveType(
+          Type.Repetition.OPTIONAL,
+          PrimitiveTypeName.DOUBLE,
+          column.getName
+        )
+      )
     }
 
-    case class Builder(name: String = "embulk",
-                       schema: Schema = Schema.builder().build(),
-                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-    {
-
-        def withName(name: String): Builder =
-        {
-            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
-        }
-
-        def withSchema(schema: Schema): Builder =
-        {
-            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
-        }
-
-        def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
-        {
-            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
-        }
-
-        def build(): MessageType =
-        {
-            val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
-            schema.visitColumns(EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers))
-            new MessageType("embulk", builder.build())
-        }
+    override def stringColumn(column: Column): Unit = {
+      builder.add(
+        new PrimitiveType(
+          Type.Repetition.OPTIONAL,
+          PrimitiveTypeName.BINARY,
+          column.getName,
+          OriginalType.UTF8
+        )
+      )
+    }
 
+    override def timestampColumn(column: Column): Unit = {
+      val name = column.getName
+      val et = column.getType
+
+      val t = logicalTypeHandlers.get(name, et) match {
+        case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+        case _ =>
+          new PrimitiveType(
+            Type.Repetition.OPTIONAL,
+            PrimitiveTypeName.BINARY,
+            name,
+            OriginalType.UTF8
+          )
+      }
+
+      builder.add(t)
     }
 
-    private case class EmbulkMessageTypeColumnVisitor(builder: ImmutableList.Builder[Type],
-                                                      logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-        extends ColumnVisitor
-    {
-
-        override def booleanColumn(column: Column): Unit =
-        {
-            builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BOOLEAN, column.getName))
-        }
-
-        override def longColumn(column: Column): Unit =
-        {
-            val name = column.getName
-            val et = column.getType
-
-            val t = logicalTypeHandlers.get(name, et) match {
-                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
-                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, column.getName)
-            }
-
-            builder.add(t)
-        }
-
-        override def doubleColumn(column: Column): Unit =
-        {
-            builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, column.getName))
-        }
-
-        override def stringColumn(column: Column): Unit =
-        {
-            builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName, OriginalType.UTF8))
-        }
-
-        override def timestampColumn(column: Column): Unit =
-        {
-            val name = column.getName
-            val et = column.getType
-
-            val t = logicalTypeHandlers.get(name, et) match {
-                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
-                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
-            }
-
-            builder.add(t)
-        }
-
-        override def jsonColumn(column: Column): Unit =
-        {
-            val name = column.getName
-            val et = column.getType
-
-            val t = logicalTypeHandlers.get(name, et) match {
-                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
-                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
-            }
-
-            builder.add(t)
-        }
+    override def jsonColumn(column: Column): Unit = {
+      val name = column.getName
+      val et = column.getType
+
+      val t = logicalTypeHandlers.get(name, et) match {
+        case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+        case _ =>
+          new PrimitiveType(
+            Type.Repetition.OPTIONAL,
+            PrimitiveTypeName.BINARY,
+            name,
+            OriginalType.UTF8
+          )
+      }
+
+      builder.add(t)
     }
+  }
 
-}
+}
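`EmbulkMessageType.Builder` is an immutable builder: each `with*` call returns a fresh `Builder`, and `build()` walks the Embulk schema with a `ColumnVisitor`, emitting one `OPTIONAL` Parquet field per column (INT64 for long columns; BINARY with UTF8 for string, timestamp, and json) unless a registered logical type handler overrides the mapping. A usage sketch, assuming a hypothetical two-column schema:

```scala
import org.apache.parquet.schema.MessageType
import org.embulk.output.s3_parquet.parquet.{
  EmbulkMessageType,
  LogicalTypeHandlerStore
}
import org.embulk.spi.Schema
import org.embulk.spi.type.Types

// Hypothetical two-column Embulk schema.
val schema: Schema = Schema
  .builder()
  .add("id", Types.LONG)
  .add("name", Types.STRING)
  .build()

// With no logical type handlers, "id" becomes INT64 and "name" BINARY/UTF8.
val messageType: MessageType = EmbulkMessageType
  .builder()
  .withSchema(schema)
  .withLogicalTypeHandlers(LogicalTypeHandlerStore.empty)
  .build()
```

Note that `build()` passes the literal "embulk" as the message name, so `withName` does not affect the emitted schema.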