embulk-output-s3_parquet 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
@@ -1,6 +1,5 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.client.builder.AwsClientBuilder
@@ -11,47 +10,45 @@ import org.embulk.output.s3_parquet.aws.AwsEndpointConfiguration.Task
11
10
 
12
11
  import scala.util.Try
13
12
 
13
+ object AwsEndpointConfiguration {
14
14
 
15
- object AwsEndpointConfiguration
16
- {
17
-
18
- trait Task
19
- {
15
+ trait Task {
20
16
 
21
- @Config("endpoint")
22
- @ConfigDefault("null")
23
- def getEndpoint: Optional[String]
17
+ @Config("endpoint")
18
+ @ConfigDefault("null")
19
+ def getEndpoint: Optional[String]
24
20
 
25
- @Config("region")
26
- @ConfigDefault("null")
27
- def getRegion: Optional[String]
21
+ @Config("region")
22
+ @ConfigDefault("null")
23
+ def getRegion: Optional[String]
28
24
 
29
- }
25
+ }
30
26
 
31
- def apply(task: Task): AwsEndpointConfiguration =
32
- {
33
- new AwsEndpointConfiguration(task)
34
- }
27
+ def apply(task: Task): AwsEndpointConfiguration = {
28
+ new AwsEndpointConfiguration(task)
29
+ }
35
30
  }
36
31
 
37
- class AwsEndpointConfiguration(task: Task)
38
- {
39
-
40
- def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit =
41
- {
42
- if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
43
- val ec = new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
44
- builder.setEndpointConfiguration(ec)
45
- }
46
- else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
47
- builder.setRegion(task.getRegion.get)
48
- }
49
- else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
50
- val r: String = Try(new DefaultAwsRegionProviderChain().getRegion).getOrElse(Regions.DEFAULT_REGION.getName)
51
- val e: String = task.getEndpoint.get
52
- val ec = new EndpointConfiguration(e, r)
53
- builder.setEndpointConfiguration(ec)
54
- }
32
+ class AwsEndpointConfiguration(task: Task) {
33
+
34
+ def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](
35
+ builder: AwsClientBuilder[S, T]
36
+ ): Unit = {
37
+ if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
38
+ val ec =
39
+ new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
40
+ builder.setEndpointConfiguration(ec)
41
+ }
42
+ else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
43
+ builder.setRegion(task.getRegion.get)
44
+ }
45
+ else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
46
+ val r: String = Try(new DefaultAwsRegionProviderChain().getRegion)
47
+ .getOrElse(Regions.DEFAULT_REGION.getName)
48
+ val e: String = task.getEndpoint.get
49
+ val ec = new EndpointConfiguration(e, r)
50
+ builder.setEndpointConfiguration(ec)
55
51
  }
52
+ }
56
53
 
57
54
  }
@@ -1,64 +1,68 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.services.s3.AmazonS3ClientBuilder
7
6
  import org.embulk.config.{Config, ConfigDefault}
8
7
  import org.embulk.output.s3_parquet.aws.AwsS3Configuration.Task
9
8
 
10
-
11
9
  /*
12
10
  * These are advanced settings, so write no documentation.
13
11
  */
14
- object AwsS3Configuration
15
- {
16
- trait Task
17
- {
12
+ object AwsS3Configuration {
18
13
 
19
- @Config("accelerate_mode_enabled")
20
- @ConfigDefault("null")
21
- def getAccelerateModeEnabled: Optional[Boolean]
14
+ trait Task {
22
15
 
23
- @Config("chunked_encoding_disabled")
24
- @ConfigDefault("null")
25
- def getChunkedEncodingDisabled: Optional[Boolean]
16
+ @Config("accelerate_mode_enabled")
17
+ @ConfigDefault("null")
18
+ def getAccelerateModeEnabled: Optional[Boolean]
26
19
 
27
- @Config("dualstack_enabled")
28
- @ConfigDefault("null")
29
- def getDualstackEnabled: Optional[Boolean]
20
+ @Config("chunked_encoding_disabled")
21
+ @ConfigDefault("null")
22
+ def getChunkedEncodingDisabled: Optional[Boolean]
30
23
 
31
- @Config("force_global_bucket_access_enabled")
32
- @ConfigDefault("null")
33
- def getForceGlobalBucketAccessEnabled: Optional[Boolean]
24
+ @Config("dualstack_enabled")
25
+ @ConfigDefault("null")
26
+ def getDualstackEnabled: Optional[Boolean]
34
27
 
35
- @Config("path_style_access_enabled")
36
- @ConfigDefault("null")
37
- def getPathStyleAccessEnabled: Optional[Boolean]
28
+ @Config("force_global_bucket_access_enabled")
29
+ @ConfigDefault("null")
30
+ def getForceGlobalBucketAccessEnabled: Optional[Boolean]
38
31
 
39
- @Config("payload_signing_enabled")
40
- @ConfigDefault("null")
41
- def getPayloadSigningEnabled: Optional[Boolean]
32
+ @Config("path_style_access_enabled")
33
+ @ConfigDefault("null")
34
+ def getPathStyleAccessEnabled: Optional[Boolean]
42
35
 
43
- }
36
+ @Config("payload_signing_enabled")
37
+ @ConfigDefault("null")
38
+ def getPayloadSigningEnabled: Optional[Boolean]
44
39
 
45
- def apply(task: Task): AwsS3Configuration =
46
- {
47
- new AwsS3Configuration(task)
48
- }
49
- }
40
+ }
50
41
 
51
- class AwsS3Configuration(task: Task)
52
- {
42
+ def apply(task: Task): AwsS3Configuration = {
43
+ new AwsS3Configuration(task)
44
+ }
45
+ }
53
46
 
54
- def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit =
55
- {
56
- task.getAccelerateModeEnabled.ifPresent(v => builder.setAccelerateModeEnabled(v))
57
- task.getChunkedEncodingDisabled.ifPresent(v => builder.setChunkedEncodingDisabled(v))
58
- task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
59
- task.getForceGlobalBucketAccessEnabled.ifPresent(v => builder.setForceGlobalBucketAccessEnabled(v))
60
- task.getPathStyleAccessEnabled.ifPresent(v => builder.setPathStyleAccessEnabled(v))
61
- task.getPayloadSigningEnabled.ifPresent(v => builder.setPayloadSigningEnabled(v))
62
- }
47
+ class AwsS3Configuration(task: Task) {
48
+
49
+ def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit = {
50
+ task.getAccelerateModeEnabled.ifPresent(v =>
51
+ builder.setAccelerateModeEnabled(v)
52
+ )
53
+ task.getChunkedEncodingDisabled.ifPresent(v =>
54
+ builder.setChunkedEncodingDisabled(v)
55
+ )
56
+ task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
57
+ task.getForceGlobalBucketAccessEnabled.ifPresent(v =>
58
+ builder.setForceGlobalBucketAccessEnabled(v)
59
+ )
60
+ task.getPathStyleAccessEnabled.ifPresent(v =>
61
+ builder.setPathStyleAccessEnabled(v)
62
+ )
63
+ task.getPayloadSigningEnabled.ifPresent(v =>
64
+ builder.setPayloadSigningEnabled(v)
65
+ )
66
+ }
63
67
 
64
68
  }
@@ -1,64 +1,61 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.{ClientConfiguration, Protocol}
7
6
  import org.embulk.config.{Config, ConfigDefault, ConfigException}
8
7
  import org.embulk.output.s3_parquet.aws.HttpProxy.Task
9
8
 
9
+ object HttpProxy {
10
10
 
11
- object HttpProxy
12
- {
13
-
14
- trait Task
15
- {
11
+ trait Task {
16
12
 
17
- @Config("host")
18
- @ConfigDefault("null")
19
- def getHost: Optional[String]
13
+ @Config("host")
14
+ @ConfigDefault("null")
15
+ def getHost: Optional[String]
20
16
 
21
- @Config("port")
22
- @ConfigDefault("null")
23
- def getPort: Optional[Int]
17
+ @Config("port")
18
+ @ConfigDefault("null")
19
+ def getPort: Optional[Int]
24
20
 
25
- @Config("protocol")
26
- @ConfigDefault("\"https\"")
27
- def getProtocol: String
21
+ @Config("protocol")
22
+ @ConfigDefault("\"https\"")
23
+ def getProtocol: String
28
24
 
29
- @Config("user")
30
- @ConfigDefault("null")
31
- def getUser: Optional[String]
25
+ @Config("user")
26
+ @ConfigDefault("null")
27
+ def getUser: Optional[String]
32
28
 
33
- @Config("password")
34
- @ConfigDefault("null")
35
- def getPassword: Optional[String]
29
+ @Config("password")
30
+ @ConfigDefault("null")
31
+ def getPassword: Optional[String]
36
32
 
37
- }
33
+ }
38
34
 
39
- def apply(task: Task): HttpProxy =
40
- {
41
- new HttpProxy(task)
42
- }
35
+ def apply(task: Task): HttpProxy = {
36
+ new HttpProxy(task)
37
+ }
43
38
 
44
39
  }
45
40
 
46
- class HttpProxy(task: Task)
47
- {
48
-
49
- def configureClientConfiguration(cc: ClientConfiguration): Unit =
50
- {
51
- task.getHost.ifPresent(v => cc.setProxyHost(v))
52
- task.getPort.ifPresent(v => cc.setProxyPort(v))
53
-
54
- Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
55
- case Some(v) =>
56
- cc.setProtocol(v)
57
- case None =>
58
- throw new ConfigException(s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values.map(v => s"'$v'").mkString(", ")}].")
59
- }
60
-
61
- task.getUser.ifPresent(v => cc.setProxyUsername(v))
62
- task.getPassword.ifPresent(v => cc.setProxyPassword(v))
41
+ class HttpProxy(task: Task) {
42
+
43
+ def configureClientConfiguration(cc: ClientConfiguration): Unit = {
44
+ task.getHost.ifPresent(v => cc.setProxyHost(v))
45
+ task.getPort.ifPresent(v => cc.setProxyPort(v))
46
+
47
+ Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
48
+ case Some(v) =>
49
+ cc.setProtocol(v)
50
+ case None =>
51
+ throw new ConfigException(
52
+ s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values
53
+ .map(v => s"'$v'")
54
+ .mkString(", ")}]."
55
+ )
63
56
  }
57
+
58
+ task.getUser.ifPresent(v => cc.setProxyUsername(v))
59
+ task.getPassword.ifPresent(v => cc.setProxyPassword(v))
60
+ }
64
61
  }
@@ -1,107 +1,153 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import com.google.common.collect.ImmutableList
5
- import org.apache.parquet.schema.{MessageType, OriginalType, PrimitiveType, Type}
4
+ import org.apache.parquet.schema.{
5
+ MessageType,
6
+ OriginalType,
7
+ PrimitiveType,
8
+ Type
9
+ }
6
10
  import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
7
11
  import org.embulk.spi.{Column, ColumnVisitor, Schema}
8
12
 
13
+ object EmbulkMessageType {
14
+
15
+ def builder(): Builder = {
16
+ Builder()
17
+ }
18
+
19
+ case class Builder(
20
+ name: String = "embulk",
21
+ schema: Schema = Schema.builder().build(),
22
+ logicalTypeHandlers: LogicalTypeHandlerStore =
23
+ LogicalTypeHandlerStore.empty
24
+ ) {
25
+
26
+ def withName(name: String): Builder = {
27
+ Builder(
28
+ name = name,
29
+ schema = schema,
30
+ logicalTypeHandlers = logicalTypeHandlers
31
+ )
32
+ }
33
+
34
+ def withSchema(schema: Schema): Builder = {
35
+ Builder(
36
+ name = name,
37
+ schema = schema,
38
+ logicalTypeHandlers = logicalTypeHandlers
39
+ )
40
+ }
41
+
42
+ def withLogicalTypeHandlers(
43
+ logicalTypeHandlers: LogicalTypeHandlerStore
44
+ ): Builder = {
45
+ Builder(
46
+ name = name,
47
+ schema = schema,
48
+ logicalTypeHandlers = logicalTypeHandlers
49
+ )
50
+ }
51
+
52
+ def build(): MessageType = {
53
+ val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
54
+ schema.visitColumns(
55
+ EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers)
56
+ )
57
+ new MessageType("embulk", builder.build())
58
+ }
59
+
60
+ }
61
+
62
+ private case class EmbulkMessageTypeColumnVisitor(
63
+ builder: ImmutableList.Builder[Type],
64
+ logicalTypeHandlers: LogicalTypeHandlerStore =
65
+ LogicalTypeHandlerStore.empty
66
+ ) extends ColumnVisitor {
67
+
68
+ override def booleanColumn(column: Column): Unit = {
69
+ builder.add(
70
+ new PrimitiveType(
71
+ Type.Repetition.OPTIONAL,
72
+ PrimitiveTypeName.BOOLEAN,
73
+ column.getName
74
+ )
75
+ )
76
+ }
9
77
 
10
- object EmbulkMessageType
11
- {
78
+ override def longColumn(column: Column): Unit = {
79
+ val name = column.getName
80
+ val et = column.getType
81
+
82
+ val t = logicalTypeHandlers.get(name, et) match {
83
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
84
+ case _ =>
85
+ new PrimitiveType(
86
+ Type.Repetition.OPTIONAL,
87
+ PrimitiveTypeName.INT64,
88
+ column.getName
89
+ )
90
+ }
91
+
92
+ builder.add(t)
93
+ }
12
94
 
13
- def builder(): Builder =
14
- {
15
- Builder()
95
+ override def doubleColumn(column: Column): Unit = {
96
+ builder.add(
97
+ new PrimitiveType(
98
+ Type.Repetition.OPTIONAL,
99
+ PrimitiveTypeName.DOUBLE,
100
+ column.getName
101
+ )
102
+ )
16
103
  }
17
104
 
18
- case class Builder(name: String = "embulk",
19
- schema: Schema = Schema.builder().build(),
20
- logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
21
- {
22
-
23
- def withName(name: String): Builder =
24
- {
25
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
26
- }
27
-
28
- def withSchema(schema: Schema): Builder =
29
- {
30
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
31
- }
32
-
33
- def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
34
- {
35
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
36
- }
37
-
38
- def build(): MessageType =
39
- {
40
- val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
41
- schema.visitColumns(EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers))
42
- new MessageType("embulk", builder.build())
43
- }
105
+ override def stringColumn(column: Column): Unit = {
106
+ builder.add(
107
+ new PrimitiveType(
108
+ Type.Repetition.OPTIONAL,
109
+ PrimitiveTypeName.BINARY,
110
+ column.getName,
111
+ OriginalType.UTF8
112
+ )
113
+ )
114
+ }
44
115
 
116
+ override def timestampColumn(column: Column): Unit = {
117
+ val name = column.getName
118
+ val et = column.getType
119
+
120
+ val t = logicalTypeHandlers.get(name, et) match {
121
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
122
+ case _ =>
123
+ new PrimitiveType(
124
+ Type.Repetition.OPTIONAL,
125
+ PrimitiveTypeName.BINARY,
126
+ name,
127
+ OriginalType.UTF8
128
+ )
129
+ }
130
+
131
+ builder.add(t)
45
132
  }
46
133
 
47
- private case class EmbulkMessageTypeColumnVisitor(builder: ImmutableList.Builder[Type],
48
- logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
49
- extends ColumnVisitor
50
- {
51
-
52
- override def booleanColumn(column: Column): Unit =
53
- {
54
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BOOLEAN, column.getName))
55
- }
56
-
57
- override def longColumn(column: Column): Unit =
58
- {
59
- val name = column.getName
60
- val et = column.getType
61
-
62
- val t = logicalTypeHandlers.get(name, et) match {
63
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
64
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, column.getName)
65
- }
66
-
67
- builder.add(t)
68
- }
69
-
70
- override def doubleColumn(column: Column): Unit =
71
- {
72
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, column.getName))
73
- }
74
-
75
- override def stringColumn(column: Column): Unit =
76
- {
77
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName, OriginalType.UTF8))
78
- }
79
-
80
- override def timestampColumn(column: Column): Unit =
81
- {
82
- val name = column.getName
83
- val et = column.getType
84
-
85
- val t = logicalTypeHandlers.get(name, et) match {
86
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
87
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
88
- }
89
-
90
- builder.add(t)
91
- }
92
-
93
- override def jsonColumn(column: Column): Unit =
94
- {
95
- val name = column.getName
96
- val et = column.getType
97
-
98
- val t = logicalTypeHandlers.get(name, et) match {
99
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
100
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
101
- }
102
-
103
- builder.add(t)
104
- }
134
+ override def jsonColumn(column: Column): Unit = {
135
+ val name = column.getName
136
+ val et = column.getType
137
+
138
+ val t = logicalTypeHandlers.get(name, et) match {
139
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
140
+ case _ =>
141
+ new PrimitiveType(
142
+ Type.Repetition.OPTIONAL,
143
+ PrimitiveTypeName.BINARY,
144
+ name,
145
+ OriginalType.UTF8
146
+ )
147
+ }
148
+
149
+ builder.add(t)
105
150
  }
151
+ }
106
152
 
107
- }
153
+ }