embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
@@ -1,6 +1,5 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.client.builder.AwsClientBuilder
@@ -11,47 +10,45 @@ import org.embulk.output.s3_parquet.aws.AwsEndpointConfiguration.Task
11
10
 
12
11
  import scala.util.Try
13
12
 
13
+ object AwsEndpointConfiguration {
14
14
 
15
- object AwsEndpointConfiguration
16
- {
17
-
18
- trait Task
19
- {
15
+ trait Task {
20
16
 
21
- @Config("endpoint")
22
- @ConfigDefault("null")
23
- def getEndpoint: Optional[String]
17
+ @Config("endpoint")
18
+ @ConfigDefault("null")
19
+ def getEndpoint: Optional[String]
24
20
 
25
- @Config("region")
26
- @ConfigDefault("null")
27
- def getRegion: Optional[String]
21
+ @Config("region")
22
+ @ConfigDefault("null")
23
+ def getRegion: Optional[String]
28
24
 
29
- }
25
+ }
30
26
 
31
- def apply(task: Task): AwsEndpointConfiguration =
32
- {
33
- new AwsEndpointConfiguration(task)
34
- }
27
+ def apply(task: Task): AwsEndpointConfiguration = {
28
+ new AwsEndpointConfiguration(task)
29
+ }
35
30
  }
36
31
 
37
- class AwsEndpointConfiguration(task: Task)
38
- {
39
-
40
- def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](builder: AwsClientBuilder[S, T]): Unit =
41
- {
42
- if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
43
- val ec = new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
44
- builder.setEndpointConfiguration(ec)
45
- }
46
- else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
47
- builder.setRegion(task.getRegion.get)
48
- }
49
- else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
50
- val r: String = Try(new DefaultAwsRegionProviderChain().getRegion).getOrElse(Regions.DEFAULT_REGION.getName)
51
- val e: String = task.getEndpoint.get
52
- val ec = new EndpointConfiguration(e, r)
53
- builder.setEndpointConfiguration(ec)
54
- }
32
+ class AwsEndpointConfiguration(task: Task) {
33
+
34
+ def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T](
35
+ builder: AwsClientBuilder[S, T]
36
+ ): Unit = {
37
+ if (task.getRegion.isPresent && task.getEndpoint.isPresent) {
38
+ val ec =
39
+ new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get)
40
+ builder.setEndpointConfiguration(ec)
41
+ }
42
+ else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) {
43
+ builder.setRegion(task.getRegion.get)
44
+ }
45
+ else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) {
46
+ val r: String = Try(new DefaultAwsRegionProviderChain().getRegion)
47
+ .getOrElse(Regions.DEFAULT_REGION.getName)
48
+ val e: String = task.getEndpoint.get
49
+ val ec = new EndpointConfiguration(e, r)
50
+ builder.setEndpointConfiguration(ec)
55
51
  }
52
+ }
56
53
 
57
54
  }
@@ -1,64 +1,68 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.services.s3.AmazonS3ClientBuilder
7
6
  import org.embulk.config.{Config, ConfigDefault}
8
7
  import org.embulk.output.s3_parquet.aws.AwsS3Configuration.Task
9
8
 
10
-
11
9
  /*
12
10
  * These are advanced settings, so write no documentation.
13
11
  */
14
- object AwsS3Configuration
15
- {
16
- trait Task
17
- {
12
+ object AwsS3Configuration {
18
13
 
19
- @Config("accelerate_mode_enabled")
20
- @ConfigDefault("null")
21
- def getAccelerateModeEnabled: Optional[Boolean]
14
+ trait Task {
22
15
 
23
- @Config("chunked_encoding_disabled")
24
- @ConfigDefault("null")
25
- def getChunkedEncodingDisabled: Optional[Boolean]
16
+ @Config("accelerate_mode_enabled")
17
+ @ConfigDefault("null")
18
+ def getAccelerateModeEnabled: Optional[Boolean]
26
19
 
27
- @Config("dualstack_enabled")
28
- @ConfigDefault("null")
29
- def getDualstackEnabled: Optional[Boolean]
20
+ @Config("chunked_encoding_disabled")
21
+ @ConfigDefault("null")
22
+ def getChunkedEncodingDisabled: Optional[Boolean]
30
23
 
31
- @Config("force_global_bucket_access_enabled")
32
- @ConfigDefault("null")
33
- def getForceGlobalBucketAccessEnabled: Optional[Boolean]
24
+ @Config("dualstack_enabled")
25
+ @ConfigDefault("null")
26
+ def getDualstackEnabled: Optional[Boolean]
34
27
 
35
- @Config("path_style_access_enabled")
36
- @ConfigDefault("null")
37
- def getPathStyleAccessEnabled: Optional[Boolean]
28
+ @Config("force_global_bucket_access_enabled")
29
+ @ConfigDefault("null")
30
+ def getForceGlobalBucketAccessEnabled: Optional[Boolean]
38
31
 
39
- @Config("payload_signing_enabled")
40
- @ConfigDefault("null")
41
- def getPayloadSigningEnabled: Optional[Boolean]
32
+ @Config("path_style_access_enabled")
33
+ @ConfigDefault("null")
34
+ def getPathStyleAccessEnabled: Optional[Boolean]
42
35
 
43
- }
36
+ @Config("payload_signing_enabled")
37
+ @ConfigDefault("null")
38
+ def getPayloadSigningEnabled: Optional[Boolean]
44
39
 
45
- def apply(task: Task): AwsS3Configuration =
46
- {
47
- new AwsS3Configuration(task)
48
- }
49
- }
40
+ }
50
41
 
51
- class AwsS3Configuration(task: Task)
52
- {
42
+ def apply(task: Task): AwsS3Configuration = {
43
+ new AwsS3Configuration(task)
44
+ }
45
+ }
53
46
 
54
- def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit =
55
- {
56
- task.getAccelerateModeEnabled.ifPresent(v => builder.setAccelerateModeEnabled(v))
57
- task.getChunkedEncodingDisabled.ifPresent(v => builder.setChunkedEncodingDisabled(v))
58
- task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
59
- task.getForceGlobalBucketAccessEnabled.ifPresent(v => builder.setForceGlobalBucketAccessEnabled(v))
60
- task.getPathStyleAccessEnabled.ifPresent(v => builder.setPathStyleAccessEnabled(v))
61
- task.getPayloadSigningEnabled.ifPresent(v => builder.setPayloadSigningEnabled(v))
62
- }
47
+ class AwsS3Configuration(task: Task) {
48
+
49
+ def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit = {
50
+ task.getAccelerateModeEnabled.ifPresent(v =>
51
+ builder.setAccelerateModeEnabled(v)
52
+ )
53
+ task.getChunkedEncodingDisabled.ifPresent(v =>
54
+ builder.setChunkedEncodingDisabled(v)
55
+ )
56
+ task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v))
57
+ task.getForceGlobalBucketAccessEnabled.ifPresent(v =>
58
+ builder.setForceGlobalBucketAccessEnabled(v)
59
+ )
60
+ task.getPathStyleAccessEnabled.ifPresent(v =>
61
+ builder.setPathStyleAccessEnabled(v)
62
+ )
63
+ task.getPayloadSigningEnabled.ifPresent(v =>
64
+ builder.setPayloadSigningEnabled(v)
65
+ )
66
+ }
63
67
 
64
68
  }
@@ -1,64 +1,61 @@
1
1
  package org.embulk.output.s3_parquet.aws
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.amazonaws.{ClientConfiguration, Protocol}
7
6
  import org.embulk.config.{Config, ConfigDefault, ConfigException}
8
7
  import org.embulk.output.s3_parquet.aws.HttpProxy.Task
9
8
 
9
+ object HttpProxy {
10
10
 
11
- object HttpProxy
12
- {
13
-
14
- trait Task
15
- {
11
+ trait Task {
16
12
 
17
- @Config("host")
18
- @ConfigDefault("null")
19
- def getHost: Optional[String]
13
+ @Config("host")
14
+ @ConfigDefault("null")
15
+ def getHost: Optional[String]
20
16
 
21
- @Config("port")
22
- @ConfigDefault("null")
23
- def getPort: Optional[Int]
17
+ @Config("port")
18
+ @ConfigDefault("null")
19
+ def getPort: Optional[Int]
24
20
 
25
- @Config("protocol")
26
- @ConfigDefault("\"https\"")
27
- def getProtocol: String
21
+ @Config("protocol")
22
+ @ConfigDefault("\"https\"")
23
+ def getProtocol: String
28
24
 
29
- @Config("user")
30
- @ConfigDefault("null")
31
- def getUser: Optional[String]
25
+ @Config("user")
26
+ @ConfigDefault("null")
27
+ def getUser: Optional[String]
32
28
 
33
- @Config("password")
34
- @ConfigDefault("null")
35
- def getPassword: Optional[String]
29
+ @Config("password")
30
+ @ConfigDefault("null")
31
+ def getPassword: Optional[String]
36
32
 
37
- }
33
+ }
38
34
 
39
- def apply(task: Task): HttpProxy =
40
- {
41
- new HttpProxy(task)
42
- }
35
+ def apply(task: Task): HttpProxy = {
36
+ new HttpProxy(task)
37
+ }
43
38
 
44
39
  }
45
40
 
46
- class HttpProxy(task: Task)
47
- {
48
-
49
- def configureClientConfiguration(cc: ClientConfiguration): Unit =
50
- {
51
- task.getHost.ifPresent(v => cc.setProxyHost(v))
52
- task.getPort.ifPresent(v => cc.setProxyPort(v))
53
-
54
- Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
55
- case Some(v) =>
56
- cc.setProtocol(v)
57
- case None =>
58
- throw new ConfigException(s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values.map(v => s"'$v'").mkString(", ")}].")
59
- }
60
-
61
- task.getUser.ifPresent(v => cc.setProxyUsername(v))
62
- task.getPassword.ifPresent(v => cc.setProxyPassword(v))
41
+ class HttpProxy(task: Task) {
42
+
43
+ def configureClientConfiguration(cc: ClientConfiguration): Unit = {
44
+ task.getHost.ifPresent(v => cc.setProxyHost(v))
45
+ task.getPort.ifPresent(v => cc.setProxyPort(v))
46
+
47
+ Protocol.values.find(p => p.name().equals(task.getProtocol)) match {
48
+ case Some(v) =>
49
+ cc.setProtocol(v)
50
+ case None =>
51
+ throw new ConfigException(
52
+ s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values
53
+ .map(v => s"'$v'")
54
+ .mkString(", ")}]."
55
+ )
63
56
  }
57
+
58
+ task.getUser.ifPresent(v => cc.setProxyUsername(v))
59
+ task.getPassword.ifPresent(v => cc.setProxyPassword(v))
60
+ }
64
61
  }
@@ -1,107 +1,153 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import com.google.common.collect.ImmutableList
5
- import org.apache.parquet.schema.{MessageType, OriginalType, PrimitiveType, Type}
4
+ import org.apache.parquet.schema.{
5
+ MessageType,
6
+ OriginalType,
7
+ PrimitiveType,
8
+ Type
9
+ }
6
10
  import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
7
11
  import org.embulk.spi.{Column, ColumnVisitor, Schema}
8
12
 
13
+ object EmbulkMessageType {
14
+
15
+ def builder(): Builder = {
16
+ Builder()
17
+ }
18
+
19
+ case class Builder(
20
+ name: String = "embulk",
21
+ schema: Schema = Schema.builder().build(),
22
+ logicalTypeHandlers: LogicalTypeHandlerStore =
23
+ LogicalTypeHandlerStore.empty
24
+ ) {
25
+
26
+ def withName(name: String): Builder = {
27
+ Builder(
28
+ name = name,
29
+ schema = schema,
30
+ logicalTypeHandlers = logicalTypeHandlers
31
+ )
32
+ }
33
+
34
+ def withSchema(schema: Schema): Builder = {
35
+ Builder(
36
+ name = name,
37
+ schema = schema,
38
+ logicalTypeHandlers = logicalTypeHandlers
39
+ )
40
+ }
41
+
42
+ def withLogicalTypeHandlers(
43
+ logicalTypeHandlers: LogicalTypeHandlerStore
44
+ ): Builder = {
45
+ Builder(
46
+ name = name,
47
+ schema = schema,
48
+ logicalTypeHandlers = logicalTypeHandlers
49
+ )
50
+ }
51
+
52
+ def build(): MessageType = {
53
+ val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
54
+ schema.visitColumns(
55
+ EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers)
56
+ )
57
+ new MessageType("embulk", builder.build())
58
+ }
59
+
60
+ }
61
+
62
+ private case class EmbulkMessageTypeColumnVisitor(
63
+ builder: ImmutableList.Builder[Type],
64
+ logicalTypeHandlers: LogicalTypeHandlerStore =
65
+ LogicalTypeHandlerStore.empty
66
+ ) extends ColumnVisitor {
67
+
68
+ override def booleanColumn(column: Column): Unit = {
69
+ builder.add(
70
+ new PrimitiveType(
71
+ Type.Repetition.OPTIONAL,
72
+ PrimitiveTypeName.BOOLEAN,
73
+ column.getName
74
+ )
75
+ )
76
+ }
9
77
 
10
- object EmbulkMessageType
11
- {
78
+ override def longColumn(column: Column): Unit = {
79
+ val name = column.getName
80
+ val et = column.getType
81
+
82
+ val t = logicalTypeHandlers.get(name, et) match {
83
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
84
+ case _ =>
85
+ new PrimitiveType(
86
+ Type.Repetition.OPTIONAL,
87
+ PrimitiveTypeName.INT64,
88
+ column.getName
89
+ )
90
+ }
91
+
92
+ builder.add(t)
93
+ }
12
94
 
13
- def builder(): Builder =
14
- {
15
- Builder()
95
+ override def doubleColumn(column: Column): Unit = {
96
+ builder.add(
97
+ new PrimitiveType(
98
+ Type.Repetition.OPTIONAL,
99
+ PrimitiveTypeName.DOUBLE,
100
+ column.getName
101
+ )
102
+ )
16
103
  }
17
104
 
18
- case class Builder(name: String = "embulk",
19
- schema: Schema = Schema.builder().build(),
20
- logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
21
- {
22
-
23
- def withName(name: String): Builder =
24
- {
25
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
26
- }
27
-
28
- def withSchema(schema: Schema): Builder =
29
- {
30
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
31
- }
32
-
33
- def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
34
- {
35
- Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
36
- }
37
-
38
- def build(): MessageType =
39
- {
40
- val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
41
- schema.visitColumns(EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers))
42
- new MessageType("embulk", builder.build())
43
- }
105
+ override def stringColumn(column: Column): Unit = {
106
+ builder.add(
107
+ new PrimitiveType(
108
+ Type.Repetition.OPTIONAL,
109
+ PrimitiveTypeName.BINARY,
110
+ column.getName,
111
+ OriginalType.UTF8
112
+ )
113
+ )
114
+ }
44
115
 
116
+ override def timestampColumn(column: Column): Unit = {
117
+ val name = column.getName
118
+ val et = column.getType
119
+
120
+ val t = logicalTypeHandlers.get(name, et) match {
121
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
122
+ case _ =>
123
+ new PrimitiveType(
124
+ Type.Repetition.OPTIONAL,
125
+ PrimitiveTypeName.BINARY,
126
+ name,
127
+ OriginalType.UTF8
128
+ )
129
+ }
130
+
131
+ builder.add(t)
45
132
  }
46
133
 
47
- private case class EmbulkMessageTypeColumnVisitor(builder: ImmutableList.Builder[Type],
48
- logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
49
- extends ColumnVisitor
50
- {
51
-
52
- override def booleanColumn(column: Column): Unit =
53
- {
54
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BOOLEAN, column.getName))
55
- }
56
-
57
- override def longColumn(column: Column): Unit =
58
- {
59
- val name = column.getName
60
- val et = column.getType
61
-
62
- val t = logicalTypeHandlers.get(name, et) match {
63
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
64
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, column.getName)
65
- }
66
-
67
- builder.add(t)
68
- }
69
-
70
- override def doubleColumn(column: Column): Unit =
71
- {
72
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, column.getName))
73
- }
74
-
75
- override def stringColumn(column: Column): Unit =
76
- {
77
- builder.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName, OriginalType.UTF8))
78
- }
79
-
80
- override def timestampColumn(column: Column): Unit =
81
- {
82
- val name = column.getName
83
- val et = column.getType
84
-
85
- val t = logicalTypeHandlers.get(name, et) match {
86
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
87
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
88
- }
89
-
90
- builder.add(t)
91
- }
92
-
93
- override def jsonColumn(column: Column): Unit =
94
- {
95
- val name = column.getName
96
- val et = column.getType
97
-
98
- val t = logicalTypeHandlers.get(name, et) match {
99
- case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
100
- case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
101
- }
102
-
103
- builder.add(t)
104
- }
134
+ override def jsonColumn(column: Column): Unit = {
135
+ val name = column.getName
136
+ val et = column.getType
137
+
138
+ val t = logicalTypeHandlers.get(name, et) match {
139
+ case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
140
+ case _ =>
141
+ new PrimitiveType(
142
+ Type.Repetition.OPTIONAL,
143
+ PrimitiveTypeName.BINARY,
144
+ name,
145
+ OriginalType.UTF8
146
+ )
147
+ }
148
+
149
+ builder.add(t)
105
150
  }
151
+ }
106
152
 
107
- }
153
+ }