embulk-output-s3_parquet 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala
@@ -0,0 +1,18 @@
+package org.embulk.output.s3_parquet
+
+// WARNING: This object should be used for limited purposes only.
+object ContextClassLoaderSwapper {
+
+  def using[A](klass: Class[_])(f: => A): A = {
+    val currentTread = Thread.currentThread()
+    val original = currentTread.getContextClassLoader
+    val target = klass.getClassLoader
+    currentTread.setContextClassLoader(target)
+    try f
+    finally currentTread.setContextClassLoader(original)
+  }
+
+  def usingPluginClass[A](f: => A): A = {
+    using(classOf[S3ParquetOutputPlugin])(f)
+  }
+}
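The new helper swaps the current thread's context class loader to the loader of a given class and restores the original in a `finally`, so the swap survives exceptions. `open` in the next file wraps the Parquet writer construction in `usingPluginClass`, presumably because the Hadoop classes behind `ParquetWriter` resolve implementations through the thread context class loader. A minimal usage sketch, not part of the release; only `ContextClassLoaderSwapper` and `S3ParquetOutputPlugin` come from the plugin:

package org.embulk.output.s3_parquet

object ContextClassLoaderSwapperDemo {

  def main(args: Array[String]): Unit = {
    val original = Thread.currentThread().getContextClassLoader

    ContextClassLoaderSwapper.using(classOf[S3ParquetOutputPlugin]) {
      // Inside the block, the context class loader is the plugin's loader.
      val inside = Thread.currentThread().getContextClassLoader
      assert(inside eq classOf[S3ParquetOutputPlugin].getClassLoader)
    }

    // After the block returns (or throws), the original loader is restored.
    assert(Thread.currentThread().getContextClassLoader eq original)
  }
}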
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
@@ -1,18 +1,44 @@
 package org.embulk.output.s3_parquet
 
-
 import java.nio.file.{Files, Paths}
-import java.util.{
+import java.util.{
+  IllegalFormatException,
+  Locale,
+  Optional,
+  List => JList,
+  Map => JMap
+}
 
 import com.amazonaws.services.s3.model.CannedAccessControlList
 import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
-import org.embulk.config.{
-
+import org.embulk.config.{
+  Config,
+  ConfigDefault,
+  ConfigDiff,
+  ConfigException,
+  ConfigSource,
+  Task,
+  TaskReport,
+  TaskSource
+}
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  PluginTask
+}
 import org.embulk.output.s3_parquet.aws.Aws
-import org.embulk.output.s3_parquet.parquet.{
-
+import org.embulk.output.s3_parquet.parquet.{
+  LogicalTypeHandlerStore,
+  ParquetFileWriter
+}
+import org.embulk.spi.{
+  Exec,
+  OutputPlugin,
+  PageReader,
+  Schema,
+  TransactionalPageOutput
+}
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
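One pattern worth noting before the next hunk: the reformatted `transaction` method builds its `parquetColumnLogicalTypes` map with `Map.newBuilder[...].pipe { builder => ... }`, where `pipe` (from `scala.util.chaining`, imported in the context lines below) applies the block to its receiver and returns the block's result. A standalone sketch of that pattern; the column name and logical type are made-up examples:

import scala.util.chaining._

object PipeBuilderSketch {

  def main(args: Array[String]): Unit = {
    // pipe hands the builder to the block; the whole expression
    // evaluates to the block's result, i.e. the finished Map.
    val logicalTypes: Map[String, String] =
      Map.newBuilder[String, String].pipe { builder =>
        builder.addOne("registered_at" -> "timestamp-millis") // hypothetical
        builder.result()
      }

    println(logicalTypes) // Map(registered_at -> timestamp-millis)
  }
}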
@@ -21,239 +47,302 @@ import org.slf4j.{Logger, LoggerFactory}
 import scala.jdk.CollectionConverters._
 import scala.util.chaining._
 
+object S3ParquetOutputPlugin {
 
-
-{
+  trait PluginTask extends Task with TimestampFormatter.Task with Aws.Task {
 
-
-
-      with TimestampFormatter.Task
-      with Aws.Task
-  {
+    @Config("bucket")
+    def getBucket: String
 
-
-
+    @Config("path_prefix")
+    @ConfigDefault("\"\"")
+    def getPathPrefix: String
 
-
-
-
+    @Config("sequence_format")
+    @ConfigDefault("\"%03d.%02d.\"")
+    def getSequenceFormat: String
 
-
-
-
+    @Config("file_ext")
+    @ConfigDefault("\"parquet\"")
+    def getFileExt: String
 
-
-
-
+    @Config("compression_codec")
+    @ConfigDefault("\"uncompressed\"")
+    def getCompressionCodecString: String
 
-
-    @ConfigDefault("\"uncompressed\"")
-    def getCompressionCodecString: String
+    def setCompressionCodec(v: CompressionCodecName): Unit
 
-
+    def getCompressionCodec: CompressionCodecName
 
-
+    @Config("column_options")
+    @ConfigDefault("{}")
+    def getColumnOptions: JMap[String, ColumnOptionTask]
 
-
-
-
+    @Config("canned_acl")
+    @ConfigDefault("\"private\"")
+    def getCannedAclString: String
 
-
-    @ConfigDefault("\"private\"")
-    def getCannedAclString: String
+    def setCannedAcl(v: CannedAccessControlList): Unit
 
-
+    def getCannedAcl: CannedAccessControlList
 
-
+    @Config("block_size")
+    @ConfigDefault("null")
+    def getBlockSize: Optional[Int]
 
-
-
-
+    @Config("page_size")
+    @ConfigDefault("null")
+    def getPageSize: Optional[Int]
 
-
-
-
+    @Config("max_padding_size")
+    @ConfigDefault("null")
+    def getMaxPaddingSize: Optional[Int]
 
-
-
-
+    @Config("enable_dictionary_encoding")
+    @ConfigDefault("null")
+    def getEnableDictionaryEncoding: Optional[Boolean]
 
-
-
-
+    @Config("buffer_dir")
+    @ConfigDefault("null")
+    def getBufferDir: Optional[String]
 
-
-
-
+    @Config("catalog")
+    @ConfigDefault("null")
+    def getCatalog: Optional[CatalogRegistrator.Task]
 
-
-
-
+    @Config("type_options")
+    @ConfigDefault("{}")
+    def getTypeOptions: JMap[String, TypeOptionTask]
+  }
 
-
-
-
-
+  trait ColumnOptionTask
+      extends Task
+      with TimestampColumnOption
+      with LogicalTypeOption
 
-
-      extends Task with TimestampColumnOption with LogicalTypeOption
+  trait TypeOptionTask extends Task with LogicalTypeOption
 
-
-      extends Task with LogicalTypeOption
+  trait LogicalTypeOption {
 
-
-
-
-    def getLogicalType: Optional[String]
-  }
+    @Config("logical_type")
+    def getLogicalType: Optional[String]
+  }
 }
 
-class S3ParquetOutputPlugin
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        val cOptions = task.getColumnOptions.asScala
-        val tOptions = task.getTypeOptions.asScala
-        schema.getColumns.asScala.foreach {c =>
-          cOptions.get(c.getName)
-          if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
-            builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
-          }
-          else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
-            builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
-          }
-        }
-        builder.result()
+class S3ParquetOutputPlugin extends OutputPlugin {
+
+  val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+  override def transaction(
+      config: ConfigSource,
+      schema: Schema,
+      taskCount: Int,
+      control: OutputPlugin.Control
+  ): ConfigDiff = {
+    val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+    configure(task, schema)
+    control.run(task.dump)
+
+    task.getCatalog.ifPresent { catalog =>
+      val location =
+        s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+      val parquetColumnLogicalTypes: Map[String, String] =
+        Map.newBuilder[String, String].pipe { builder =>
+          val cOptions = task.getColumnOptions.asScala
+          val tOptions = task.getTypeOptions.asScala
+          schema.getColumns.asScala.foreach { c =>
+            cOptions.get(c.getName)
+            if (cOptions
+                  .contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+              builder
+                .addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
             }
-
-
-
-
-
-
-      cr.run()
-    }
-
-    Exec.newConfigDiff
-  }
-
-  private def configure(task: PluginTask,
-                        schema: Schema): Unit =
-  {
-    // sequence_format
-    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
-    catch {
-      case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
-    }
-
-    // compression_codec
-    CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
-      case Some(v) => task.setCompressionCodec(v)
-      case None =>
-        val unsupported: String = task.getCompressionCodecString
-        val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
-        throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
-    }
-
-    // column_options
-    task.getColumnOptions.forEach { (k: String,
-                                     opt: ColumnOptionTask) =>
-      val c = schema.lookupColumn(k)
-      val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
-      if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
-        throw new ConfigException(s"column:$k is not 'timestamp' type.")
+            else if (tOptions.contains(c.getType.getName) && tOptions(
+                       c.getType.getName
+                     ).getLogicalType.isPresent) {
+              builder.addOne(
+                c.getName -> tOptions(c.getType.getName).getLogicalType.get()
+              )
             }
+          }
+          builder.result()
         }
+      val cr = CatalogRegistrator(
+        aws = Aws(task),
+        task = catalog,
+        schema = schema,
+        location = location,
+        compressionCodec = task.getCompressionCodec,
+        parquetColumnLogicalTypes = parquetColumnLogicalTypes
+      )
+      cr.run()
+    }
 
-
-
-
-
-
-
-
-
+    Exec.newConfigDiff
+  }
+
+  private def configure(task: PluginTask, schema: Schema): Unit = {
+    // sequence_format
+    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+    catch {
+      case e: IllegalFormatException =>
+        throw new ConfigException(
+          s"Invalid sequence_format: ${task.getSequenceFormat}",
+          e
+        )
     }
 
-
-
-
-
-
-
+    // compression_codec
+    CompressionCodecName
+      .values()
+      .find(v =>
+        v.name()
+          .toLowerCase(Locale.ENGLISH)
+          .equals(task.getCompressionCodecString)
+      ) match {
+      case Some(v) => task.setCompressionCodec(v)
+      case None =>
+        val unsupported: String = task.getCompressionCodecString
+        val supported: String = CompressionCodecName
+          .values()
+          .map(v => s"'${v.name().toLowerCase}'")
+          .mkString(", ")
+        throw new ConfigException(
+          s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]."
+        )
     }
 
-
-
-
-
-
-
-
-        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
-          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
-          + s"etag: ${tr.get(classOf[String], "etag", null)}")
-    }
+    // column_options
+    task.getColumnOptions.forEach { (k: String, opt: ColumnOptionTask) =>
+      val c = schema.lookupColumn(k)
+      val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+      if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+        throw new ConfigException(s"column:$k is not 'timestamp' type.")
+      }
     }
 
-
-
-
-
-
-
-      val
-      val
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    // canned_acl
+    CannedAccessControlList
+      .values()
+      .find(v => v.toString.equals(task.getCannedAclString)) match {
+      case Some(v) => task.setCannedAcl(v)
+      case None =>
+        val unsupported: String = task.getCannedAclString
+        val supported: String = CannedAccessControlList
+          .values()
+          .map(v => s"'${v.toString}'")
+          .mkString(", ")
+        throw new ConfigException(
+          s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]."
+        )
+    }
+  }
+
+  override def resume(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskCount: Int,
+      control: OutputPlugin.Control
+  ): ConfigDiff = {
+    throw new UnsupportedOperationException(
+      "s3_parquet output plugin does not support resuming"
+    )
+  }
+
+  override def cleanup(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskCount: Int,
+      successTaskReports: JList[TaskReport]
+  ): Unit = {
+    successTaskReports.forEach { tr =>
+      logger.info(
+        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+          + s"etag: ${tr.get(classOf[String], "etag", null)}"
+      )
     }
+  }
+
+  override def open(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskIndex: Int
+  ): TransactionalPageOutput = {
+    val task = taskSource.loadTask(classOf[PluginTask])
+    val bufferDir: String = task.getBufferDir.orElse(
+      Files.createTempDirectory("embulk-output-s3_parquet-").toString
+    )
+    val bufferFile: String = Paths
+      .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet")
+      .toString
+    val destS3bucket: String = task.getBucket
+    val destS3Key: String = task.getPathPrefix + String.format(
+      task.getSequenceFormat,
+      taskIndex: Integer,
+      0: Integer
+    ) + task.getFileExt
+
+    val pageReader: PageReader = new PageReader(schema)
+    val aws: Aws = Aws(task)
+    val timestampFormatters: Seq[TimestampFormatter] = Timestamps
+      .newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+      .toSeq
+    val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(
+      task.getTypeOptions,
+      task.getColumnOptions
+    )
+    val parquetWriter: ParquetWriter[PageReader] =
+      ContextClassLoaderSwapper.usingPluginClass {
+        ParquetFileWriter
+          .builder()
+          .withPath(bufferFile)
+          .withSchema(schema)
+          .withLogicalTypeHandlers(logicalTypeHandlers)
+          .withTimestampFormatters(timestampFormatters)
+          .withCompressionCodec(task.getCompressionCodec)
+          .withDictionaryEncoding(
+            task.getEnableDictionaryEncoding.orElse(
+              ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED
+            )
+          )
+          .withDictionaryPageSize(
+            task.getPageSize.orElse(
+              ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE
+            )
+          )
+          .withMaxPaddingSize(
+            task.getMaxPaddingSize.orElse(
+              ParquetWriter.MAX_PADDING_SIZE_DEFAULT
+            )
+          )
+          .withPageSize(
+            task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE)
+          )
+          .withRowGroupSize(
+            task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE)
+          )
+          .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+          .withWriteMode(
+            org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE
+          )
+          .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+          .build()
+      }
+
+    logger.info(
+      s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key"
+    )
+
+    S3ParquetPageOutput(
+      bufferFile,
+      pageReader,
+      parquetWriter,
+      aws,
+      destS3bucket,
+      destS3Key
+    )
+  }
 
 }
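For reference, the destination object key assembled in `open` is simply `path_prefix` + `sequence_format` applied to the task index and a file index + `file_ext`. A small sketch of what the defaults declared in `PluginTask` produce; the prefix and bucket name are made up:

object DestKeySketch {

  def main(args: Array[String]): Unit = {
    val pathPrefix = "logs/out."       // hypothetical user setting ("" by default)
    val sequenceFormat = "%03d.%02d."  // default from @ConfigDefault
    val fileExt = "parquet"            // default from @ConfigDefault
    val taskIndex = 7

    // Same expression shape as open(): prefix + formatted sequence + extension.
    val key = pathPrefix +
      String.format(sequenceFormat, taskIndex: Integer, 0: Integer) +
      fileExt

    println(s"s3://my-bucket/$key") // s3://my-bucket/logs/out.007.00.parquet
  }
}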