embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala

```diff
@@ -0,0 +1,18 @@
+package org.embulk.output.s3_parquet
+
+// WARNING: This object should be used for limited purposes only.
+object ContextClassLoaderSwapper {
+
+  def using[A](klass: Class[_])(f: => A): A = {
+    val currentTread = Thread.currentThread()
+    val original = currentTread.getContextClassLoader
+    val target = klass.getClassLoader
+    currentTread.setContextClassLoader(target)
+    try f
+    finally currentTread.setContextClassLoader(original)
+  }
+
+  def usingPluginClass[A](f: => A): A = {
+    using(classOf[S3ParquetOutputPlugin])(f)
+  }
+}
```
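The plugin's new `open` method (in the S3ParquetOutputPlugin.scala diff below) wraps the Parquet writer construction in `ContextClassLoaderSwapper.usingPluginClass { ... }`, so classloader-sensitive lookups inside the Hadoop/Parquet stack resolve against the plugin's classloader rather than the caller's. A minimal usage sketch, assuming only the object defined in the diff above; `SwapperExample` and `contextLoaderName` are illustrative names, not part of the plugin:

```scala
package org.embulk.output.s3_parquet

object SwapperExample {

  def main(args: Array[String]): Unit = {
    // Stand-in for classloader-sensitive work: anything that resolves classes or
    // resources via Thread.currentThread().getContextClassLoader.
    def contextLoaderName(): String =
      Thread.currentThread().getContextClassLoader.toString

    val before = contextLoaderName()
    // Inside the block the context classloader is the one that loaded SwapperExample.
    val inside = ContextClassLoaderSwapper.using(SwapperExample.getClass) {
      contextLoaderName()
    }
    val after = contextLoaderName()
    println(s"before=$before, inside=$inside, after=$after")
  }
}
```

Because the swap is undone in a `finally` block, the original context classloader is restored even if the wrapped block throws.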
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala (removed 0.1.0 content that was not captured is marked with …)

```diff
@@ -1,18 +1,44 @@
 package org.embulk.output.s3_parquet
 
- …
 import java.nio.file.{Files, Paths}
-import java.util.{ …
+import java.util.{
+  IllegalFormatException,
+  Locale,
+  Optional,
+  List => JList,
+  Map => JMap
+}
 
 import com.amazonaws.services.s3.model.CannedAccessControlList
 import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
-import org.embulk.config.{
- …
+import org.embulk.config.{
+  Config,
+  ConfigDefault,
+  ConfigDiff,
+  ConfigException,
+  ConfigSource,
+  Task,
+  TaskReport,
+  TaskSource
+}
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  PluginTask
+}
 import org.embulk.output.s3_parquet.aws.Aws
-import org.embulk.output.s3_parquet.parquet.{
- …
+import org.embulk.output.s3_parquet.parquet.{
+  LogicalTypeHandlerStore,
+  ParquetFileWriter
+}
+import org.embulk.spi.{
+  Exec,
+  OutputPlugin,
+  PageReader,
+  Schema,
+  TransactionalPageOutput
+}
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
```
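The widened `java.util` import aliases `List` and `Map` as `JList` and `JMap` so they do not shadow Scala's own collection types: the Embulk task getters (for example `getColumnOptions: JMap[String, ColumnOptionTask]`) return Java collections, which the plugin bridges to Scala with `scala.jdk.CollectionConverters` where needed. A minimal sketch of that bridging pattern; `AliasExample` and the sample values are illustrative, not part of the plugin:

```scala
import java.util.{List => JList, Map => JMap}

import scala.jdk.CollectionConverters._

object AliasExample {

  def main(args: Array[String]): Unit = {
    // A Java map shaped like PluginTask#getColumnOptions (values simplified to strings).
    val columnOptions: JMap[String, String] =
      java.util.Collections.singletonMap("created_at", "timestamp-millis")

    // .asScala wraps the Java map in a Scala view, so it can be used idiomatically.
    columnOptions.asScala.foreach {
      case (column, logicalType) => println(s"$column -> $logicalType")
    }

    // The same bridge works for Java lists, e.g. cleanup()'s successTaskReports.
    val reports: JList[String] = java.util.Arrays.asList("report-0", "report-1")
    println(reports.asScala.mkString(", "))
  }
}
```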
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala (continued)

```diff
@@ -21,239 +47,302 @@ import org.slf4j.{Logger, LoggerFactory}
 import scala.jdk.CollectionConverters._
 import scala.util.chaining._
 
+object S3ParquetOutputPlugin {
 
- …
-{
+  trait PluginTask extends Task with TimestampFormatter.Task with Aws.Task {
 
- …
-    with TimestampFormatter.Task
-    with Aws.Task
-{
+    @Config("bucket")
+    def getBucket: String
 
- …
+    @Config("path_prefix")
+    @ConfigDefault("\"\"")
+    def getPathPrefix: String
 
- …
+    @Config("sequence_format")
+    @ConfigDefault("\"%03d.%02d.\"")
+    def getSequenceFormat: String
 
- …
+    @Config("file_ext")
+    @ConfigDefault("\"parquet\"")
+    def getFileExt: String
 
- …
+    @Config("compression_codec")
+    @ConfigDefault("\"uncompressed\"")
+    def getCompressionCodecString: String
 
- …
-    @ConfigDefault("\"uncompressed\"")
-    def getCompressionCodecString: String
+    def setCompressionCodec(v: CompressionCodecName): Unit
 
- …
+    def getCompressionCodec: CompressionCodecName
 
- …
+    @Config("column_options")
+    @ConfigDefault("{}")
+    def getColumnOptions: JMap[String, ColumnOptionTask]
 
- …
+    @Config("canned_acl")
+    @ConfigDefault("\"private\"")
+    def getCannedAclString: String
 
- …
-    @ConfigDefault("\"private\"")
-    def getCannedAclString: String
+    def setCannedAcl(v: CannedAccessControlList): Unit
 
- …
+    def getCannedAcl: CannedAccessControlList
 
- …
+    @Config("block_size")
+    @ConfigDefault("null")
+    def getBlockSize: Optional[Int]
 
- …
+    @Config("page_size")
+    @ConfigDefault("null")
+    def getPageSize: Optional[Int]
 
- …
+    @Config("max_padding_size")
+    @ConfigDefault("null")
+    def getMaxPaddingSize: Optional[Int]
 
- …
+    @Config("enable_dictionary_encoding")
+    @ConfigDefault("null")
+    def getEnableDictionaryEncoding: Optional[Boolean]
 
- …
+    @Config("buffer_dir")
+    @ConfigDefault("null")
+    def getBufferDir: Optional[String]
 
- …
+    @Config("catalog")
+    @ConfigDefault("null")
+    def getCatalog: Optional[CatalogRegistrator.Task]
 
- …
+    @Config("type_options")
+    @ConfigDefault("{}")
+    def getTypeOptions: JMap[String, TypeOptionTask]
+  }
 
- …
+  trait ColumnOptionTask
+      extends Task
+      with TimestampColumnOption
+      with LogicalTypeOption
 
- …
-    extends Task with TimestampColumnOption with LogicalTypeOption
+  trait TypeOptionTask extends Task with LogicalTypeOption
 
- …
-    extends Task with LogicalTypeOption
+  trait LogicalTypeOption {
 
- …
-    def getLogicalType: Optional[String]
-  }
+    @Config("logical_type")
+    def getLogicalType: Optional[String]
+  }
 }
 
-class S3ParquetOutputPlugin
- …
-      val cOptions = task.getColumnOptions.asScala
-      val tOptions = task.getTypeOptions.asScala
-      schema.getColumns.asScala.foreach {c =>
-        cOptions.get(c.getName)
-        if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
-          builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
-        }
-        else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
-          builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
-        }
-      }
-      builder.result()
+class S3ParquetOutputPlugin extends OutputPlugin {
+
+  val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+  override def transaction(
+      config: ConfigSource,
+      schema: Schema,
+      taskCount: Int,
+      control: OutputPlugin.Control
+  ): ConfigDiff = {
+    val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+    configure(task, schema)
+    control.run(task.dump)
+
+    task.getCatalog.ifPresent { catalog =>
+      val location =
+        s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+      val parquetColumnLogicalTypes: Map[String, String] =
+        Map.newBuilder[String, String].pipe { builder =>
+          val cOptions = task.getColumnOptions.asScala
+          val tOptions = task.getTypeOptions.asScala
+          schema.getColumns.asScala.foreach { c =>
+            cOptions.get(c.getName)
+            if (cOptions
+                  .contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+              builder
+                .addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
             }
- …
-      cr.run()
-    }
- …
-    Exec.newConfigDiff
-  }
- …
-  private def configure(task: PluginTask,
-                        schema: Schema): Unit =
-  {
-    // sequence_format
-    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
-    catch {
-      case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
-    }
- …
-    // compression_codec
-    CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
-      case Some(v) => task.setCompressionCodec(v)
-      case None =>
-        val unsupported: String = task.getCompressionCodecString
-        val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
-        throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
-    }
- …
-    // column_options
-    task.getColumnOptions.forEach { (k: String,
-                                     opt: ColumnOptionTask) =>
-      val c = schema.lookupColumn(k)
-      val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
-      if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
-        throw new ConfigException(s"column:$k is not 'timestamp' type.")
+            else if (tOptions.contains(c.getType.getName) && tOptions(
+                       c.getType.getName
+                     ).getLogicalType.isPresent) {
+              builder.addOne(
+                c.getName -> tOptions(c.getType.getName).getLogicalType.get()
+              )
             }
+          }
+          builder.result()
         }
+      val cr = CatalogRegistrator(
+        aws = Aws(task),
+        task = catalog,
+        schema = schema,
+        location = location,
+        compressionCodec = task.getCompressionCodec,
+        parquetColumnLogicalTypes = parquetColumnLogicalTypes
+      )
+      cr.run()
+    }
 
- …
+    Exec.newConfigDiff
+  }
+
+  private def configure(task: PluginTask, schema: Schema): Unit = {
+    // sequence_format
+    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+    catch {
+      case e: IllegalFormatException =>
+        throw new ConfigException(
+          s"Invalid sequence_format: ${task.getSequenceFormat}",
+          e
+        )
     }
 
- …
+    // compression_codec
+    CompressionCodecName
+      .values()
+      .find(v =>
+        v.name()
+          .toLowerCase(Locale.ENGLISH)
+          .equals(task.getCompressionCodecString)
+      ) match {
+      case Some(v) => task.setCompressionCodec(v)
+      case None =>
+        val unsupported: String = task.getCompressionCodecString
+        val supported: String = CompressionCodecName
+          .values()
+          .map(v => s"'${v.name().toLowerCase}'")
+          .mkString(", ")
+        throw new ConfigException(
+          s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]."
+        )
     }
 
- …
-        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
-          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
-          + s"etag: ${tr.get(classOf[String], "etag", null)}")
-    }
+    // column_options
+    task.getColumnOptions.forEach { (k: String, opt: ColumnOptionTask) =>
+      val c = schema.lookupColumn(k)
+      val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+      if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+        throw new ConfigException(s"column:$k is not 'timestamp' type.")
+      }
     }
 
- …
-    val …
-    val …
- …
+    // canned_acl
+    CannedAccessControlList
+      .values()
+      .find(v => v.toString.equals(task.getCannedAclString)) match {
+      case Some(v) => task.setCannedAcl(v)
+      case None =>
+        val unsupported: String = task.getCannedAclString
+        val supported: String = CannedAccessControlList
+          .values()
+          .map(v => s"'${v.toString}'")
+          .mkString(", ")
+        throw new ConfigException(
+          s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]."
+        )
+    }
+  }
+
+  override def resume(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskCount: Int,
+      control: OutputPlugin.Control
+  ): ConfigDiff = {
+    throw new UnsupportedOperationException(
+      "s3_parquet output plugin does not support resuming"
+    )
+  }
+
+  override def cleanup(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskCount: Int,
+      successTaskReports: JList[TaskReport]
+  ): Unit = {
+    successTaskReports.forEach { tr =>
+      logger.info(
+        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+          + s"etag: ${tr.get(classOf[String], "etag", null)}"
+      )
     }
+  }
+
+  override def open(
+      taskSource: TaskSource,
+      schema: Schema,
+      taskIndex: Int
+  ): TransactionalPageOutput = {
+    val task = taskSource.loadTask(classOf[PluginTask])
+    val bufferDir: String = task.getBufferDir.orElse(
+      Files.createTempDirectory("embulk-output-s3_parquet-").toString
+    )
+    val bufferFile: String = Paths
+      .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet")
+      .toString
+    val destS3bucket: String = task.getBucket
+    val destS3Key: String = task.getPathPrefix + String.format(
+      task.getSequenceFormat,
+      taskIndex: Integer,
+      0: Integer
+    ) + task.getFileExt
+
+    val pageReader: PageReader = new PageReader(schema)
+    val aws: Aws = Aws(task)
+    val timestampFormatters: Seq[TimestampFormatter] = Timestamps
+      .newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+      .toSeq
+    val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(
+      task.getTypeOptions,
+      task.getColumnOptions
+    )
+    val parquetWriter: ParquetWriter[PageReader] =
+      ContextClassLoaderSwapper.usingPluginClass {
+        ParquetFileWriter
+          .builder()
+          .withPath(bufferFile)
+          .withSchema(schema)
+          .withLogicalTypeHandlers(logicalTypeHandlers)
+          .withTimestampFormatters(timestampFormatters)
+          .withCompressionCodec(task.getCompressionCodec)
+          .withDictionaryEncoding(
+            task.getEnableDictionaryEncoding.orElse(
+              ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED
+            )
+          )
+          .withDictionaryPageSize(
+            task.getPageSize.orElse(
+              ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE
+            )
+          )
+          .withMaxPaddingSize(
+            task.getMaxPaddingSize.orElse(
+              ParquetWriter.MAX_PADDING_SIZE_DEFAULT
+            )
+          )
+          .withPageSize(
+            task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE)
+          )
+          .withRowGroupSize(
+            task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE)
+          )
+          .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+          .withWriteMode(
+            org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE
+          )
+          .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+          .build()
+      }
+
+    logger.info(
+      s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key"
+    )
+
+    S3ParquetPageOutput(
+      bufferFile,
+      pageReader,
+      parquetWriter,
+      aws,
+      destS3bucket,
+      destS3Key
+    )
+  }
 
 }
```
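For reference, the destination object key assembled in `open` above is `path_prefix` plus `sequence_format` applied to the task index and a sequence number, followed by `file_ext`. A small sketch using the plugin's defaults (`sequence_format` `"%03d.%02d."`, `file_ext` `"parquet"`); the prefix, bucket, and task index are made-up illustrative values:

```scala
object DestKeyExample {

  def main(args: Array[String]): Unit = {
    // Only the two format strings below are the plugin's defaults; the rest is illustrative.
    val pathPrefix = "logs/out."
    val sequenceFormat = "%03d.%02d." // default sequence_format
    val fileExt = "parquet"           // default file_ext
    val taskIndex = 7
    val sequenceIndex = 0

    val key = pathPrefix + String.format(
      sequenceFormat,
      taskIndex: Integer,
      sequenceIndex: Integer
    ) + fileExt

    // Prints "logs/out.007.00.parquet"; the upload target is s3://<bucket>/<key>.
    println(key)
  }
}
```

Because the default `file_ext` has no leading dot, the trailing dot of the default `sequence_format` acts as the separator before the extension.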