embulk-output-s3_parquet 0.1.0 → 0.2.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala
@@ -0,0 +1,18 @@
+ package org.embulk.output.s3_parquet
+
+ // WARNING: This object should be used for limited purposes only.
+ object ContextClassLoaderSwapper {
+
+   def using[A](klass: Class[_])(f: => A): A = {
+     val currentTread = Thread.currentThread()
+     val original = currentTread.getContextClassLoader
+     val target = klass.getClassLoader
+     currentTread.setContextClassLoader(target)
+     try f
+     finally currentTread.setContextClassLoader(original)
+   }
+
+   def usingPluginClass[A](f: => A): A = {
+     using(classOf[S3ParquetOutputPlugin])(f)
+   }
+ }
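
The new helper swaps the thread's context class loader for the plugin's class loader around a block and restores the original one in a finally, even if the block throws; this matters because Parquet/Hadoop code resolves classes through the context class loader, and `open` below wraps the writer construction in `usingPluginClass` for exactly that reason. A minimal usage sketch (the example object is hypothetical, not part of the gem, and assumes it sits in the plugin's package):

package org.embulk.output.s3_parquet

// Hypothetical example showing the intended call pattern of the swapper.
object ContextClassLoaderSwapperExample {

  def main(args: Array[String]): Unit = {
    // Inside the block, lookups that go through the thread context class
    // loader resolve against the plugin's class loader.
    val loader = ContextClassLoaderSwapper.usingPluginClass {
      Thread.currentThread().getContextClassLoader
    }
    // By this point the previous context class loader has been restored,
    // even if the block had thrown.
    println(loader)
  }
}
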
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
@@ -1,18 +1,44 @@
  package org.embulk.output.s3_parquet

-
  import java.nio.file.{Files, Paths}
- import java.util.{IllegalFormatException, Locale, Optional, List => JList, Map => JMap}
+ import java.util.{
+   IllegalFormatException,
+   Locale,
+   Optional,
+   List => JList,
+   Map => JMap
+ }

  import com.amazonaws.services.s3.model.CannedAccessControlList
  import org.apache.parquet.column.ParquetProperties
  import org.apache.parquet.hadoop.ParquetWriter
  import org.apache.parquet.hadoop.metadata.CompressionCodecName
- import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
+ import org.embulk.config.{
+   Config,
+   ConfigDefault,
+   ConfigDiff,
+   ConfigException,
+   ConfigSource,
+   Task,
+   TaskReport,
+   TaskSource
+ }
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+   ColumnOptionTask,
+   PluginTask
+ }
  import org.embulk.output.s3_parquet.aws.Aws
- import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
- import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
+ import org.embulk.output.s3_parquet.parquet.{
+   LogicalTypeHandlerStore,
+   ParquetFileWriter
+ }
+ import org.embulk.spi.{
+   Exec,
+   OutputPlugin,
+   PageReader,
+   Schema,
+   TransactionalPageOutput
+ }
  import org.embulk.spi.time.TimestampFormatter
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
  import org.embulk.spi.util.Timestamps
@@ -21,239 +47,302 @@ import org.slf4j.{Logger, LoggerFactory}
  import scala.jdk.CollectionConverters._
  import scala.util.chaining._

+ object S3ParquetOutputPlugin {

- object S3ParquetOutputPlugin
- {
+   trait PluginTask extends Task with TimestampFormatter.Task with Aws.Task {

-     trait PluginTask
-         extends Task
-             with TimestampFormatter.Task
-             with Aws.Task
-     {
+     @Config("bucket")
+     def getBucket: String

-         @Config("bucket")
-         def getBucket: String
+     @Config("path_prefix")
+     @ConfigDefault("\"\"")
+     def getPathPrefix: String

-         @Config("path_prefix")
-         @ConfigDefault("\"\"")
-         def getPathPrefix: String
+     @Config("sequence_format")
+     @ConfigDefault("\"%03d.%02d.\"")
+     def getSequenceFormat: String

-         @Config("sequence_format")
-         @ConfigDefault("\"%03d.%02d.\"")
-         def getSequenceFormat: String
+     @Config("file_ext")
+     @ConfigDefault("\"parquet\"")
+     def getFileExt: String

-         @Config("file_ext")
-         @ConfigDefault("\"parquet\"")
-         def getFileExt: String
+     @Config("compression_codec")
+     @ConfigDefault("\"uncompressed\"")
+     def getCompressionCodecString: String

-         @Config("compression_codec")
-         @ConfigDefault("\"uncompressed\"")
-         def getCompressionCodecString: String
+     def setCompressionCodec(v: CompressionCodecName): Unit

-         def setCompressionCodec(v: CompressionCodecName): Unit
+     def getCompressionCodec: CompressionCodecName

-         def getCompressionCodec: CompressionCodecName
+     @Config("column_options")
+     @ConfigDefault("{}")
+     def getColumnOptions: JMap[String, ColumnOptionTask]

-         @Config("column_options")
-         @ConfigDefault("{}")
-         def getColumnOptions: JMap[String, ColumnOptionTask]
+     @Config("canned_acl")
+     @ConfigDefault("\"private\"")
+     def getCannedAclString: String

-         @Config("canned_acl")
-         @ConfigDefault("\"private\"")
-         def getCannedAclString: String
+     def setCannedAcl(v: CannedAccessControlList): Unit

-         def setCannedAcl(v: CannedAccessControlList): Unit
+     def getCannedAcl: CannedAccessControlList

-         def getCannedAcl: CannedAccessControlList
+     @Config("block_size")
+     @ConfigDefault("null")
+     def getBlockSize: Optional[Int]

-         @Config("block_size")
-         @ConfigDefault("null")
-         def getBlockSize: Optional[Int]
+     @Config("page_size")
+     @ConfigDefault("null")
+     def getPageSize: Optional[Int]

-         @Config("page_size")
-         @ConfigDefault("null")
-         def getPageSize: Optional[Int]
+     @Config("max_padding_size")
+     @ConfigDefault("null")
+     def getMaxPaddingSize: Optional[Int]

-         @Config("max_padding_size")
-         @ConfigDefault("null")
-         def getMaxPaddingSize: Optional[Int]
+     @Config("enable_dictionary_encoding")
+     @ConfigDefault("null")
+     def getEnableDictionaryEncoding: Optional[Boolean]

-         @Config("enable_dictionary_encoding")
-         @ConfigDefault("null")
-         def getEnableDictionaryEncoding: Optional[Boolean]
+     @Config("buffer_dir")
+     @ConfigDefault("null")
+     def getBufferDir: Optional[String]

-         @Config("buffer_dir")
-         @ConfigDefault("null")
-         def getBufferDir: Optional[String]
+     @Config("catalog")
+     @ConfigDefault("null")
+     def getCatalog: Optional[CatalogRegistrator.Task]

-         @Config("catalog")
-         @ConfigDefault("null")
-         def getCatalog: Optional[CatalogRegistrator.Task]
+     @Config("type_options")
+     @ConfigDefault("{}")
+     def getTypeOptions: JMap[String, TypeOptionTask]
+   }

-         @Config("type_options")
-         @ConfigDefault("{}")
-         def getTypeOptions: JMap[String, TypeOptionTask]
-     }
+   trait ColumnOptionTask
+       extends Task
+       with TimestampColumnOption
+       with LogicalTypeOption

-     trait ColumnOptionTask
-         extends Task with TimestampColumnOption with LogicalTypeOption
+   trait TypeOptionTask extends Task with LogicalTypeOption

-     trait TypeOptionTask
-         extends Task with LogicalTypeOption
+   trait LogicalTypeOption {

-     trait LogicalTypeOption
-     {
-         @Config("logical_type")
-         def getLogicalType: Optional[String]
-     }
+     @Config("logical_type")
+     def getLogicalType: Optional[String]
+   }
  }

- class S3ParquetOutputPlugin
-     extends OutputPlugin
- {
-
-     val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
-
-     private def withPluginContextClassLoader[A](f: => A): A =
-     {
-         val original: ClassLoader = Thread.currentThread.getContextClassLoader
-         Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
-         try f
-         finally Thread.currentThread.setContextClassLoader(original)
-     }
-
-     override def transaction(config: ConfigSource,
-                              schema: Schema,
-                              taskCount: Int,
-                              control: OutputPlugin.Control): ConfigDiff =
-     {
-         val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
-         withPluginContextClassLoader {
-             configure(task, schema)
-             control.run(task.dump)
-         }
-         task.getCatalog.ifPresent { catalog =>
-             val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
-             val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe {builder =>
-                 val cOptions = task.getColumnOptions.asScala
-                 val tOptions = task.getTypeOptions.asScala
-                 schema.getColumns.asScala.foreach {c =>
-                     cOptions.get(c.getName)
-                     if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
-                         builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
-                     }
-                     else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
-                         builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
-                     }
-                 }
-                 builder.result()
+ class S3ParquetOutputPlugin extends OutputPlugin {
+
+   val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+   override def transaction(
+       config: ConfigSource,
+       schema: Schema,
+       taskCount: Int,
+       control: OutputPlugin.Control
+   ): ConfigDiff = {
+     val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+     configure(task, schema)
+     control.run(task.dump)
+
+     task.getCatalog.ifPresent { catalog =>
+       val location =
+         s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+       val parquetColumnLogicalTypes: Map[String, String] =
+         Map.newBuilder[String, String].pipe { builder =>
+           val cOptions = task.getColumnOptions.asScala
+           val tOptions = task.getTypeOptions.asScala
+           schema.getColumns.asScala.foreach { c =>
+             cOptions.get(c.getName)
+             if (cOptions
+                   .contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+               builder
+                 .addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
              }
-             val cr = CatalogRegistrator(aws = Aws(task),
-                 task = catalog,
-                 schema = schema,
-                 location = location,
-                 compressionCodec = task.getCompressionCodec,
-                 parquetColumnLogicalTypes = parquetColumnLogicalTypes)
-             cr.run()
-         }
-
-         Exec.newConfigDiff
-     }
-
-     private def configure(task: PluginTask,
-                           schema: Schema): Unit =
-     {
-         // sequence_format
-         try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
-         catch {
-             case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
-         }
-
-         // compression_codec
-         CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
-             case Some(v) => task.setCompressionCodec(v)
-             case None =>
-                 val unsupported: String = task.getCompressionCodecString
-                 val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
-                 throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
-         }
-
-         // column_options
-         task.getColumnOptions.forEach { (k: String,
-                                          opt: ColumnOptionTask) =>
-             val c = schema.lookupColumn(k)
-             val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
-             if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
-                 throw new ConfigException(s"column:$k is not 'timestamp' type.")
+             else if (tOptions.contains(c.getType.getName) && tOptions(
+                        c.getType.getName
+                      ).getLogicalType.isPresent) {
+               builder.addOne(
+                 c.getName -> tOptions(c.getType.getName).getLogicalType.get()
+               )
              }
+           }
+           builder.result()
          }
+       val cr = CatalogRegistrator(
+         aws = Aws(task),
+         task = catalog,
+         schema = schema,
+         location = location,
+         compressionCodec = task.getCompressionCodec,
+         parquetColumnLogicalTypes = parquetColumnLogicalTypes
+       )
+       cr.run()
+     }

-         // canned_acl
-         CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
-             case Some(v) => task.setCannedAcl(v)
-             case None =>
-                 val unsupported: String = task.getCannedAclString
-                 val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
-                 throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
-         }
+     Exec.newConfigDiff
+   }
+
+   private def configure(task: PluginTask, schema: Schema): Unit = {
+     // sequence_format
+     try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+     catch {
+       case e: IllegalFormatException =>
+         throw new ConfigException(
+           s"Invalid sequence_format: ${task.getSequenceFormat}",
+           e
+         )
     }

-     override def resume(taskSource: TaskSource,
-                         schema: Schema,
-                         taskCount: Int,
-                         control: OutputPlugin.Control): ConfigDiff =
-     {
-         throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
+     // compression_codec
+     CompressionCodecName
+       .values()
+       .find(v =>
+         v.name()
+           .toLowerCase(Locale.ENGLISH)
+           .equals(task.getCompressionCodecString)
+       ) match {
+       case Some(v) => task.setCompressionCodec(v)
+       case None =>
+         val unsupported: String = task.getCompressionCodecString
+         val supported: String = CompressionCodecName
+           .values()
+           .map(v => s"'${v.name().toLowerCase}'")
+           .mkString(", ")
+         throw new ConfigException(
+           s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]."
+         )
     }

-     override def cleanup(taskSource: TaskSource,
-                          schema: Schema,
-                          taskCount: Int,
-                          successTaskReports: JList[TaskReport]): Unit =
-     {
-         successTaskReports.forEach { tr =>
-             logger.info(
-                 s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
-                     + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
-                     + s"etag: ${tr.get(classOf[String], "etag", null)}")
-         }
+     // column_options
+     task.getColumnOptions.forEach { (k: String, opt: ColumnOptionTask) =>
+       val c = schema.lookupColumn(k)
+       val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+       if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+         throw new ConfigException(s"column:$k is not 'timestamp' type.")
+       }
     }

-     override def open(taskSource: TaskSource,
-                       schema: Schema,
-                       taskIndex: Int): TransactionalPageOutput =
-     {
-         val task = taskSource.loadTask(classOf[PluginTask])
-         val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
-         val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
-         val destS3bucket: String = task.getBucket
-         val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
-         val pageReader: PageReader = new PageReader(schema)
-         val aws: Aws = Aws(task)
-         val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
-         val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
-         val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
-             .withPath(bufferFile)
-             .withSchema(schema)
-             .withLogicalTypeHandlers(logicalTypeHandlers)
-             .withTimestampFormatters(timestampFormatters)
-             .withCompressionCodec(task.getCompressionCodec)
-             .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
-             .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
-             .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
-             .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
-             .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
-             .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
-             .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
-             .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
-             .build()
-
-         logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
-         S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
+     // canned_acl
+     CannedAccessControlList
+       .values()
+       .find(v => v.toString.equals(task.getCannedAclString)) match {
+       case Some(v) => task.setCannedAcl(v)
+       case None =>
+         val unsupported: String = task.getCannedAclString
+         val supported: String = CannedAccessControlList
+           .values()
+           .map(v => s"'${v.toString}'")
+           .mkString(", ")
+         throw new ConfigException(
+           s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]."
+         )
+     }
+   }
+
+   override def resume(
+       taskSource: TaskSource,
+       schema: Schema,
+       taskCount: Int,
+       control: OutputPlugin.Control
+   ): ConfigDiff = {
+     throw new UnsupportedOperationException(
+       "s3_parquet output plugin does not support resuming"
+     )
+   }
+
+   override def cleanup(
+       taskSource: TaskSource,
+       schema: Schema,
+       taskCount: Int,
+       successTaskReports: JList[TaskReport]
+   ): Unit = {
+     successTaskReports.forEach { tr =>
+       logger.info(
+         s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+           + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+           + s"etag: ${tr.get(classOf[String], "etag", null)}"
+       )
     }
+   }
+
+   override def open(
+       taskSource: TaskSource,
+       schema: Schema,
+       taskIndex: Int
+   ): TransactionalPageOutput = {
+     val task = taskSource.loadTask(classOf[PluginTask])
+     val bufferDir: String = task.getBufferDir.orElse(
+       Files.createTempDirectory("embulk-output-s3_parquet-").toString
+     )
+     val bufferFile: String = Paths
+       .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet")
+       .toString
+     val destS3bucket: String = task.getBucket
+     val destS3Key: String = task.getPathPrefix + String.format(
+       task.getSequenceFormat,
+       taskIndex: Integer,
+       0: Integer
+     ) + task.getFileExt
+
+     val pageReader: PageReader = new PageReader(schema)
+     val aws: Aws = Aws(task)
+     val timestampFormatters: Seq[TimestampFormatter] = Timestamps
+       .newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+       .toSeq
+     val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(
+       task.getTypeOptions,
+       task.getColumnOptions
+     )
+     val parquetWriter: ParquetWriter[PageReader] =
+       ContextClassLoaderSwapper.usingPluginClass {
+         ParquetFileWriter
+           .builder()
+           .withPath(bufferFile)
+           .withSchema(schema)
+           .withLogicalTypeHandlers(logicalTypeHandlers)
+           .withTimestampFormatters(timestampFormatters)
+           .withCompressionCodec(task.getCompressionCodec)
+           .withDictionaryEncoding(
+             task.getEnableDictionaryEncoding.orElse(
+               ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED
+             )
+           )
+           .withDictionaryPageSize(
+             task.getPageSize.orElse(
+               ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE
+             )
+           )
+           .withMaxPaddingSize(
+             task.getMaxPaddingSize.orElse(
+               ParquetWriter.MAX_PADDING_SIZE_DEFAULT
+             )
+           )
+           .withPageSize(
+             task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE)
+           )
+           .withRowGroupSize(
+             task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE)
+           )
+           .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+           .withWriteMode(
+             org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE
+           )
+           .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+           .build()
+       }
+
+     logger.info(
+       s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key"
+     )
+
+     S3ParquetPageOutput(
+       bufferFile,
+       pageReader,
+       parquetWriter,
+       aws,
+       destS3bucket,
+       destS3Key
+     )
+   }

  }
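
The destination key in `open` is still assembled as `path_prefix` + `sequence_format` (filled with the task index and a file-split counter) + `file_ext`. A small worked sketch with made-up values (the example object and the config values below are illustrative only, not from the gem):

// Hypothetical illustration of the destination-key layout.
object DestKeyExample {

  def main(args: Array[String]): Unit = {
    val pathPrefix = "logs/out."      // example `path_prefix`
    val sequenceFormat = "%03d.%02d." // the plugin's default `sequence_format`
    val fileExt = "parquet"           // the plugin's default `file_ext`
    val taskIndex = 7

    // Mirrors the expression in `open`: prefix + formatted sequence + extension.
    val destS3Key = pathPrefix + String.format(
      sequenceFormat,
      taskIndex: Integer,
      0: Integer
    ) + fileExt

    println(destS3Key) // logs/out.007.00.parquet
  }
}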