embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala
@@ -0,0 +1,18 @@
+ package org.embulk.output.s3_parquet
+
+ // WARNING: This object should be used for limited purposes only.
+ object ContextClassLoaderSwapper {
+
+ def using[A](klass: Class[_])(f: => A): A = {
+ val currentTread = Thread.currentThread()
+ val original = currentTread.getContextClassLoader
+ val target = klass.getClassLoader
+ currentTread.setContextClassLoader(target)
+ try f
+ finally currentTread.setContextClassLoader(original)
+ }
+
+ def usingPluginClass[A](f: => A): A = {
+ using(classOf[S3ParquetOutputPlugin])(f)
+ }
+ }
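Note on the new helper above: ContextClassLoaderSwapper temporarily installs a given class's class loader as the thread context class loader and restores the original one afterwards, so libraries that resolve classes through the context class loader (Parquet/Hadoop in this plugin) can see the plugin's bundled classes. A minimal usage sketch follows; the block body here is a hypothetical placeholder, and the real call site is the ParquetFileWriter construction inside S3ParquetOutputPlugin.open shown further below.

import org.embulk.output.s3_parquet.ContextClassLoaderSwapper

// Runs the block with the plugin's class loader as the thread context class loader;
// the original loader is restored in a finally block even if the body throws.
val result: String = ContextClassLoaderSwapper.usingPluginClass {
  // Hypothetical body: anything that loads classes via
  // Thread.currentThread().getContextClassLoader goes here.
  "built with the plugin's class loader installed"
}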
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
@@ -1,18 +1,44 @@
  package org.embulk.output.s3_parquet

-
  import java.nio.file.{Files, Paths}
- import java.util.{IllegalFormatException, Locale, Optional, List => JList, Map => JMap}
+ import java.util.{
+ IllegalFormatException,
+ Locale,
+ Optional,
+ List => JList,
+ Map => JMap
+ }

  import com.amazonaws.services.s3.model.CannedAccessControlList
  import org.apache.parquet.column.ParquetProperties
  import org.apache.parquet.hadoop.ParquetWriter
  import org.apache.parquet.hadoop.metadata.CompressionCodecName
- import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
+ import org.embulk.config.{
+ Config,
+ ConfigDefault,
+ ConfigDiff,
+ ConfigException,
+ ConfigSource,
+ Task,
+ TaskReport,
+ TaskSource
+ }
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+ ColumnOptionTask,
+ PluginTask
+ }
  import org.embulk.output.s3_parquet.aws.Aws
- import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
- import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
+ import org.embulk.output.s3_parquet.parquet.{
+ LogicalTypeHandlerStore,
+ ParquetFileWriter
+ }
+ import org.embulk.spi.{
+ Exec,
+ OutputPlugin,
+ PageReader,
+ Schema,
+ TransactionalPageOutput
+ }
  import org.embulk.spi.time.TimestampFormatter
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
  import org.embulk.spi.util.Timestamps
@@ -21,239 +47,302 @@ import org.slf4j.{Logger, LoggerFactory}
  import scala.jdk.CollectionConverters._
  import scala.util.chaining._

+ object S3ParquetOutputPlugin {

- object S3ParquetOutputPlugin
- {
+ trait PluginTask extends Task with TimestampFormatter.Task with Aws.Task {

- trait PluginTask
- extends Task
- with TimestampFormatter.Task
- with Aws.Task
- {
+ @Config("bucket")
+ def getBucket: String

- @Config("bucket")
- def getBucket: String
+ @Config("path_prefix")
+ @ConfigDefault("\"\"")
+ def getPathPrefix: String

- @Config("path_prefix")
- @ConfigDefault("\"\"")
- def getPathPrefix: String
+ @Config("sequence_format")
+ @ConfigDefault("\"%03d.%02d.\"")
+ def getSequenceFormat: String

- @Config("sequence_format")
- @ConfigDefault("\"%03d.%02d.\"")
- def getSequenceFormat: String
+ @Config("file_ext")
+ @ConfigDefault("\"parquet\"")
+ def getFileExt: String

- @Config("file_ext")
- @ConfigDefault("\"parquet\"")
- def getFileExt: String
+ @Config("compression_codec")
+ @ConfigDefault("\"uncompressed\"")
+ def getCompressionCodecString: String

- @Config("compression_codec")
- @ConfigDefault("\"uncompressed\"")
- def getCompressionCodecString: String
+ def setCompressionCodec(v: CompressionCodecName): Unit

- def setCompressionCodec(v: CompressionCodecName): Unit
+ def getCompressionCodec: CompressionCodecName

- def getCompressionCodec: CompressionCodecName
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, ColumnOptionTask]

- @Config("column_options")
- @ConfigDefault("{}")
- def getColumnOptions: JMap[String, ColumnOptionTask]
+ @Config("canned_acl")
+ @ConfigDefault("\"private\"")
+ def getCannedAclString: String

- @Config("canned_acl")
- @ConfigDefault("\"private\"")
- def getCannedAclString: String
+ def setCannedAcl(v: CannedAccessControlList): Unit

- def setCannedAcl(v: CannedAccessControlList): Unit
+ def getCannedAcl: CannedAccessControlList

- def getCannedAcl: CannedAccessControlList
+ @Config("block_size")
+ @ConfigDefault("null")
+ def getBlockSize: Optional[Int]

- @Config("block_size")
- @ConfigDefault("null")
- def getBlockSize: Optional[Int]
+ @Config("page_size")
+ @ConfigDefault("null")
+ def getPageSize: Optional[Int]

- @Config("page_size")
- @ConfigDefault("null")
- def getPageSize: Optional[Int]
+ @Config("max_padding_size")
+ @ConfigDefault("null")
+ def getMaxPaddingSize: Optional[Int]

- @Config("max_padding_size")
- @ConfigDefault("null")
- def getMaxPaddingSize: Optional[Int]
+ @Config("enable_dictionary_encoding")
+ @ConfigDefault("null")
+ def getEnableDictionaryEncoding: Optional[Boolean]

- @Config("enable_dictionary_encoding")
- @ConfigDefault("null")
- def getEnableDictionaryEncoding: Optional[Boolean]
+ @Config("buffer_dir")
+ @ConfigDefault("null")
+ def getBufferDir: Optional[String]

- @Config("buffer_dir")
- @ConfigDefault("null")
- def getBufferDir: Optional[String]
+ @Config("catalog")
+ @ConfigDefault("null")
+ def getCatalog: Optional[CatalogRegistrator.Task]

- @Config("catalog")
- @ConfigDefault("null")
- def getCatalog: Optional[CatalogRegistrator.Task]
+ @Config("type_options")
+ @ConfigDefault("{}")
+ def getTypeOptions: JMap[String, TypeOptionTask]
+ }

- @Config("type_options")
- @ConfigDefault("{}")
- def getTypeOptions: JMap[String, TypeOptionTask]
- }
+ trait ColumnOptionTask
+ extends Task
+ with TimestampColumnOption
+ with LogicalTypeOption

- trait ColumnOptionTask
- extends Task with TimestampColumnOption with LogicalTypeOption
+ trait TypeOptionTask extends Task with LogicalTypeOption

- trait TypeOptionTask
- extends Task with LogicalTypeOption
+ trait LogicalTypeOption {

- trait LogicalTypeOption
- {
- @Config("logical_type")
- def getLogicalType: Optional[String]
- }
+ @Config("logical_type")
+ def getLogicalType: Optional[String]
+ }
  }

- class S3ParquetOutputPlugin
- extends OutputPlugin
- {
-
- val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
-
- private def withPluginContextClassLoader[A](f: => A): A =
- {
- val original: ClassLoader = Thread.currentThread.getContextClassLoader
- Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
- try f
- finally Thread.currentThread.setContextClassLoader(original)
- }
-
- override def transaction(config: ConfigSource,
- schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff =
- {
- val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
- withPluginContextClassLoader {
- configure(task, schema)
- control.run(task.dump)
- }
- task.getCatalog.ifPresent { catalog =>
- val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
- val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe {builder =>
- val cOptions = task.getColumnOptions.asScala
- val tOptions = task.getTypeOptions.asScala
- schema.getColumns.asScala.foreach {c =>
- cOptions.get(c.getName)
- if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
- builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
- }
- else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
- builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
- }
- }
- builder.result()
+ class S3ParquetOutputPlugin extends OutputPlugin {
+
+ val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+ override def transaction(
+ config: ConfigSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control
+ ): ConfigDiff = {
+ val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+ configure(task, schema)
+ control.run(task.dump)
+
+ task.getCatalog.ifPresent { catalog =>
+ val location =
+ s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+ val parquetColumnLogicalTypes: Map[String, String] =
+ Map.newBuilder[String, String].pipe { builder =>
+ val cOptions = task.getColumnOptions.asScala
+ val tOptions = task.getTypeOptions.asScala
+ schema.getColumns.asScala.foreach { c =>
+ cOptions.get(c.getName)
+ if (cOptions
+ .contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+ builder
+ .addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
  }
- val cr = CatalogRegistrator(aws = Aws(task),
- task = catalog,
- schema = schema,
- location = location,
- compressionCodec = task.getCompressionCodec,
- parquetColumnLogicalTypes = parquetColumnLogicalTypes)
- cr.run()
- }
-
- Exec.newConfigDiff
- }
-
- private def configure(task: PluginTask,
- schema: Schema): Unit =
- {
- // sequence_format
- try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
- catch {
- case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
- }
-
- // compression_codec
- CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
- case Some(v) => task.setCompressionCodec(v)
- case None =>
- val unsupported: String = task.getCompressionCodecString
- val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
- }
-
- // column_options
- task.getColumnOptions.forEach { (k: String,
- opt: ColumnOptionTask) =>
- val c = schema.lookupColumn(k)
- val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
- if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
- throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ else if (tOptions.contains(c.getType.getName) && tOptions(
+ c.getType.getName
+ ).getLogicalType.isPresent) {
+ builder.addOne(
+ c.getName -> tOptions(c.getType.getName).getLogicalType.get()
+ )
  }
+ }
+ builder.result()
  }
+ val cr = CatalogRegistrator(
+ aws = Aws(task),
+ task = catalog,
+ schema = schema,
+ location = location,
+ compressionCodec = task.getCompressionCodec,
+ parquetColumnLogicalTypes = parquetColumnLogicalTypes
+ )
+ cr.run()
+ }

- // canned_acl
- CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
- case Some(v) => task.setCannedAcl(v)
- case None =>
- val unsupported: String = task.getCannedAclString
- val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
- }
+ Exec.newConfigDiff
+ }
+
+ private def configure(task: PluginTask, schema: Schema): Unit = {
+ // sequence_format
+ try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+ catch {
+ case e: IllegalFormatException =>
+ throw new ConfigException(
+ s"Invalid sequence_format: ${task.getSequenceFormat}",
+ e
+ )
  }

- override def resume(taskSource: TaskSource,
- schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff =
- {
- throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
+ // compression_codec
+ CompressionCodecName
+ .values()
+ .find(v =>
+ v.name()
+ .toLowerCase(Locale.ENGLISH)
+ .equals(task.getCompressionCodecString)
+ ) match {
+ case Some(v) => task.setCompressionCodec(v)
+ case None =>
+ val unsupported: String = task.getCompressionCodecString
+ val supported: String = CompressionCodecName
+ .values()
+ .map(v => s"'${v.name().toLowerCase}'")
+ .mkString(", ")
+ throw new ConfigException(
+ s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]."
+ )
  }

- override def cleanup(taskSource: TaskSource,
- schema: Schema,
- taskCount: Int,
- successTaskReports: JList[TaskReport]): Unit =
- {
- successTaskReports.forEach { tr =>
- logger.info(
- s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
- + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
- + s"etag: ${tr.get(classOf[String], "etag", null)}")
- }
+ // column_options
+ task.getColumnOptions.forEach { (k: String, opt: ColumnOptionTask) =>
+ val c = schema.lookupColumn(k)
+ val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+ if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+ throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ }
  }

- override def open(taskSource: TaskSource,
- schema: Schema,
- taskIndex: Int): TransactionalPageOutput =
- {
- val task = taskSource.loadTask(classOf[PluginTask])
- val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
- val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
- val destS3bucket: String = task.getBucket
- val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
- val pageReader: PageReader = new PageReader(schema)
- val aws: Aws = Aws(task)
- val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
- val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
- val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
- .withPath(bufferFile)
- .withSchema(schema)
- .withLogicalTypeHandlers(logicalTypeHandlers)
- .withTimestampFormatters(timestampFormatters)
- .withCompressionCodec(task.getCompressionCodec)
- .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
- .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
- .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
- .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
- .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
- .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
- .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
- .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
- .build()
-
- logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
- S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
+ // canned_acl
+ CannedAccessControlList
+ .values()
+ .find(v => v.toString.equals(task.getCannedAclString)) match {
+ case Some(v) => task.setCannedAcl(v)
+ case None =>
+ val unsupported: String = task.getCannedAclString
+ val supported: String = CannedAccessControlList
+ .values()
+ .map(v => s"'${v.toString}'")
+ .mkString(", ")
+ throw new ConfigException(
+ s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]."
+ )
+ }
+ }
+
+ override def resume(
+ taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control
+ ): ConfigDiff = {
+ throw new UnsupportedOperationException(
+ "s3_parquet output plugin does not support resuming"
+ )
+ }
+
+ override def cleanup(
+ taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ successTaskReports: JList[TaskReport]
+ ): Unit = {
+ successTaskReports.forEach { tr =>
+ logger.info(
+ s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+ + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+ + s"etag: ${tr.get(classOf[String], "etag", null)}"
+ )
  }
+ }
+
+ override def open(
+ taskSource: TaskSource,
+ schema: Schema,
+ taskIndex: Int
+ ): TransactionalPageOutput = {
+ val task = taskSource.loadTask(classOf[PluginTask])
+ val bufferDir: String = task.getBufferDir.orElse(
+ Files.createTempDirectory("embulk-output-s3_parquet-").toString
+ )
+ val bufferFile: String = Paths
+ .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet")
+ .toString
+ val destS3bucket: String = task.getBucket
+ val destS3Key: String = task.getPathPrefix + String.format(
+ task.getSequenceFormat,
+ taskIndex: Integer,
+ 0: Integer
+ ) + task.getFileExt
+
+ val pageReader: PageReader = new PageReader(schema)
+ val aws: Aws = Aws(task)
+ val timestampFormatters: Seq[TimestampFormatter] = Timestamps
+ .newTimestampColumnFormatters(task, schema, task.getColumnOptions)
+ .toSeq
+ val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(
+ task.getTypeOptions,
+ task.getColumnOptions
+ )
+ val parquetWriter: ParquetWriter[PageReader] =
+ ContextClassLoaderSwapper.usingPluginClass {
+ ParquetFileWriter
+ .builder()
+ .withPath(bufferFile)
+ .withSchema(schema)
+ .withLogicalTypeHandlers(logicalTypeHandlers)
+ .withTimestampFormatters(timestampFormatters)
+ .withCompressionCodec(task.getCompressionCodec)
+ .withDictionaryEncoding(
+ task.getEnableDictionaryEncoding.orElse(
+ ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED
+ )
+ )
+ .withDictionaryPageSize(
+ task.getPageSize.orElse(
+ ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE
+ )
+ )
+ .withMaxPaddingSize(
+ task.getMaxPaddingSize.orElse(
+ ParquetWriter.MAX_PADDING_SIZE_DEFAULT
+ )
+ )
+ .withPageSize(
+ task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE)
+ )
+ .withRowGroupSize(
+ task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE)
+ )
+ .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+ .withWriteMode(
+ org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE
+ )
+ .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+ .build()
+ }
+
+ logger.info(
+ s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key"
+ )
+
+ S3ParquetPageOutput(
+ bufferFile,
+ pageReader,
+ parquetWriter,
+ aws,
+ destS3bucket,
+ destS3Key
+ )
+ }

  }