embulk-output-s3_parquet 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+  data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+  data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -59,6 +59,13 @@
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+- **catalog**: Register a table if this option is specified (optional)
+  - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+  - **database**: The name of the database (string, required)
+  - **table**: The name of the table (string, required)
+  - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
+    - **type**: type of a column when this plugin creates new tables (e.g. `STRING`, `BIGINT`) (string, default: depends on input column type. `BIGINT` if input column type is `long`, `BOOLEAN` if boolean, `DOUBLE` if `double`, `STRING` if `string`, `STRING` if `timestamp`, `STRING` if `json`)
+  - **operation_if_exists**: operation if the table already exist. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
 - **http_proxy**: Indicate whether using when accessing AWS via http proxy. (optional)
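The catalog options documented above are read by the new CatalogRegistrator task added in this release (see the Scala diff below). A minimal sketch of how they might be wired into an Embulk output config; the bucket, prefix, database, table, and column names are placeholders, not values taken from this diff:

out:
  type: s3_parquet
  bucket: my-bucket            # placeholder
  path_prefix: logs/out.       # placeholder
  compression_codec: gzip
  catalog:
    database: analytics_db     # placeholder
    table: access_logs         # placeholder
    column_options:
      status: {type: BIGINT}   # placeholder column name
    operation_if_exists: skip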
data/build.gradle
CHANGED
@@ -2,7 +2,6 @@ plugins {
     id "scala"
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
-    id "com.diffplug.gradle.spotless" version "3.13.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
 }
 import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
     provided
 }

-version = "0.0.2"
+version = "0.0.3"

 sourceCompatibility = 1.8
 targetCompatibility = 1.8

 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.17"
+    provided "org.embulk:embulk-core:0.9.17"

-    compile 'org.scala-lang:scala-library:2.
-    ['s3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+    compile 'org.scala-lang:scala-library:2.13.0'
+    ['glue', 's3', 'sts'].each { v ->
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
-        compile "org.apache.parquet:parquet-${v}:1.10.
+        compile "org.apache.parquet:parquet-${v}:1.10.1"
     }
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
-    compile 'org.xerial.snappy:snappy-java:1.1.7.
+    compile 'org.xerial.snappy:snappy-java:1.1.7.3'

-    testCompile 'org.scalatest:scalatest_2.
-    testCompile 'org.embulk:embulk-test:0.9.
-    testCompile 'org.embulk:embulk-standards:0.9.
+    testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+    testCompile 'org.embulk:embulk-test:0.9.17'
+    testCompile 'org.embulk:embulk-standards:0.9.17'
     testCompile 'cloud.localstack:localstack-utils:0.1.15'
-    testCompile 'org.apache.parquet:parquet-tools:1.
+    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }

data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
ADDED
@@ -0,0 +1,178 @@
+package org.embulk.output.s3_parquet
+
+
+import java.util.{Optional, Map => JMap}
+
+import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.embulk.config.{Config, ConfigDefault, ConfigException}
+import org.embulk.output.s3_parquet.aws.Aws
+import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+import org.embulk.spi.Schema
+import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.jdk.CollectionConverters._
+import scala.util.Try
+
+
+object CatalogRegistrator
+{
+    trait Task
+        extends org.embulk.config.Task
+    {
+        @Config("catalog_id")
+        @ConfigDefault("null")
+        def getCatalogId: Optional[String]
+
+        @Config("database")
+        def getDatabase: String
+
+        @Config("table")
+        def getTable: String
+
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, ColumnOptions]
+
+        @Config("operation_if_exists")
+        @ConfigDefault("\"delete\"")
+        def getOperationIfExists: String
+    }
+
+    trait ColumnOptions
+    {
+        @Config("type")
+        def getType: String
+    }
+
+    def apply(aws: Aws,
+              task: Task,
+              schema: Schema,
+              location: String,
+              compressionCodec: CompressionCodecName,
+              loggerOption: Option[Logger] = None): CatalogRegistrator =
+    {
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+    }
+}
+
+class CatalogRegistrator(aws: Aws,
+                         task: CatalogRegistrator.Task,
+                         schema: Schema,
+                         location: String,
+                         compressionCodec: CompressionCodecName,
+                         loggerOption: Option[Logger] = None)
+{
+    val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+    def run(): Unit =
+    {
+        if (doesTableExists()) {
+            task.getOperationIfExists match {
+                case "skip" =>
+                    logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+                    return
+
+                case "delete" =>
+                    logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+                    deleteTable()
+
+                case unknown =>
+                    throw new ConfigException(s"Unsupported operation: $unknown")
+            }
+        }
+        registerNewParquetTable()
+        showNewTableInfo()
+    }
+
+    def showNewTableInfo(): Unit =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        val t = aws.withGlue(_.getTable(req)).getTable
+        logger.info(s"Created a table: ${t.toString}")
+    }
+
+    def doesTableExists(): Boolean =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        Try(aws.withGlue(_.getTable(req))).isSuccess
+    }
+
+    def deleteTable(): Unit =
+    {
+        val req = new DeleteTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+        aws.withGlue(_.deleteTable(req))
+    }
+
+    def registerNewParquetTable(): Unit =
+    {
+        logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+        val req = new CreateTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setTableInput(new TableInput()
+                              .withName(task.getTable)
+                              .withDescription("Created by embulk-output-s3_parquet")
+                              .withTableType("EXTERNAL_TABLE")
+                              .withParameters(Map("EXTERNAL" -> "TRUE",
+                                                  "classification" -> "parquet",
+                                                  "parquet.compression" -> compressionCodec.name()).asJava)
+                              .withStorageDescriptor(new StorageDescriptor()
+                                                         .withColumns(getGlueSchema: _*)
+                                                         .withLocation(location)
+                                                         .withCompressed(isCompressed)
+                                                         .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+                                                         .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+                                                         .withSerdeInfo(new SerDeInfo()
+                                                                            .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+                                                                            .withParameters(Map("serialization.format" -> "1").asJava)
+                                                         )
+                              )
+        )
+        aws.withGlue(_.createTable(req))
+    }
+
+    private def getGlueSchema: Seq[Column] =
+    {
+        val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+        schema.getColumns.asScala.toSeq.map { c =>
+            val cType: String =
+                if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+                else convertEmbulkType2GlueType(c.getType)
+            new Column()
+                .withName(c.getName)
+                .withType(cType)
+        }
+    }
+
+    private def convertEmbulkType2GlueType(t: Type): String =
+    {
+        t match {
+            case _: BooleanType => "boolean"
+            case _: LongType => "bigint"
+            case _: DoubleType => "double"
+            case _: StringType => "string"
+            case _: TimestampType => "string"
+            case _: JsonType => "string"
+            case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+        }
+    }
+
+    private def isCompressed: Boolean =
+    {
+        !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+    }
+
+}
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPage
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
-import org.slf4j.Logger
+import org.slf4j.{Logger, LoggerFactory}

-object S3ParquetOutputPlugin {

-
-
-      with TimestampFormatter.Task
-      with Aws.Task {
+object S3ParquetOutputPlugin
+{

-
-
+    trait PluginTask
+        extends Task
+            with TimestampFormatter.Task
+            with Aws.Task
+    {

-
-
-    def getPathPrefix: String
+        @Config("bucket")
+        def getBucket: String

-
-
-
+        @Config("path_prefix")
+        @ConfigDefault("\"\"")
+        def getPathPrefix: String

-
-
-
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        def getSequenceFormat: String

-
-
-
+        @Config("file_ext")
+        @ConfigDefault("\"parquet\"")
+        def getFileExt: String

-
+        @Config("compression_codec")
+        @ConfigDefault("\"uncompressed\"")
+        def getCompressionCodecString: String

-
+        def setCompressionCodec(v: CompressionCodecName): Unit

-
-    @ConfigDefault("{}")
-    def getColumnOptions: JMap[String, TimestampColumnOption]
+        def getCompressionCodec: CompressionCodecName

-
-
-
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, TimestampColumnOption]

-
+        @Config("canned_acl")
+        @ConfigDefault("\"private\"")
+        def getCannedAclString: String

-
+        def setCannedAcl(v: CannedAccessControlList): Unit

-
-    @ConfigDefault("null")
-    def getBlockSize: Optional[Int]
+        def getCannedAcl: CannedAccessControlList

-
-
-
+        @Config("block_size")
+        @ConfigDefault("null")
+        def getBlockSize: Optional[Int]

-
-
-
+        @Config("page_size")
+        @ConfigDefault("null")
+        def getPageSize: Optional[Int]

-
-
-
+        @Config("max_padding_size")
+        @ConfigDefault("null")
+        def getMaxPaddingSize: Optional[Int]

-
-
-
+        @Config("enable_dictionary_encoding")
+        @ConfigDefault("null")
+        def getEnableDictionaryEncoding: Optional[Boolean]

-
+        @Config("buffer_dir")
+        @ConfigDefault("null")
+        def getBufferDir: Optional[String]
+
+        @Config("catalog")
+        @ConfigDefault("null")
+        def getCatalog: Optional[CatalogRegistrator.Task]
+    }

 }

 class S3ParquetOutputPlugin
-
-
-  val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
-  private def withPluginContextClassLoader[A](f: => A): A = {
-    val original: ClassLoader = Thread.currentThread.getContextClassLoader
-    Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
-    try f
-    finally Thread.currentThread.setContextClassLoader(original)
-  }
-
-  override def transaction(config: ConfigSource,
-                           schema: Schema,
-                           taskCount: Int,
-                           control: OutputPlugin.Control): ConfigDiff = {
-    val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
-    withPluginContextClassLoader {
-      configure(task, schema)
-      control.run(task.dump)
-    }
+    extends OutputPlugin
+{

-
-
+    val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+    private def withPluginContextClassLoader[A](f: => A): A =
+    {
+        val original: ClassLoader = Thread.currentThread.getContextClassLoader
+        Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+        try f
+        finally Thread.currentThread.setContextClassLoader(original)
+    }

-
-
-
-
-
-
+    override def transaction(config: ConfigSource,
+                             schema: Schema,
+                             taskCount: Int,
+                             control: OutputPlugin.Control): ConfigDiff =
+    {
+        val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+        withPluginContextClassLoader {
+            configure(task, schema)
+            control.run(task.dump)
+        }
+        task.getCatalog.ifPresent { catalog =>
+            val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val cr = CatalogRegistrator(aws = Aws(task),
+                                        task = catalog,
+                                        schema = schema,
+                                        location = location,
+                                        compressionCodec = task.getCompressionCodec)
+            cr.run()
+        }
+
+        Exec.newConfigDiff
     }

-
-
-
-
-
-
-
+    private def configure(task: PluginTask,
+                          schema: Schema): Unit =
+    {
+        // sequence_format
+        try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+        catch {
+            case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+        }
+
+        // compression_codec
+        CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+            case Some(v) => task.setCompressionCodec(v)
+            case None =>
+                val unsupported: String = task.getCompressionCodecString
+                val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+        }
+
+        // column_options
+        task.getColumnOptions.forEach { (k: String,
+                                         _) =>
+            val c = schema.lookupColumn(k)
+            if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+        }
+
+        // canned_acl
+        CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+            case Some(v) => task.setCannedAcl(v)
+            case None =>
+                val unsupported: String = task.getCannedAclString
+                val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+        }
     }

-
-
-
-
-
+    override def resume(taskSource: TaskSource,
+                        schema: Schema,
+                        taskCount: Int,
+                        control: OutputPlugin.Control): ConfigDiff =
+    {
+        throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
     }

-
-
-
-
-
-
-
+    override def cleanup(taskSource: TaskSource,
+                         schema: Schema,
+                         taskCount: Int,
+                         successTaskReports: JList[TaskReport]): Unit =
+    {
+        successTaskReports.forEach { tr =>
+            logger.info(
+                s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+                    + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+                    + s"etag: ${tr.get(classOf[String], "etag", null)}")
+        }
     }
-  }

-
+    override def open(taskSource: TaskSource,
                       schema: Schema,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                      taskIndex: Int): TransactionalPageOutput =
+    {
+        val task = taskSource.loadTask(classOf[PluginTask])
+        val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+        val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+        val destS3bucket: String = task.getBucket
+        val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+        val pageReader: PageReader = new PageReader(schema)
+        val aws: Aws = Aws(task)
+        val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+            .withPath(bufferFile)
+            .withSchema(schema)
+            .withTimestampFormatters(timestampFormatters)
+            .withCompressionCodec(task.getCompressionCodec)
+            .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+            .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+            .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+            .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+            .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+            .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+            .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+            .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+            .build()
+
+        logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+        S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
     }
-  }
-
-  override def open(taskSource: TaskSource,
-                    schema: Schema,
-                    taskIndex: Int): TransactionalPageOutput = {
-    val task = taskSource.loadTask(classOf[PluginTask])
-    val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
-    val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
-    val destS3bucket: String = task.getBucket
-    val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
-    val pageReader: PageReader = new PageReader(schema)
-    val aws: Aws = Aws(task)
-    val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
-    val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
-      .withPath(bufferFile)
-      .withSchema(schema)
-      .withTimestampFormatters(timestampFormatters)
-      .withCompressionCodec(task.getCompressionCodec)
-      .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
-      .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
-      .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
-      .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
-      .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
-      .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
-      .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
-      .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
-      .build()
-
-    logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
-    S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
-  }

 }