embulk-output-s3_parquet 0.0.2 → 0.0.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: fac4eca9b96e218930333123d01d82ec5acae146
- data.tar.gz: dc1de337cd4ada9fa86d53239be85b5a154115c1
+ metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+ data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
  SHA512:
- metadata.gz: 8451431ba506a80159ad768017268b826e73f6010fcb923804f6b3a8f8cb27e7669e90c4d8f25a6d348d55ce558fd8ea488942451a973c7d8387bd70a825fb77
- data.tar.gz: 835f5d30265595270925e57587bea633a1e6156cfa8f7b660af3eac1539fcf9b40f1e7dfb986ae4a290c67b7eee2638275846cfa86edd217168ff5b2c0672313
+ metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+ data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ 0.0.3 (2019-07-17)
+ ==================
+
+ * [New Feature] Add `catalog` option to register a new table for the data created by the `s3_parquet` plugin.
+ * [Enhancement] Update dependencies.
+
  0.0.2 (2019-01-21)
  ==================
 
data/README.md CHANGED
@@ -59,6 +59,13 @@
  - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
  - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
  - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+ - **catalog**: Register a table in AWS Glue Data Catalog if this option is specified (optional)
+   - **catalog_id**: Glue Data Catalog ID if you use a catalog different from the account/region default catalog. (string, optional)
+   - **database**: The name of the database (string, required)
+   - **table**: The name of the table (string, required)
+   - **column_options**: key-value pairs where the key is a column name and the value is options for that column. (string to options map, default: `{}`)
+     - **type**: type of the column when this plugin creates a new table (e.g. `STRING`, `BIGINT`) (string, default: depends on the input column type: `BIGINT` if `long`, `BOOLEAN` if `boolean`, `DOUBLE` if `double`, and `STRING` if `string`, `timestamp`, or `json`)
+   - **operation_if_exists**: operation to perform if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
  - **endpoint**: The AWS Service endpoint (string, optional)
  - **region**: The AWS region (string, optional)
  - **http_proxy**: Indicate whether using when accessing AWS via http proxy. (optional)
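The `catalog` option documented above maps onto the `CatalogRegistrator.Task` interface added later in this diff. A minimal sketch of how it could look in an Embulk config; only the option keys come from this diff, while the bucket, database, table, and column names are purely illustrative:

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                # illustrative
  path_prefix: path/to/my-obj.     # illustrative
  compression_codec: snappy
  catalog:
    # catalog_id omitted, so the account/region default catalog is used
    database: my_database          # illustrative
    table: my_table                # illustrative
    column_options:
      payload:                     # illustrative column name
        type: STRING               # overrides the default Glue type mapping
    operation_if_exists: skip      # default is "delete"
```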
data/build.gradle CHANGED
@@ -2,7 +2,6 @@ plugins {
  id "scala"
  id "com.jfrog.bintray" version "1.1"
  id "com.github.jruby-gradle.base" version "1.5.0"
- id "com.diffplug.gradle.spotless" version "3.13.0"
  id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
  }
  import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
  provided
  }
 
- version = "0.0.2"
+ version = "0.0.3"
 
  sourceCompatibility = 1.8
  targetCompatibility = 1.8
 
  dependencies {
- compile "org.embulk:embulk-core:0.9.12"
- provided "org.embulk:embulk-core:0.9.12"
+ compile "org.embulk:embulk-core:0.9.17"
+ provided "org.embulk:embulk-core:0.9.17"
 
- compile 'org.scala-lang:scala-library:2.12.8'
- ['s3', 'sts'].each { v ->
- compile "com.amazonaws:aws-java-sdk-${v}:1.11.479"
+ compile 'org.scala-lang:scala-library:2.13.0'
+ ['glue', 's3', 'sts'].each { v ->
+ compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
  }
  ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
- compile "org.apache.parquet:parquet-${v}:1.10.0"
+ compile "org.apache.parquet:parquet-${v}:1.10.1"
  }
  compile 'org.apache.hadoop:hadoop-common:2.9.2'
- compile 'org.xerial.snappy:snappy-java:1.1.7.2'
+ compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
- testCompile 'org.scalatest:scalatest_2.12:3.0.5'
- testCompile 'org.embulk:embulk-test:0.9.12'
- testCompile 'org.embulk:embulk-standards:0.9.12'
+ testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+ testCompile 'org.embulk:embulk-test:0.9.17'
+ testCompile 'org.embulk:embulk-standards:0.9.17'
  testCompile 'cloud.localstack:localstack-utils:0.1.15'
- testCompile 'org.apache.parquet:parquet-tools:1.8.0'
+ testCompile 'org.apache.parquet:parquet-tools:1.10.1'
  testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
  }
 
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala ADDED
@@ -0,0 +1,178 @@
+ package org.embulk.output.s3_parquet
+
+
+ import java.util.{Optional, Map => JMap}
+
+ import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+ import org.apache.parquet.hadoop.metadata.CompressionCodecName
+ import org.embulk.config.{Config, ConfigDefault, ConfigException}
+ import org.embulk.output.s3_parquet.aws.Aws
+ import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+ import org.embulk.spi.Schema
+ import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+ import org.slf4j.{Logger, LoggerFactory}
+
+ import scala.jdk.CollectionConverters._
+ import scala.util.Try
+
+
+ object CatalogRegistrator
+ {
+ trait Task
+ extends org.embulk.config.Task
+ {
+ @Config("catalog_id")
+ @ConfigDefault("null")
+ def getCatalogId: Optional[String]
+
+ @Config("database")
+ def getDatabase: String
+
+ @Config("table")
+ def getTable: String
+
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, ColumnOptions]
+
+ @Config("operation_if_exists")
+ @ConfigDefault("\"delete\"")
+ def getOperationIfExists: String
+ }
+
+ trait ColumnOptions
+ {
+ @Config("type")
+ def getType: String
+ }
+
+ def apply(aws: Aws,
+ task: Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None): CatalogRegistrator =
+ {
+ new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+ }
+ }
+
+ class CatalogRegistrator(aws: Aws,
+ task: CatalogRegistrator.Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None)
+ {
+ val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+ def run(): Unit =
+ {
+ if (doesTableExists()) {
+ task.getOperationIfExists match {
+ case "skip" =>
+ logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+ return
+
+ case "delete" =>
+ logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+ deleteTable()
+
+ case unknown =>
+ throw new ConfigException(s"Unsupported operation: $unknown")
+ }
+ }
+ registerNewParquetTable()
+ showNewTableInfo()
+ }
+
+ def showNewTableInfo(): Unit =
+ {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ val t = aws.withGlue(_.getTable(req)).getTable
+ logger.info(s"Created a table: ${t.toString}")
+ }
+
+ def doesTableExists(): Boolean =
+ {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ Try(aws.withGlue(_.getTable(req))).isSuccess
+ }
+
+ def deleteTable(): Unit =
+ {
+ val req = new DeleteTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+ aws.withGlue(_.deleteTable(req))
+ }
+
+ def registerNewParquetTable(): Unit =
+ {
+ logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+ val req = new CreateTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setTableInput(new TableInput()
+ .withName(task.getTable)
+ .withDescription("Created by embulk-output-s3_parquet")
+ .withTableType("EXTERNAL_TABLE")
+ .withParameters(Map("EXTERNAL" -> "TRUE",
+ "classification" -> "parquet",
+ "parquet.compression" -> compressionCodec.name()).asJava)
+ .withStorageDescriptor(new StorageDescriptor()
+ .withColumns(getGlueSchema: _*)
+ .withLocation(location)
+ .withCompressed(isCompressed)
+ .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+ .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+ .withSerdeInfo(new SerDeInfo()
+ .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+ .withParameters(Map("serialization.format" -> "1").asJava)
+ )
+ )
+ )
+ aws.withGlue(_.createTable(req))
+ }
+
+ private def getGlueSchema: Seq[Column] =
+ {
+ val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+ schema.getColumns.asScala.toSeq.map { c =>
+ val cType: String =
+ if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+ else convertEmbulkType2GlueType(c.getType)
+ new Column()
+ .withName(c.getName)
+ .withType(cType)
+ }
+ }
+
+ private def convertEmbulkType2GlueType(t: Type): String =
+ {
+ t match {
+ case _: BooleanType => "boolean"
+ case _: LongType => "bigint"
+ case _: DoubleType => "double"
+ case _: StringType => "string"
+ case _: TimestampType => "string"
+ case _: JsonType => "string"
+ case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+ }
+ }
+
+ private def isCompressed: Boolean =
+ {
+ !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+ }
+
+ }
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
  import org.embulk.spi.time.TimestampFormatter
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
  import org.embulk.spi.util.Timestamps
- import org.slf4j.Logger
+ import org.slf4j.{Logger, LoggerFactory}
 
- object S3ParquetOutputPlugin {
 
- trait PluginTask
- extends Task
- with TimestampFormatter.Task
- with Aws.Task {
+ object S3ParquetOutputPlugin
+ {
 
- @Config("bucket")
- def getBucket: String
+ trait PluginTask
+ extends Task
+ with TimestampFormatter.Task
+ with Aws.Task
+ {
 
- @Config("path_prefix")
- @ConfigDefault("\"\"")
- def getPathPrefix: String
+ @Config("bucket")
+ def getBucket: String
 
- @Config("sequence_format")
- @ConfigDefault("\"%03d.%02d.\"")
- def getSequenceFormat: String
+ @Config("path_prefix")
+ @ConfigDefault("\"\"")
+ def getPathPrefix: String
 
- @Config("file_ext")
- @ConfigDefault("\"parquet\"")
- def getFileExt: String
+ @Config("sequence_format")
+ @ConfigDefault("\"%03d.%02d.\"")
+ def getSequenceFormat: String
 
- @Config("compression_codec")
- @ConfigDefault("\"uncompressed\"")
- def getCompressionCodecString: String
+ @Config("file_ext")
+ @ConfigDefault("\"parquet\"")
+ def getFileExt: String
 
- def setCompressionCodec(v: CompressionCodecName): Unit
+ @Config("compression_codec")
+ @ConfigDefault("\"uncompressed\"")
+ def getCompressionCodecString: String
 
- def getCompressionCodec: CompressionCodecName
+ def setCompressionCodec(v: CompressionCodecName): Unit
 
- @Config("column_options")
- @ConfigDefault("{}")
- def getColumnOptions: JMap[String, TimestampColumnOption]
+ def getCompressionCodec: CompressionCodecName
 
- @Config("canned_acl")
- @ConfigDefault("\"private\"")
- def getCannedAclString: String
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, TimestampColumnOption]
 
- def setCannedAcl(v: CannedAccessControlList): Unit
+ @Config("canned_acl")
+ @ConfigDefault("\"private\"")
+ def getCannedAclString: String
 
- def getCannedAcl: CannedAccessControlList
+ def setCannedAcl(v: CannedAccessControlList): Unit
 
- @Config("block_size")
- @ConfigDefault("null")
- def getBlockSize: Optional[Int]
+ def getCannedAcl: CannedAccessControlList
 
- @Config("page_size")
- @ConfigDefault("null")
- def getPageSize: Optional[Int]
+ @Config("block_size")
+ @ConfigDefault("null")
+ def getBlockSize: Optional[Int]
 
- @Config("max_padding_size")
- @ConfigDefault("null")
- def getMaxPaddingSize: Optional[Int]
+ @Config("page_size")
+ @ConfigDefault("null")
+ def getPageSize: Optional[Int]
 
- @Config("enable_dictionary_encoding")
- @ConfigDefault("null")
- def getEnableDictionaryEncoding: Optional[Boolean]
+ @Config("max_padding_size")
+ @ConfigDefault("null")
+ def getMaxPaddingSize: Optional[Int]
 
- @Config("buffer_dir")
- @ConfigDefault("null")
- def getBufferDir: Optional[String]
+ @Config("enable_dictionary_encoding")
+ @ConfigDefault("null")
+ def getEnableDictionaryEncoding: Optional[Boolean]
 
- }
+ @Config("buffer_dir")
+ @ConfigDefault("null")
+ def getBufferDir: Optional[String]
+
+ @Config("catalog")
+ @ConfigDefault("null")
+ def getCatalog: Optional[CatalogRegistrator.Task]
+ }
 
  }
 
  class S3ParquetOutputPlugin
- extends OutputPlugin {
-
- val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
- private def withPluginContextClassLoader[A](f: => A): A = {
- val original: ClassLoader = Thread.currentThread.getContextClassLoader
- Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
- try f
- finally Thread.currentThread.setContextClassLoader(original)
- }
-
- override def transaction(config: ConfigSource,
- schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff = {
- val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
- withPluginContextClassLoader {
- configure(task, schema)
- control.run(task.dump)
- }
+ extends OutputPlugin
+ {
 
- Exec.newConfigDiff
- }
+ val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+ private def withPluginContextClassLoader[A](f: => A): A =
+ {
+ val original: ClassLoader = Thread.currentThread.getContextClassLoader
+ Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+ try f
+ finally Thread.currentThread.setContextClassLoader(original)
+ }
 
- private def configure(task: PluginTask,
- schema: Schema): Unit = {
- // sequence_format
- try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
- catch {
- case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+ override def transaction(config: ConfigSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control): ConfigDiff =
+ {
+ val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+ withPluginContextClassLoader {
+ configure(task, schema)
+ control.run(task.dump)
+ }
+ task.getCatalog.ifPresent { catalog =>
+ val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+ val cr = CatalogRegistrator(aws = Aws(task),
+ task = catalog,
+ schema = schema,
+ location = location,
+ compressionCodec = task.getCompressionCodec)
+ cr.run()
+ }
+
+ Exec.newConfigDiff
  }
 
- // compression_codec
- CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
- case Some(v) => task.setCompressionCodec(v)
- case None =>
- val unsupported: String = task.getCompressionCodecString
- val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+ private def configure(task: PluginTask,
+ schema: Schema): Unit =
+ {
+ // sequence_format
+ try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+ catch {
+ case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+ }
+
+ // compression_codec
+ CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+ case Some(v) => task.setCompressionCodec(v)
+ case None =>
+ val unsupported: String = task.getCompressionCodecString
+ val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+ throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+ }
+
+ // column_options
+ task.getColumnOptions.forEach { (k: String,
+ _) =>
+ val c = schema.lookupColumn(k)
+ if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ }
+
+ // canned_acl
+ CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+ case Some(v) => task.setCannedAcl(v)
+ case None =>
+ val unsupported: String = task.getCannedAclString
+ val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+ throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+ }
  }
 
- // column_options
- task.getColumnOptions.forEach { (k: String,
- _) =>
- val c = schema.lookupColumn(k)
- if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ override def resume(taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control): ConfigDiff =
+ {
+ throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
  }
 
- // canned_acl
- CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
- case Some(v) => task.setCannedAcl(v)
- case None =>
- val unsupported: String = task.getCannedAclString
- val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+ override def cleanup(taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ successTaskReports: JList[TaskReport]): Unit =
+ {
+ successTaskReports.forEach { tr =>
+ logger.info(
+ s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+ + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+ + s"etag: ${tr.get(classOf[String], "etag", null)}")
+ }
  }
- }
 
- override def resume(taskSource: TaskSource,
+ override def open(taskSource: TaskSource,
  schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff = {
- throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
- }
-
- override def cleanup(taskSource: TaskSource,
- schema: Schema,
- taskCount: Int,
- successTaskReports: JList[TaskReport]): Unit = {
- successTaskReports.forEach { tr =>
- logger.info(
- s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
- + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
- + s"etag: ${tr.get(classOf[String], "etag", null)}")
+ taskIndex: Int): TransactionalPageOutput =
+ {
+ val task = taskSource.loadTask(classOf[PluginTask])
+ val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+ val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+ val destS3bucket: String = task.getBucket
+ val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+ val pageReader: PageReader = new PageReader(schema)
+ val aws: Aws = Aws(task)
+ val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+ val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+ .withPath(bufferFile)
+ .withSchema(schema)
+ .withTimestampFormatters(timestampFormatters)
+ .withCompressionCodec(task.getCompressionCodec)
+ .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+ .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+ .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+ .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+ .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+ .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+ .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+ .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+ .build()
+
+ logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+ S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
  }
- }
-
- override def open(taskSource: TaskSource,
- schema: Schema,
- taskIndex: Int): TransactionalPageOutput = {
- val task = taskSource.loadTask(classOf[PluginTask])
- val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
- val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
- val destS3bucket: String = task.getBucket
- val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
- val pageReader: PageReader = new PageReader(schema)
- val aws: Aws = Aws(task)
- val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
- val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
- .withPath(bufferFile)
- .withSchema(schema)
- .withTimestampFormatters(timestampFormatters)
- .withCompressionCodec(task.getCompressionCodec)
- .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
- .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
- .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
- .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
- .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
- .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
- .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
- .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
- .build()
-
- logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
- S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
- }
 
  }
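A note on the `transaction()` change above: when `catalog` is set, the Glue table location is derived from `bucket` and `path_prefix` by stripping the final path segment (`replaceFirst("(.*/)[^/]+$", "$1")`), so the registered table points at the directory that holds the Parquet objects. A hedged sketch with illustrative names:

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                # illustrative
  path_prefix: logs/access/data.   # illustrative; objects are written as logs/access/data.000.00.parquet, ...
  catalog:
    database: my_database          # illustrative
    table: access_logs             # illustrative
# With the config above, the registered Glue table location would be
# s3://my-bucket/logs/access/ (the trailing "data." segment is stripped).
```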