embulk-output-s3_parquet 0.0.2 → 0.0.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: fac4eca9b96e218930333123d01d82ec5acae146
- data.tar.gz: dc1de337cd4ada9fa86d53239be85b5a154115c1
+ metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+ data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
  SHA512:
- metadata.gz: 8451431ba506a80159ad768017268b826e73f6010fcb923804f6b3a8f8cb27e7669e90c4d8f25a6d348d55ce558fd8ea488942451a973c7d8387bd70a825fb77
- data.tar.gz: 835f5d30265595270925e57587bea633a1e6156cfa8f7b660af3eac1539fcf9b40f1e7dfb986ae4a290c67b7eee2638275846cfa86edd217168ff5b2c0672313
+ metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+ data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ 0.0.3 (2019-07-17)
+ ==================
+
+ * [New Feature] Add `catalog` option to register a new table for the data created by the `s3_parquet` plugin.
+ * [Enhancement] Update dependencies.
+
  0.0.2 (2019-01-21)
  ==================
 
data/README.md CHANGED
@@ -59,6 +59,13 @@
  - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
  - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
  - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+ - **catalog**: Register a table in AWS Glue Data Catalog if this option is specified (optional)
+   - **catalog_id**: Glue Data Catalog ID if you use a catalog different from the account/region default catalog. (string, optional)
+   - **database**: The name of the database (string, required)
+   - **table**: The name of the table (string, required)
+   - **column_options**: key-value pairs where the key is a column name and the value is options for that column. (string to options map, default: `{}`)
+     - **type**: type of the column when this plugin creates a new table (e.g. `STRING`, `BIGINT`) (string, default: depends on the input column type: `BIGINT` if `long`, `BOOLEAN` if `boolean`, `DOUBLE` if `double`, and `STRING` if `string`, `timestamp`, or `json`)
+   - **operation_if_exists**: operation to perform if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
  - **endpoint**: The AWS Service endpoint (string, optional)
  - **region**: The AWS region (string, optional)
  - **http_proxy**: Indicate whether using when accessing AWS via http proxy. (optional)
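The `catalog` option documented above maps onto the `CatalogRegistrator.Task` interface added later in this diff. A minimal sketch of how it could look in an Embulk config; only the option keys come from this diff, while the bucket, database, table, and column names are purely illustrative:

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                # illustrative
  path_prefix: path/to/my-obj.     # illustrative
  compression_codec: snappy
  catalog:
    # catalog_id omitted, so the account/region default catalog is used
    database: my_database          # illustrative
    table: my_table                # illustrative
    column_options:
      payload:                     # illustrative column name
        type: STRING               # overrides the default Glue type mapping
    operation_if_exists: skip      # default is "delete"
```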
data/build.gradle CHANGED
@@ -2,7 +2,6 @@ plugins {
  id "scala"
  id "com.jfrog.bintray" version "1.1"
  id "com.github.jruby-gradle.base" version "1.5.0"
- id "com.diffplug.gradle.spotless" version "3.13.0"
  id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
  }
  import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
  provided
  }
 
- version = "0.0.2"
+ version = "0.0.3"
 
  sourceCompatibility = 1.8
  targetCompatibility = 1.8
 
  dependencies {
- compile "org.embulk:embulk-core:0.9.12"
- provided "org.embulk:embulk-core:0.9.12"
+ compile "org.embulk:embulk-core:0.9.17"
+ provided "org.embulk:embulk-core:0.9.17"
 
- compile 'org.scala-lang:scala-library:2.12.8'
- ['s3', 'sts'].each { v ->
- compile "com.amazonaws:aws-java-sdk-${v}:1.11.479"
+ compile 'org.scala-lang:scala-library:2.13.0'
+ ['glue', 's3', 'sts'].each { v ->
+ compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
  }
  ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
- compile "org.apache.parquet:parquet-${v}:1.10.0"
+ compile "org.apache.parquet:parquet-${v}:1.10.1"
  }
  compile 'org.apache.hadoop:hadoop-common:2.9.2'
- compile 'org.xerial.snappy:snappy-java:1.1.7.2'
+ compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
- testCompile 'org.scalatest:scalatest_2.12:3.0.5'
- testCompile 'org.embulk:embulk-test:0.9.12'
- testCompile 'org.embulk:embulk-standards:0.9.12'
+ testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+ testCompile 'org.embulk:embulk-test:0.9.17'
+ testCompile 'org.embulk:embulk-standards:0.9.17'
  testCompile 'cloud.localstack:localstack-utils:0.1.15'
- testCompile 'org.apache.parquet:parquet-tools:1.8.0'
+ testCompile 'org.apache.parquet:parquet-tools:1.10.1'
  testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
  }
 
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala ADDED
@@ -0,0 +1,178 @@
+ package org.embulk.output.s3_parquet
+
+
+ import java.util.{Optional, Map => JMap}
+
+ import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+ import org.apache.parquet.hadoop.metadata.CompressionCodecName
+ import org.embulk.config.{Config, ConfigDefault, ConfigException}
+ import org.embulk.output.s3_parquet.aws.Aws
+ import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+ import org.embulk.spi.Schema
+ import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+ import org.slf4j.{Logger, LoggerFactory}
+
+ import scala.jdk.CollectionConverters._
+ import scala.util.Try
+
+
+ object CatalogRegistrator
+ {
+ trait Task
+ extends org.embulk.config.Task
+ {
+ @Config("catalog_id")
+ @ConfigDefault("null")
+ def getCatalogId: Optional[String]
+
+ @Config("database")
+ def getDatabase: String
+
+ @Config("table")
+ def getTable: String
+
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, ColumnOptions]
+
+ @Config("operation_if_exists")
+ @ConfigDefault("\"delete\"")
+ def getOperationIfExists: String
+ }
+
+ trait ColumnOptions
+ {
+ @Config("type")
+ def getType: String
+ }
+
+ def apply(aws: Aws,
+ task: Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None): CatalogRegistrator =
+ {
+ new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+ }
+ }
+
+ class CatalogRegistrator(aws: Aws,
+ task: CatalogRegistrator.Task,
+ schema: Schema,
+ location: String,
+ compressionCodec: CompressionCodecName,
+ loggerOption: Option[Logger] = None)
+ {
+ val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+ def run(): Unit =
+ {
+ if (doesTableExists()) {
+ task.getOperationIfExists match {
+ case "skip" =>
+ logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+ return
+
+ case "delete" =>
+ logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+ deleteTable()
+
+ case unknown =>
+ throw new ConfigException(s"Unsupported operation: $unknown")
+ }
+ }
+ registerNewParquetTable()
+ showNewTableInfo()
+ }
+
+ def showNewTableInfo(): Unit =
+ {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ val t = aws.withGlue(_.getTable(req)).getTable
+ logger.info(s"Created a table: ${t.toString}")
+ }
+
+ def doesTableExists(): Boolean =
+ {
+ val req = new GetTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+
+ Try(aws.withGlue(_.getTable(req))).isSuccess
+ }
+
+ def deleteTable(): Unit =
+ {
+ val req = new DeleteTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setName(task.getTable)
+ aws.withGlue(_.deleteTable(req))
+ }
+
+ def registerNewParquetTable(): Unit =
+ {
+ logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+ val req = new CreateTableRequest()
+ task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+ req.setDatabaseName(task.getDatabase)
+ req.setTableInput(new TableInput()
+ .withName(task.getTable)
+ .withDescription("Created by embulk-output-s3_parquet")
+ .withTableType("EXTERNAL_TABLE")
+ .withParameters(Map("EXTERNAL" -> "TRUE",
+ "classification" -> "parquet",
+ "parquet.compression" -> compressionCodec.name()).asJava)
+ .withStorageDescriptor(new StorageDescriptor()
+ .withColumns(getGlueSchema: _*)
+ .withLocation(location)
+ .withCompressed(isCompressed)
+ .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+ .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+ .withSerdeInfo(new SerDeInfo()
+ .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+ .withParameters(Map("serialization.format" -> "1").asJava)
+ )
+ )
+ )
+ aws.withGlue(_.createTable(req))
+ }
+
+ private def getGlueSchema: Seq[Column] =
+ {
+ val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+ schema.getColumns.asScala.toSeq.map { c =>
+ val cType: String =
+ if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+ else convertEmbulkType2GlueType(c.getType)
+ new Column()
+ .withName(c.getName)
+ .withType(cType)
+ }
+ }
+
+ private def convertEmbulkType2GlueType(t: Type): String =
+ {
+ t match {
+ case _: BooleanType => "boolean"
+ case _: LongType => "bigint"
+ case _: DoubleType => "double"
+ case _: StringType => "string"
+ case _: TimestampType => "string"
+ case _: JsonType => "string"
+ case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+ }
+ }
+
+ private def isCompressed: Boolean =
+ {
+ !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+ }
+
+ }
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
  import org.embulk.spi.time.TimestampFormatter
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
  import org.embulk.spi.util.Timestamps
- import org.slf4j.Logger
+ import org.slf4j.{Logger, LoggerFactory}
 
- object S3ParquetOutputPlugin {
 
- trait PluginTask
- extends Task
- with TimestampFormatter.Task
- with Aws.Task {
+ object S3ParquetOutputPlugin
+ {
 
- @Config("bucket")
- def getBucket: String
+ trait PluginTask
+ extends Task
+ with TimestampFormatter.Task
+ with Aws.Task
+ {
 
- @Config("path_prefix")
- @ConfigDefault("\"\"")
- def getPathPrefix: String
+ @Config("bucket")
+ def getBucket: String
 
- @Config("sequence_format")
- @ConfigDefault("\"%03d.%02d.\"")
- def getSequenceFormat: String
+ @Config("path_prefix")
+ @ConfigDefault("\"\"")
+ def getPathPrefix: String
 
- @Config("file_ext")
- @ConfigDefault("\"parquet\"")
- def getFileExt: String
+ @Config("sequence_format")
+ @ConfigDefault("\"%03d.%02d.\"")
+ def getSequenceFormat: String
 
- @Config("compression_codec")
- @ConfigDefault("\"uncompressed\"")
- def getCompressionCodecString: String
+ @Config("file_ext")
+ @ConfigDefault("\"parquet\"")
+ def getFileExt: String
 
- def setCompressionCodec(v: CompressionCodecName): Unit
+ @Config("compression_codec")
+ @ConfigDefault("\"uncompressed\"")
+ def getCompressionCodecString: String
 
- def getCompressionCodec: CompressionCodecName
+ def setCompressionCodec(v: CompressionCodecName): Unit
 
- @Config("column_options")
- @ConfigDefault("{}")
- def getColumnOptions: JMap[String, TimestampColumnOption]
+ def getCompressionCodec: CompressionCodecName
 
- @Config("canned_acl")
- @ConfigDefault("\"private\"")
- def getCannedAclString: String
+ @Config("column_options")
+ @ConfigDefault("{}")
+ def getColumnOptions: JMap[String, TimestampColumnOption]
 
- def setCannedAcl(v: CannedAccessControlList): Unit
+ @Config("canned_acl")
+ @ConfigDefault("\"private\"")
+ def getCannedAclString: String
 
- def getCannedAcl: CannedAccessControlList
+ def setCannedAcl(v: CannedAccessControlList): Unit
 
- @Config("block_size")
- @ConfigDefault("null")
- def getBlockSize: Optional[Int]
+ def getCannedAcl: CannedAccessControlList
 
- @Config("page_size")
- @ConfigDefault("null")
- def getPageSize: Optional[Int]
+ @Config("block_size")
+ @ConfigDefault("null")
+ def getBlockSize: Optional[Int]
 
- @Config("max_padding_size")
- @ConfigDefault("null")
- def getMaxPaddingSize: Optional[Int]
+ @Config("page_size")
+ @ConfigDefault("null")
+ def getPageSize: Optional[Int]
 
- @Config("enable_dictionary_encoding")
- @ConfigDefault("null")
- def getEnableDictionaryEncoding: Optional[Boolean]
+ @Config("max_padding_size")
+ @ConfigDefault("null")
+ def getMaxPaddingSize: Optional[Int]
 
- @Config("buffer_dir")
- @ConfigDefault("null")
- def getBufferDir: Optional[String]
+ @Config("enable_dictionary_encoding")
+ @ConfigDefault("null")
+ def getEnableDictionaryEncoding: Optional[Boolean]
 
- }
+ @Config("buffer_dir")
+ @ConfigDefault("null")
+ def getBufferDir: Optional[String]
+
+ @Config("catalog")
+ @ConfigDefault("null")
+ def getCatalog: Optional[CatalogRegistrator.Task]
+ }
 
  }
 
  class S3ParquetOutputPlugin
- extends OutputPlugin {
-
- val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
- private def withPluginContextClassLoader[A](f: => A): A = {
- val original: ClassLoader = Thread.currentThread.getContextClassLoader
- Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
- try f
- finally Thread.currentThread.setContextClassLoader(original)
- }
-
- override def transaction(config: ConfigSource,
- schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff = {
- val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
- withPluginContextClassLoader {
- configure(task, schema)
- control.run(task.dump)
- }
+ extends OutputPlugin
+ {
 
- Exec.newConfigDiff
- }
+ val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+ private def withPluginContextClassLoader[A](f: => A): A =
+ {
+ val original: ClassLoader = Thread.currentThread.getContextClassLoader
+ Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+ try f
+ finally Thread.currentThread.setContextClassLoader(original)
+ }
 
- private def configure(task: PluginTask,
- schema: Schema): Unit = {
- // sequence_format
- try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
- catch {
- case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+ override def transaction(config: ConfigSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control): ConfigDiff =
+ {
+ val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+ withPluginContextClassLoader {
+ configure(task, schema)
+ control.run(task.dump)
+ }
+ task.getCatalog.ifPresent { catalog =>
+ val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+ val cr = CatalogRegistrator(aws = Aws(task),
+ task = catalog,
+ schema = schema,
+ location = location,
+ compressionCodec = task.getCompressionCodec)
+ cr.run()
+ }
+
+ Exec.newConfigDiff
  }
 
- // compression_codec
- CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
- case Some(v) => task.setCompressionCodec(v)
- case None =>
- val unsupported: String = task.getCompressionCodecString
- val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+ private def configure(task: PluginTask,
+ schema: Schema): Unit =
+ {
+ // sequence_format
+ try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+ catch {
+ case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+ }
+
+ // compression_codec
+ CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+ case Some(v) => task.setCompressionCodec(v)
+ case None =>
+ val unsupported: String = task.getCompressionCodecString
+ val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+ throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+ }
+
+ // column_options
+ task.getColumnOptions.forEach { (k: String,
+ _) =>
+ val c = schema.lookupColumn(k)
+ if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ }
+
+ // canned_acl
+ CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+ case Some(v) => task.setCannedAcl(v)
+ case None =>
+ val unsupported: String = task.getCannedAclString
+ val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+ throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+ }
  }
 
- // column_options
- task.getColumnOptions.forEach { (k: String,
- _) =>
- val c = schema.lookupColumn(k)
- if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+ override def resume(taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ control: OutputPlugin.Control): ConfigDiff =
+ {
+ throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
  }
 
- // canned_acl
- CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
- case Some(v) => task.setCannedAcl(v)
- case None =>
- val unsupported: String = task.getCannedAclString
- val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
- throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+ override def cleanup(taskSource: TaskSource,
+ schema: Schema,
+ taskCount: Int,
+ successTaskReports: JList[TaskReport]): Unit =
+ {
+ successTaskReports.forEach { tr =>
+ logger.info(
+ s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+ + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+ + s"etag: ${tr.get(classOf[String], "etag", null)}")
+ }
  }
- }
 
- override def resume(taskSource: TaskSource,
+ override def open(taskSource: TaskSource,
  schema: Schema,
- taskCount: Int,
- control: OutputPlugin.Control): ConfigDiff = {
- throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
- }
-
- override def cleanup(taskSource: TaskSource,
- schema: Schema,
- taskCount: Int,
- successTaskReports: JList[TaskReport]): Unit = {
- successTaskReports.forEach { tr =>
- logger.info(
- s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
- + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
- + s"etag: ${tr.get(classOf[String], "etag", null)}")
+ taskIndex: Int): TransactionalPageOutput =
+ {
+ val task = taskSource.loadTask(classOf[PluginTask])
+ val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+ val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+ val destS3bucket: String = task.getBucket
+ val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+ val pageReader: PageReader = new PageReader(schema)
+ val aws: Aws = Aws(task)
+ val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+ val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+ .withPath(bufferFile)
+ .withSchema(schema)
+ .withTimestampFormatters(timestampFormatters)
+ .withCompressionCodec(task.getCompressionCodec)
+ .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+ .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+ .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+ .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+ .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+ .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+ .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+ .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+ .build()
+
+ logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+ S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
  }
- }
-
- override def open(taskSource: TaskSource,
- schema: Schema,
- taskIndex: Int): TransactionalPageOutput = {
- val task = taskSource.loadTask(classOf[PluginTask])
- val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
- val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
- val destS3bucket: String = task.getBucket
- val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
- val pageReader: PageReader = new PageReader(schema)
- val aws: Aws = Aws(task)
- val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
- val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
- .withPath(bufferFile)
- .withSchema(schema)
- .withTimestampFormatters(timestampFormatters)
- .withCompressionCodec(task.getCompressionCodec)
- .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
- .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
- .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
- .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
- .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
- .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
- .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
- .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
- .build()
-
- logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
- S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
- }
 
  }
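A note on the `transaction()` change above: when `catalog` is set, the Glue table location is derived from `bucket` and `path_prefix` by stripping the final path segment (`replaceFirst("(.*/)[^/]+$", "$1")`), so the registered table points at the directory that holds the Parquet objects. A hedged sketch with illustrative names:

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                # illustrative
  path_prefix: logs/access/data.   # illustrative; objects are written as logs/access/data.000.00.parquet, ...
  catalog:
    database: my_database          # illustrative
    table: access_logs             # illustrative
# With the config above, the registered Glue table location would be
# s3://my-bucket/logs/access/ (the trailing "data." segment is stripped).
```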