embulk-output-s3_parquet 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: fac4eca9b96e218930333123d01d82ec5acae146
-  data.tar.gz: dc1de337cd4ada9fa86d53239be85b5a154115c1
+  metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+  data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
 SHA512:
-  metadata.gz: 8451431ba506a80159ad768017268b826e73f6010fcb923804f6b3a8f8cb27e7669e90c4d8f25a6d348d55ce558fd8ea488942451a973c7d8387bd70a825fb77
-  data.tar.gz: 835f5d30265595270925e57587bea633a1e6156cfa8f7b660af3eac1539fcf9b40f1e7dfb986ae4a290c67b7eee2638275846cfa86edd217168ff5b2c0672313
+  metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+  data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+0.0.3 (2019-07-17)
+==================
+
+* [New Feature] Add a `catalog` option to register a new table for data created by the `s3_parquet` plugin.
+* [Enhancement] Update dependencies.
+
 0.0.2 (2019-01-21)
 ==================
 
data/README.md CHANGED
@@ -59,6 +59,13 @@
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+- **catalog**: register the output as a table in the AWS Glue Data Catalog if this option is specified. (optional)
+  - **catalog_id**: the Glue Data Catalog ID, if you use a catalog different from the account/region default catalog. (string, optional)
+  - **database**: the name of the database. (string, required)
+  - **table**: the name of the table. (string, required)
+  - **column_options**: a key-value map whose keys are column names and whose values are options for that column. (string-to-options map, default: `{}`)
+    - **type**: the column type used when this plugin creates a new table (e.g. `STRING`, `BIGINT`). (string, default: depends on the input column type: `BIGINT` for `long`, `BOOLEAN` for `boolean`, `DOUBLE` for `double`, and `STRING` for `string`, `timestamp`, and `json`)
+  - **operation_if_exists**: the operation to perform if the table already exists. Available operations are `"delete"` and `"skip"`. (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
 - **http_proxy**: indicates whether to use an HTTP proxy when accessing AWS. (optional)
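
For context, the new `catalog` block documented above sits under the plugin's `out` section of an Embulk config. The snippet below is a minimal, hypothetical sketch assembled only from the options shown in this diff; the bucket, prefix, database, table, and column names are placeholders, not values shipped with the plugin.

```yaml
out:
  type: s3_parquet
  bucket: my-example-bucket          # placeholder bucket name
  path_prefix: logs/access_log.      # placeholder key prefix
  file_ext: snappy.parquet
  compression_codec: snappy
  catalog:                           # register the written Parquet data as a Glue table
    database: my_database            # placeholder Glue database (required)
    table: access_log                # placeholder table name (required)
    column_options:
      requested_at:                  # placeholder column name
        type: timestamp              # override the default STRING mapping for this column
    operation_if_exists: skip        # keep an existing table instead of deleting it
```

As the `transaction()` change further down shows, the registered table's location is derived from `bucket` plus the directory portion of `path_prefix`.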
data/build.gradle CHANGED
@@ -2,7 +2,6 @@ plugins {
     id "scala"
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
-    id "com.diffplug.gradle.spotless" version "3.13.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
 }
 import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
     provided
 }
 
-version = "0.0.2"
+version = "0.0.3"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.12"
-    provided "org.embulk:embulk-core:0.9.12"
+    compile "org.embulk:embulk-core:0.9.17"
+    provided "org.embulk:embulk-core:0.9.17"
 
-    compile 'org.scala-lang:scala-library:2.12.8'
-    ['s3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.479"
+    compile 'org.scala-lang:scala-library:2.13.0'
+    ['glue', 's3', 'sts'].each { v ->
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
-        compile "org.apache.parquet:parquet-${v}:1.10.0"
+        compile "org.apache.parquet:parquet-${v}:1.10.1"
     }
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
-    compile 'org.xerial.snappy:snappy-java:1.1.7.2'
+    compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
-    testCompile 'org.scalatest:scalatest_2.12:3.0.5'
-    testCompile 'org.embulk:embulk-test:0.9.12'
-    testCompile 'org.embulk:embulk-standards:0.9.12'
+    testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+    testCompile 'org.embulk:embulk-test:0.9.17'
+    testCompile 'org.embulk:embulk-standards:0.9.17'
     testCompile 'cloud.localstack:localstack-utils:0.1.15'
-    testCompile 'org.apache.parquet:parquet-tools:1.8.0'
+    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
 
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala ADDED
@@ -0,0 +1,178 @@
+package org.embulk.output.s3_parquet
+
+
+import java.util.{Optional, Map => JMap}
+
+import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.embulk.config.{Config, ConfigDefault, ConfigException}
+import org.embulk.output.s3_parquet.aws.Aws
+import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+import org.embulk.spi.Schema
+import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.jdk.CollectionConverters._
+import scala.util.Try
+
+
+object CatalogRegistrator
+{
+    trait Task
+        extends org.embulk.config.Task
+    {
+        @Config("catalog_id")
+        @ConfigDefault("null")
+        def getCatalogId: Optional[String]
+
+        @Config("database")
+        def getDatabase: String
+
+        @Config("table")
+        def getTable: String
+
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, ColumnOptions]
+
+        @Config("operation_if_exists")
+        @ConfigDefault("\"delete\"")
+        def getOperationIfExists: String
+    }
+
+    trait ColumnOptions
+    {
+        @Config("type")
+        def getType: String
+    }
+
+    def apply(aws: Aws,
+              task: Task,
+              schema: Schema,
+              location: String,
+              compressionCodec: CompressionCodecName,
+              loggerOption: Option[Logger] = None): CatalogRegistrator =
+    {
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+    }
+}
+
+class CatalogRegistrator(aws: Aws,
+                         task: CatalogRegistrator.Task,
+                         schema: Schema,
+                         location: String,
+                         compressionCodec: CompressionCodecName,
+                         loggerOption: Option[Logger] = None)
+{
+    val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+    def run(): Unit =
+    {
+        if (doesTableExists()) {
+            task.getOperationIfExists match {
+                case "skip" =>
+                    logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+                    return
+
+                case "delete" =>
+                    logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+                    deleteTable()
+
+                case unknown =>
+                    throw new ConfigException(s"Unsupported operation: $unknown")
+            }
+        }
+        registerNewParquetTable()
+        showNewTableInfo()
+    }
+
+    def showNewTableInfo(): Unit =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        val t = aws.withGlue(_.getTable(req)).getTable
+        logger.info(s"Created a table: ${t.toString}")
+    }
+
+    def doesTableExists(): Boolean =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        Try(aws.withGlue(_.getTable(req))).isSuccess
+    }
+
+    def deleteTable(): Unit =
+    {
+        val req = new DeleteTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+        aws.withGlue(_.deleteTable(req))
+    }
+
+    def registerNewParquetTable(): Unit =
+    {
+        logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+        val req = new CreateTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setTableInput(new TableInput()
+            .withName(task.getTable)
+            .withDescription("Created by embulk-output-s3_parquet")
+            .withTableType("EXTERNAL_TABLE")
+            .withParameters(Map("EXTERNAL" -> "TRUE",
+                "classification" -> "parquet",
+                "parquet.compression" -> compressionCodec.name()).asJava)
+            .withStorageDescriptor(new StorageDescriptor()
+                .withColumns(getGlueSchema: _*)
+                .withLocation(location)
+                .withCompressed(isCompressed)
+                .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+                .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+                .withSerdeInfo(new SerDeInfo()
+                    .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+                    .withParameters(Map("serialization.format" -> "1").asJava)
+                )
+            )
+        )
+        aws.withGlue(_.createTable(req))
+    }
+
+    private def getGlueSchema: Seq[Column] =
+    {
+        val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+        schema.getColumns.asScala.toSeq.map { c =>
+            val cType: String =
+                if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+                else convertEmbulkType2GlueType(c.getType)
+            new Column()
+                .withName(c.getName)
+                .withType(cType)
+        }
+    }
+
+    private def convertEmbulkType2GlueType(t: Type): String =
+    {
+        t match {
+            case _: BooleanType => "boolean"
+            case _: LongType => "bigint"
+            case _: DoubleType => "double"
+            case _: StringType => "string"
+            case _: TimestampType => "string"
+            case _: JsonType => "string"
+            case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+        }
+    }
+
+    private def isCompressed: Boolean =
+    {
+        !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+    }
+
+}
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPage
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
-import org.slf4j.Logger
+import org.slf4j.{Logger, LoggerFactory}
 
-object S3ParquetOutputPlugin {
 
-  trait PluginTask
-    extends Task
-      with TimestampFormatter.Task
-      with Aws.Task {
+object S3ParquetOutputPlugin
+{
 
-    @Config("bucket")
-    def getBucket: String
+    trait PluginTask
+        extends Task
+            with TimestampFormatter.Task
+            with Aws.Task
+    {
 
-    @Config("path_prefix")
-    @ConfigDefault("\"\"")
-    def getPathPrefix: String
+        @Config("bucket")
+        def getBucket: String
 
-    @Config("sequence_format")
-    @ConfigDefault("\"%03d.%02d.\"")
-    def getSequenceFormat: String
+        @Config("path_prefix")
+        @ConfigDefault("\"\"")
+        def getPathPrefix: String
 
-    @Config("file_ext")
-    @ConfigDefault("\"parquet\"")
-    def getFileExt: String
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        def getSequenceFormat: String
 
-    @Config("compression_codec")
-    @ConfigDefault("\"uncompressed\"")
-    def getCompressionCodecString: String
+        @Config("file_ext")
+        @ConfigDefault("\"parquet\"")
+        def getFileExt: String
 
-    def setCompressionCodec(v: CompressionCodecName): Unit
+        @Config("compression_codec")
+        @ConfigDefault("\"uncompressed\"")
+        def getCompressionCodecString: String
 
-    def getCompressionCodec: CompressionCodecName
+        def setCompressionCodec(v: CompressionCodecName): Unit
 
-    @Config("column_options")
-    @ConfigDefault("{}")
-    def getColumnOptions: JMap[String, TimestampColumnOption]
+        def getCompressionCodec: CompressionCodecName
 
-    @Config("canned_acl")
-    @ConfigDefault("\"private\"")
-    def getCannedAclString: String
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, TimestampColumnOption]
 
-    def setCannedAcl(v: CannedAccessControlList): Unit
+        @Config("canned_acl")
+        @ConfigDefault("\"private\"")
+        def getCannedAclString: String
 
-    def getCannedAcl: CannedAccessControlList
+        def setCannedAcl(v: CannedAccessControlList): Unit
 
-    @Config("block_size")
-    @ConfigDefault("null")
-    def getBlockSize: Optional[Int]
+        def getCannedAcl: CannedAccessControlList
 
-    @Config("page_size")
-    @ConfigDefault("null")
-    def getPageSize: Optional[Int]
+        @Config("block_size")
+        @ConfigDefault("null")
+        def getBlockSize: Optional[Int]
 
-    @Config("max_padding_size")
-    @ConfigDefault("null")
-    def getMaxPaddingSize: Optional[Int]
+        @Config("page_size")
+        @ConfigDefault("null")
+        def getPageSize: Optional[Int]
 
-    @Config("enable_dictionary_encoding")
-    @ConfigDefault("null")
-    def getEnableDictionaryEncoding: Optional[Boolean]
+        @Config("max_padding_size")
+        @ConfigDefault("null")
+        def getMaxPaddingSize: Optional[Int]
 
-    @Config("buffer_dir")
-    @ConfigDefault("null")
-    def getBufferDir: Optional[String]
+        @Config("enable_dictionary_encoding")
+        @ConfigDefault("null")
+        def getEnableDictionaryEncoding: Optional[Boolean]
 
-  }
+        @Config("buffer_dir")
+        @ConfigDefault("null")
+        def getBufferDir: Optional[String]
+
+        @Config("catalog")
+        @ConfigDefault("null")
+        def getCatalog: Optional[CatalogRegistrator.Task]
+    }
 
 }
 
 class S3ParquetOutputPlugin
-  extends OutputPlugin {
-
-  val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
-  private def withPluginContextClassLoader[A](f: => A): A = {
-    val original: ClassLoader = Thread.currentThread.getContextClassLoader
-    Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
-    try f
-    finally Thread.currentThread.setContextClassLoader(original)
-  }
-
-  override def transaction(config: ConfigSource,
-                           schema: Schema,
-                           taskCount: Int,
-                           control: OutputPlugin.Control): ConfigDiff = {
-    val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
-    withPluginContextClassLoader {
-      configure(task, schema)
-      control.run(task.dump)
-    }
+    extends OutputPlugin
+{
 
-    Exec.newConfigDiff
-  }
+    val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+    private def withPluginContextClassLoader[A](f: => A): A =
+    {
+        val original: ClassLoader = Thread.currentThread.getContextClassLoader
+        Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+        try f
+        finally Thread.currentThread.setContextClassLoader(original)
+    }
 
-  private def configure(task: PluginTask,
-                        schema: Schema): Unit = {
-    // sequence_format
-    try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
-    catch {
-      case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+    override def transaction(config: ConfigSource,
+                             schema: Schema,
+                             taskCount: Int,
+                             control: OutputPlugin.Control): ConfigDiff =
+    {
+        val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+        withPluginContextClassLoader {
+            configure(task, schema)
+            control.run(task.dump)
+        }
+        task.getCatalog.ifPresent { catalog =>
+            val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val cr = CatalogRegistrator(aws = Aws(task),
+                                        task = catalog,
+                                        schema = schema,
+                                        location = location,
+                                        compressionCodec = task.getCompressionCodec)
+            cr.run()
+        }
+
+        Exec.newConfigDiff
     }
 
-    // compression_codec
-    CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
-      case Some(v) => task.setCompressionCodec(v)
-      case None =>
-        val unsupported: String = task.getCompressionCodecString
-        val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
-        throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+    private def configure(task: PluginTask,
+                          schema: Schema): Unit =
+    {
+        // sequence_format
+        try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+        catch {
+            case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+        }
+
+        // compression_codec
+        CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+            case Some(v) => task.setCompressionCodec(v)
+            case None =>
+                val unsupported: String = task.getCompressionCodecString
+                val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+        }
+
+        // column_options
+        task.getColumnOptions.forEach { (k: String,
+                                         _) =>
+            val c = schema.lookupColumn(k)
+            if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+        }
+
+        // canned_acl
+        CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+            case Some(v) => task.setCannedAcl(v)
+            case None =>
+                val unsupported: String = task.getCannedAclString
+                val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+        }
     }
 
-    // column_options
-    task.getColumnOptions.forEach { (k: String,
-                                     _) =>
-      val c = schema.lookupColumn(k)
-      if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+    override def resume(taskSource: TaskSource,
+                        schema: Schema,
+                        taskCount: Int,
+                        control: OutputPlugin.Control): ConfigDiff =
+    {
+        throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
     }
 
-    // canned_acl
-    CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
-      case Some(v) => task.setCannedAcl(v)
-      case None =>
-        val unsupported: String = task.getCannedAclString
-        val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
-        throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+    override def cleanup(taskSource: TaskSource,
+                         schema: Schema,
+                         taskCount: Int,
+                         successTaskReports: JList[TaskReport]): Unit =
+    {
+        successTaskReports.forEach { tr =>
+            logger.info(
+                s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+                    + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+                    + s"etag: ${tr.get(classOf[String], "etag", null)}")
+        }
     }
-  }
 
-  override def resume(taskSource: TaskSource,
+    override def open(taskSource: TaskSource,
                       schema: Schema,
-                      taskCount: Int,
-                      control: OutputPlugin.Control): ConfigDiff = {
-    throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
-  }
-
-  override def cleanup(taskSource: TaskSource,
-                       schema: Schema,
-                       taskCount: Int,
-                       successTaskReports: JList[TaskReport]): Unit = {
-    successTaskReports.forEach { tr =>
-      logger.info(
-        s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
-          + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
-          + s"etag: ${tr.get(classOf[String], "etag", null)}")
+                      taskIndex: Int): TransactionalPageOutput =
+    {
+        val task = taskSource.loadTask(classOf[PluginTask])
+        val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+        val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+        val destS3bucket: String = task.getBucket
+        val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+        val pageReader: PageReader = new PageReader(schema)
+        val aws: Aws = Aws(task)
+        val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+            .withPath(bufferFile)
+            .withSchema(schema)
+            .withTimestampFormatters(timestampFormatters)
+            .withCompressionCodec(task.getCompressionCodec)
+            .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+            .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+            .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+            .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+            .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+            .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+            .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+            .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+            .build()
+
+        logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+        S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
     }
-  }
-
-  override def open(taskSource: TaskSource,
-                    schema: Schema,
-                    taskIndex: Int): TransactionalPageOutput = {
-    val task = taskSource.loadTask(classOf[PluginTask])
-    val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
-    val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
-    val destS3bucket: String = task.getBucket
-    val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
-    val pageReader: PageReader = new PageReader(schema)
-    val aws: Aws = Aws(task)
-    val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
-    val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
-      .withPath(bufferFile)
-      .withSchema(schema)
-      .withTimestampFormatters(timestampFormatters)
-      .withCompressionCodec(task.getCompressionCodec)
-      .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
-      .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
-      .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
-      .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
-      .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
-      .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
-      .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
-      .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
-      .build()
-
-    logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
-    S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
-  }
 
 }