embulk-output-s3_parquet 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+  data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+  data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -59,6 +59,13 @@
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+- **catalog**: Register a table if this option is specified (optional)
+  - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+  - **database**: The name of the database (string, required)
+  - **table**: The name of the table (string, required)
+  - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
+    - **type**: type of a column when this plugin creates new tables (e.g. `STRING`, `BIGINT`) (string, default: depends on input column type. `BIGINT` if input column type is `long`, `BOOLEAN` if boolean, `DOUBLE` if `double`, `STRING` if `string`, `STRING` if `timestamp`, `STRING` if `json`)
+  - **operation_if_exists**: operation if the table already exist. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
 - **http_proxy**: Indicate whether using when accessing AWS via http proxy. (optional)
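For orientation, here is a minimal sketch of how the new `catalog` block could be combined with options that already existed in 0.0.2 inside an Embulk `out` section. The bucket, path, database, table, and column names below are placeholders for illustration, not values taken from this diff:

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                # placeholder
  path_prefix: logs/out.           # placeholder
  compression_codec: gzip
  catalog:                         # new in 0.0.3: registers the table in AWS Glue
    # catalog_id is optional; omit it to use the account/region default catalog
    database: my_database          # placeholder
    table: my_table                # placeholder
    column_options:
      payload:                     # placeholder column name
        type: STRING
    operation_if_exists: skip
```

With this block present, the plugin derives the table location from `bucket` and `path_prefix` and registers (or, per `operation_if_exists`, skips or re-creates) the Glue table during `transaction`.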
data/build.gradle
CHANGED
@@ -2,7 +2,6 @@ plugins {
     id "scala"
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
-    id "com.diffplug.gradle.spotless" version "3.13.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
 }
 import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
     provided
 }
 
-version = "0.0.2"
+version = "0.0.3"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.17"
+    provided "org.embulk:embulk-core:0.9.17"
 
-    compile 'org.scala-lang:scala-library:2.
-    ['s3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+    compile 'org.scala-lang:scala-library:2.13.0'
+    ['glue', 's3', 'sts'].each { v ->
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
-        compile "org.apache.parquet:parquet-${v}:1.10.
+        compile "org.apache.parquet:parquet-${v}:1.10.1"
    }
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
-    compile 'org.xerial.snappy:snappy-java:1.1.7.
+    compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
-    testCompile 'org.scalatest:scalatest_2.
-    testCompile 'org.embulk:embulk-test:0.9.
-    testCompile 'org.embulk:embulk-standards:0.9.
+    testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+    testCompile 'org.embulk:embulk-test:0.9.17'
+    testCompile 'org.embulk:embulk-standards:0.9.17'
     testCompile 'cloud.localstack:localstack-utils:0.1.15'
-    testCompile 'org.apache.parquet:parquet-tools:1.
+    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
 
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
ADDED
@@ -0,0 +1,178 @@
+package org.embulk.output.s3_parquet
+
+
+import java.util.{Optional, Map => JMap}
+
+import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.embulk.config.{Config, ConfigDefault, ConfigException}
+import org.embulk.output.s3_parquet.aws.Aws
+import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+import org.embulk.spi.Schema
+import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.jdk.CollectionConverters._
+import scala.util.Try
+
+
+object CatalogRegistrator
+{
+    trait Task
+        extends org.embulk.config.Task
+    {
+        @Config("catalog_id")
+        @ConfigDefault("null")
+        def getCatalogId: Optional[String]
+
+        @Config("database")
+        def getDatabase: String
+
+        @Config("table")
+        def getTable: String
+
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, ColumnOptions]
+
+        @Config("operation_if_exists")
+        @ConfigDefault("\"delete\"")
+        def getOperationIfExists: String
+    }
+
+    trait ColumnOptions
+    {
+        @Config("type")
+        def getType: String
+    }
+
+    def apply(aws: Aws,
+              task: Task,
+              schema: Schema,
+              location: String,
+              compressionCodec: CompressionCodecName,
+              loggerOption: Option[Logger] = None): CatalogRegistrator =
+    {
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+    }
+}
+
+class CatalogRegistrator(aws: Aws,
+                         task: CatalogRegistrator.Task,
+                         schema: Schema,
+                         location: String,
+                         compressionCodec: CompressionCodecName,
+                         loggerOption: Option[Logger] = None)
+{
+    val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+    def run(): Unit =
+    {
+        if (doesTableExists()) {
+            task.getOperationIfExists match {
+                case "skip" =>
+                    logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+                    return
+
+                case "delete" =>
+                    logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+                    deleteTable()
+
+                case unknown =>
+                    throw new ConfigException(s"Unsupported operation: $unknown")
+            }
+        }
+        registerNewParquetTable()
+        showNewTableInfo()
+    }
+
+    def showNewTableInfo(): Unit =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        val t = aws.withGlue(_.getTable(req)).getTable
+        logger.info(s"Created a table: ${t.toString}")
+    }
+
+    def doesTableExists(): Boolean =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        Try(aws.withGlue(_.getTable(req))).isSuccess
+    }
+
+    def deleteTable(): Unit =
+    {
+        val req = new DeleteTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+        aws.withGlue(_.deleteTable(req))
+    }
+
+    def registerNewParquetTable(): Unit =
+    {
+        logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+        val req = new CreateTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setTableInput(new TableInput()
+            .withName(task.getTable)
+            .withDescription("Created by embulk-output-s3_parquet")
+            .withTableType("EXTERNAL_TABLE")
+            .withParameters(Map("EXTERNAL" -> "TRUE",
+                                "classification" -> "parquet",
+                                "parquet.compression" -> compressionCodec.name()).asJava)
+            .withStorageDescriptor(new StorageDescriptor()
+                .withColumns(getGlueSchema: _*)
+                .withLocation(location)
+                .withCompressed(isCompressed)
+                .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+                .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+                .withSerdeInfo(new SerDeInfo()
+                    .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+                    .withParameters(Map("serialization.format" -> "1").asJava)
+                )
+            )
+        )
+        aws.withGlue(_.createTable(req))
+    }
+
+    private def getGlueSchema: Seq[Column] =
+    {
+        val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+        schema.getColumns.asScala.toSeq.map { c =>
+            val cType: String =
+                if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+                else convertEmbulkType2GlueType(c.getType)
+            new Column()
+                .withName(c.getName)
+                .withType(cType)
+        }
+    }
+
+    private def convertEmbulkType2GlueType(t: Type): String =
+    {
+        t match {
+            case _: BooleanType => "boolean"
+            case _: LongType => "bigint"
+            case _: DoubleType => "double"
+            case _: StringType => "string"
+            case _: TimestampType => "string"
+            case _: JsonType => "string"
+            case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+        }
+    }
+
+    private def isCompressed: Boolean =
+    {
+        !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+    }
+
+}
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPage
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
-import org.slf4j.Logger
+import org.slf4j.{Logger, LoggerFactory}
 
-object S3ParquetOutputPlugin {
 
-    with TimestampFormatter.Task
-    with Aws.Task {
+object S3ParquetOutputPlugin
+{
 
+    trait PluginTask
+        extends Task
+            with TimestampFormatter.Task
+            with Aws.Task
+    {
 
-    def getPathPrefix: String
+        @Config("bucket")
+        def getBucket: String
 
+        @Config("path_prefix")
+        @ConfigDefault("\"\"")
+        def getPathPrefix: String
 
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        def getSequenceFormat: String
 
+        @Config("file_ext")
+        @ConfigDefault("\"parquet\"")
+        def getFileExt: String
 
+        @Config("compression_codec")
+        @ConfigDefault("\"uncompressed\"")
+        def getCompressionCodecString: String
 
+        def setCompressionCodec(v: CompressionCodecName): Unit
 
-    @ConfigDefault("{}")
-    def getColumnOptions: JMap[String, TimestampColumnOption]
+        def getCompressionCodec: CompressionCodecName
 
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, TimestampColumnOption]
 
+        @Config("canned_acl")
+        @ConfigDefault("\"private\"")
+        def getCannedAclString: String
 
+        def setCannedAcl(v: CannedAccessControlList): Unit
 
-    @ConfigDefault("null")
-    def getBlockSize: Optional[Int]
+        def getCannedAcl: CannedAccessControlList
 
+        @Config("block_size")
+        @ConfigDefault("null")
+        def getBlockSize: Optional[Int]
 
+        @Config("page_size")
+        @ConfigDefault("null")
+        def getPageSize: Optional[Int]
 
+        @Config("max_padding_size")
+        @ConfigDefault("null")
+        def getMaxPaddingSize: Optional[Int]
 
+        @Config("enable_dictionary_encoding")
+        @ConfigDefault("null")
+        def getEnableDictionaryEncoding: Optional[Boolean]
 
+        @Config("buffer_dir")
+        @ConfigDefault("null")
+        def getBufferDir: Optional[String]
+
+        @Config("catalog")
+        @ConfigDefault("null")
+        def getCatalog: Optional[CatalogRegistrator.Task]
+    }
 
 }
 
 class S3ParquetOutputPlugin
-
-    val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
-    private def withPluginContextClassLoader[A](f: => A): A = {
-        val original: ClassLoader = Thread.currentThread.getContextClassLoader
-        Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
-        try f
-        finally Thread.currentThread.setContextClassLoader(original)
-    }
-
-    override def transaction(config: ConfigSource,
-                             schema: Schema,
-                             taskCount: Int,
-                             control: OutputPlugin.Control): ConfigDiff = {
-        val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
-        withPluginContextClassLoader {
-            configure(task, schema)
-            control.run(task.dump)
-        }
+    extends OutputPlugin
+{
 
+    val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+    private def withPluginContextClassLoader[A](f: => A): A =
+    {
+        val original: ClassLoader = Thread.currentThread.getContextClassLoader
+        Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+        try f
+        finally Thread.currentThread.setContextClassLoader(original)
+    }
 
+    override def transaction(config: ConfigSource,
+                             schema: Schema,
+                             taskCount: Int,
+                             control: OutputPlugin.Control): ConfigDiff =
+    {
+        val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+        withPluginContextClassLoader {
+            configure(task, schema)
+            control.run(task.dump)
+        }
+        task.getCatalog.ifPresent { catalog =>
+            val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val cr = CatalogRegistrator(aws = Aws(task),
+                                        task = catalog,
+                                        schema = schema,
+                                        location = location,
+                                        compressionCodec = task.getCompressionCodec)
+            cr.run()
+        }
+
+        Exec.newConfigDiff
     }
 
+    private def configure(task: PluginTask,
+                          schema: Schema): Unit =
+    {
+        // sequence_format
+        try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+        catch {
+            case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+        }
+
+        // compression_codec
+        CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+            case Some(v) => task.setCompressionCodec(v)
+            case None =>
+                val unsupported: String = task.getCompressionCodecString
+                val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+        }
+
+        // column_options
+        task.getColumnOptions.forEach { (k: String,
+                                         _) =>
+            val c = schema.lookupColumn(k)
+            if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+        }
+
+        // canned_acl
+        CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+            case Some(v) => task.setCannedAcl(v)
+            case None =>
+                val unsupported: String = task.getCannedAclString
+                val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+        }
     }
 
+    override def resume(taskSource: TaskSource,
+                        schema: Schema,
+                        taskCount: Int,
+                        control: OutputPlugin.Control): ConfigDiff =
+    {
+        throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
     }
 
+    override def cleanup(taskSource: TaskSource,
+                         schema: Schema,
+                         taskCount: Int,
+                         successTaskReports: JList[TaskReport]): Unit =
+    {
+        successTaskReports.forEach { tr =>
+            logger.info(
+                s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+                    + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+                    + s"etag: ${tr.get(classOf[String], "etag", null)}")
+        }
     }
-    }
 
+    override def open(taskSource: TaskSource,
                       schema: Schema,
+                      taskIndex: Int): TransactionalPageOutput =
+    {
+        val task = taskSource.loadTask(classOf[PluginTask])
+        val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+        val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+        val destS3bucket: String = task.getBucket
+        val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+        val pageReader: PageReader = new PageReader(schema)
+        val aws: Aws = Aws(task)
+        val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+            .withPath(bufferFile)
+            .withSchema(schema)
+            .withTimestampFormatters(timestampFormatters)
+            .withCompressionCodec(task.getCompressionCodec)
+            .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+            .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+            .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+            .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+            .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+            .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+            .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+            .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+            .build()
+
+        logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+        S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
     }
-    }
-
-    override def open(taskSource: TaskSource,
-                      schema: Schema,
-                      taskIndex: Int): TransactionalPageOutput = {
-        val task = taskSource.loadTask(classOf[PluginTask])
-        val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
-        val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
-        val destS3bucket: String = task.getBucket
-        val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
-        val pageReader: PageReader = new PageReader(schema)
-        val aws: Aws = Aws(task)
-        val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
-        val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
-            .withPath(bufferFile)
-            .withSchema(schema)
-            .withTimestampFormatters(timestampFormatters)
-            .withCompressionCodec(task.getCompressionCodec)
-            .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
-            .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
-            .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
-            .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
-            .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
-            .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
-            .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
-            .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
-            .build()
-
-        logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
-        S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
-    }
 
 }