embulk-output-s3_parquet 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
+  data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
+  data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -59,6 +59,13 @@
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
+- **catalog**: Register a table if this option is specified (optional)
+  - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+  - **database**: The name of the database (string, required)
+  - **table**: The name of the table (string, required)
+  - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
+    - **type**: type of a column when this plugin creates new tables (e.g. `STRING`, `BIGINT`) (string, default: depends on input column type. `BIGINT` if input column type is `long`, `BOOLEAN` if boolean, `DOUBLE` if `double`, `STRING` if `string`, `STRING` if `timestamp`, `STRING` if `json`)
+  - **operation_if_exists**: operation if the table already exist. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
 - **http_proxy**: Indicate whether using when accessing AWS via http proxy. (optional)
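The catalog options documented above are read by the new CatalogRegistrator task added in this release (see the Scala diff below). A minimal sketch of how they might be wired into an Embulk output config; the bucket, prefix, database, table, and column names are placeholders, not values taken from this diff:

out:
  type: s3_parquet
  bucket: my-bucket            # placeholder
  path_prefix: logs/out.       # placeholder
  compression_codec: gzip
  catalog:
    database: analytics_db     # placeholder
    table: access_logs         # placeholder
    column_options:
      status: {type: BIGINT}   # placeholder column name
    operation_if_exists: skip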
data/build.gradle
CHANGED
@@ -2,7 +2,6 @@ plugins {
     id "scala"
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "1.5.0"
-    id "com.diffplug.gradle.spotless" version "3.13.0"
     id "com.adarshr.test-logger" version "1.6.0" // For Pretty test logging
 }
 import com.github.jrubygradle.JRubyExec
@@ -14,30 +13,30 @@ configurations {
     provided
 }

-version = "0.0.2"
+version = "0.0.3"

 sourceCompatibility = 1.8
 targetCompatibility = 1.8

 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.17"
+    provided "org.embulk:embulk-core:0.9.17"

-    compile 'org.scala-lang:scala-library:2.
-    ['s3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+    compile 'org.scala-lang:scala-library:2.13.0'
+    ['glue', 's3', 'sts'].each { v ->
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
-        compile "org.apache.parquet:parquet-${v}:1.10.
+        compile "org.apache.parquet:parquet-${v}:1.10.1"
     }
     compile 'org.apache.hadoop:hadoop-common:2.9.2'
-    compile 'org.xerial.snappy:snappy-java:1.1.7.
+    compile 'org.xerial.snappy:snappy-java:1.1.7.3'

-    testCompile 'org.scalatest:scalatest_2.
-    testCompile 'org.embulk:embulk-test:0.9.
-    testCompile 'org.embulk:embulk-standards:0.9.
+    testCompile 'org.scalatest:scalatest_2.13:3.0.8'
+    testCompile 'org.embulk:embulk-test:0.9.17'
+    testCompile 'org.embulk:embulk-standards:0.9.17'
     testCompile 'cloud.localstack:localstack-utils:0.1.15'
-    testCompile 'org.apache.parquet:parquet-tools:1.
+    testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }

data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
ADDED
@@ -0,0 +1,178 @@
+package org.embulk.output.s3_parquet
+
+
+import java.util.{Optional, Map => JMap}
+
+import com.amazonaws.services.glue.model.{Column, CreateTableRequest, DeleteTableRequest, GetTableRequest, SerDeInfo, StorageDescriptor, TableInput}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.embulk.config.{Config, ConfigDefault, ConfigException}
+import org.embulk.output.s3_parquet.aws.Aws
+import org.embulk.output.s3_parquet.CatalogRegistrator.ColumnOptions
+import org.embulk.spi.Schema
+import org.embulk.spi.`type`.{BooleanType, DoubleType, JsonType, LongType, StringType, TimestampType, Type}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.jdk.CollectionConverters._
+import scala.util.Try
+
+
+object CatalogRegistrator
+{
+    trait Task
+        extends org.embulk.config.Task
+    {
+        @Config("catalog_id")
+        @ConfigDefault("null")
+        def getCatalogId: Optional[String]
+
+        @Config("database")
+        def getDatabase: String
+
+        @Config("table")
+        def getTable: String
+
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, ColumnOptions]
+
+        @Config("operation_if_exists")
+        @ConfigDefault("\"delete\"")
+        def getOperationIfExists: String
+    }
+
+    trait ColumnOptions
+    {
+        @Config("type")
+        def getType: String
+    }
+
+    def apply(aws: Aws,
+              task: Task,
+              schema: Schema,
+              location: String,
+              compressionCodec: CompressionCodecName,
+              loggerOption: Option[Logger] = None): CatalogRegistrator =
+    {
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+    }
+}
+
+class CatalogRegistrator(aws: Aws,
+                         task: CatalogRegistrator.Task,
+                         schema: Schema,
+                         location: String,
+                         compressionCodec: CompressionCodecName,
+                         loggerOption: Option[Logger] = None)
+{
+    val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
+
+    def run(): Unit =
+    {
+        if (doesTableExists()) {
+            task.getOperationIfExists match {
+                case "skip" =>
+                    logger.info(s"Skip to register the table: ${task.getDatabase}.${task.getTable}")
+                    return
+
+                case "delete" =>
+                    logger.info(s"Delete the table: ${task.getDatabase}.${task.getTable}")
+                    deleteTable()
+
+                case unknown =>
+                    throw new ConfigException(s"Unsupported operation: $unknown")
+            }
+        }
+        registerNewParquetTable()
+        showNewTableInfo()
+    }
+
+    def showNewTableInfo(): Unit =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        val t = aws.withGlue(_.getTable(req)).getTable
+        logger.info(s"Created a table: ${t.toString}")
+    }
+
+    def doesTableExists(): Boolean =
+    {
+        val req = new GetTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+
+        Try(aws.withGlue(_.getTable(req))).isSuccess
+    }
+
+    def deleteTable(): Unit =
+    {
+        val req = new DeleteTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setName(task.getTable)
+        aws.withGlue(_.deleteTable(req))
+    }
+
+    def registerNewParquetTable(): Unit =
+    {
+        logger.info(s"Create a new table: ${task.getDatabase}.${task.getTable}")
+        val req = new CreateTableRequest()
+        task.getCatalogId.ifPresent(cid => req.setCatalogId(cid))
+        req.setDatabaseName(task.getDatabase)
+        req.setTableInput(new TableInput()
+                              .withName(task.getTable)
+                              .withDescription("Created by embulk-output-s3_parquet")
+                              .withTableType("EXTERNAL_TABLE")
+                              .withParameters(Map("EXTERNAL" -> "TRUE",
+                                                  "classification" -> "parquet",
+                                                  "parquet.compression" -> compressionCodec.name()).asJava)
+                              .withStorageDescriptor(new StorageDescriptor()
+                                                         .withColumns(getGlueSchema: _*)
+                                                         .withLocation(location)
+                                                         .withCompressed(isCompressed)
+                                                         .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
+                                                         .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
+                                                         .withSerdeInfo(new SerDeInfo()
+                                                                            .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+                                                                            .withParameters(Map("serialization.format" -> "1").asJava)
+                                                         )
+                              )
+        )
+        aws.withGlue(_.createTable(req))
+    }
+
+    private def getGlueSchema: Seq[Column] =
+    {
+        val columnOptions: Map[String, ColumnOptions] = task.getColumnOptions.asScala.toMap
+        schema.getColumns.asScala.toSeq.map { c =>
+            val cType: String =
+                if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
+                else convertEmbulkType2GlueType(c.getType)
+            new Column()
+                .withName(c.getName)
+                .withType(cType)
+        }
+    }
+
+    private def convertEmbulkType2GlueType(t: Type): String =
+    {
+        t match {
+            case _: BooleanType => "boolean"
+            case _: LongType => "bigint"
+            case _: DoubleType => "double"
+            case _: StringType => "string"
+            case _: TimestampType => "string"
+            case _: JsonType => "string"
+            case unknown => throw new ConfigException(s"Unsupported embulk type: ${unknown.getName}")
+        }
+    }
+
+    private def isCompressed: Boolean =
+    {
+        !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED)
+    }
+
+}
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
CHANGED
@@ -16,184 +16,206 @@ import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPage
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
-import org.slf4j.Logger
+import org.slf4j.{Logger, LoggerFactory}

-object S3ParquetOutputPlugin {

-
-
-      with TimestampFormatter.Task
-      with Aws.Task {
+object S3ParquetOutputPlugin
+{

-
-
+    trait PluginTask
+        extends Task
+            with TimestampFormatter.Task
+            with Aws.Task
+    {

-
-
-    def getPathPrefix: String
+        @Config("bucket")
+        def getBucket: String

-
-
-
+        @Config("path_prefix")
+        @ConfigDefault("\"\"")
+        def getPathPrefix: String

-
-
-
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        def getSequenceFormat: String

-
-
-
+        @Config("file_ext")
+        @ConfigDefault("\"parquet\"")
+        def getFileExt: String

-
+        @Config("compression_codec")
+        @ConfigDefault("\"uncompressed\"")
+        def getCompressionCodecString: String

-
+        def setCompressionCodec(v: CompressionCodecName): Unit

-
-    @ConfigDefault("{}")
-    def getColumnOptions: JMap[String, TimestampColumnOption]
+        def getCompressionCodec: CompressionCodecName

-
-
-
+        @Config("column_options")
+        @ConfigDefault("{}")
+        def getColumnOptions: JMap[String, TimestampColumnOption]

-
+        @Config("canned_acl")
+        @ConfigDefault("\"private\"")
+        def getCannedAclString: String

-
+        def setCannedAcl(v: CannedAccessControlList): Unit

-
-    @ConfigDefault("null")
-    def getBlockSize: Optional[Int]
+        def getCannedAcl: CannedAccessControlList

-
-
-
+        @Config("block_size")
+        @ConfigDefault("null")
+        def getBlockSize: Optional[Int]

-
-
-
+        @Config("page_size")
+        @ConfigDefault("null")
+        def getPageSize: Optional[Int]

-
-
-
+        @Config("max_padding_size")
+        @ConfigDefault("null")
+        def getMaxPaddingSize: Optional[Int]

-
-
-
+        @Config("enable_dictionary_encoding")
+        @ConfigDefault("null")
+        def getEnableDictionaryEncoding: Optional[Boolean]

-
+        @Config("buffer_dir")
+        @ConfigDefault("null")
+        def getBufferDir: Optional[String]
+
+        @Config("catalog")
+        @ConfigDefault("null")
+        def getCatalog: Optional[CatalogRegistrator.Task]
+    }

 }

 class S3ParquetOutputPlugin
-
-
-  val logger: Logger = Exec.getLogger(classOf[S3ParquetOutputPlugin])
-
-  private def withPluginContextClassLoader[A](f: => A): A = {
-    val original: ClassLoader = Thread.currentThread.getContextClassLoader
-    Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
-    try f
-    finally Thread.currentThread.setContextClassLoader(original)
-  }
-
-  override def transaction(config: ConfigSource,
-                           schema: Schema,
-                           taskCount: Int,
-                           control: OutputPlugin.Control): ConfigDiff = {
-    val task: PluginTask = config.loadConfig(classOf[PluginTask])
-
-    withPluginContextClassLoader {
-      configure(task, schema)
-      control.run(task.dump)
-    }
+    extends OutputPlugin
+{

-
-
+    val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin])
+
+    private def withPluginContextClassLoader[A](f: => A): A =
+    {
+        val original: ClassLoader = Thread.currentThread.getContextClassLoader
+        Thread.currentThread.setContextClassLoader(classOf[S3ParquetOutputPlugin].getClassLoader)
+        try f
+        finally Thread.currentThread.setContextClassLoader(original)
+    }

-
-
-
-
-
-
+    override def transaction(config: ConfigSource,
+                             schema: Schema,
+                             taskCount: Int,
+                             control: OutputPlugin.Control): ConfigDiff =
+    {
+        val task: PluginTask = config.loadConfig(classOf[PluginTask])
+
+        withPluginContextClassLoader {
+            configure(task, schema)
+            control.run(task.dump)
+        }
+        task.getCatalog.ifPresent { catalog =>
+            val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val cr = CatalogRegistrator(aws = Aws(task),
+                                        task = catalog,
+                                        schema = schema,
+                                        location = location,
+                                        compressionCodec = task.getCompressionCodec)
+            cr.run()
+        }
+
+        Exec.newConfigDiff
     }

-
-
-
-
-
-
-
+    private def configure(task: PluginTask,
+                          schema: Schema): Unit =
+    {
+        // sequence_format
+        try String.format(task.getSequenceFormat, 0: Integer, 0: Integer)
+        catch {
+            case e: IllegalFormatException => throw new ConfigException(s"Invalid sequence_format: ${task.getSequenceFormat}", e)
+        }
+
+        // compression_codec
+        CompressionCodecName.values().find(v => v.name().toLowerCase(Locale.ENGLISH).equals(task.getCompressionCodecString)) match {
+            case Some(v) => task.setCompressionCodec(v)
+            case None =>
+                val unsupported: String = task.getCompressionCodecString
+                val supported: String = CompressionCodecName.values().map(v => s"'${v.name().toLowerCase}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported].")
+        }
+
+        // column_options
+        task.getColumnOptions.forEach { (k: String,
+                                         _) =>
+            val c = schema.lookupColumn(k)
+            if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+        }
+
+        // canned_acl
+        CannedAccessControlList.values().find(v => v.toString.equals(task.getCannedAclString)) match {
+            case Some(v) => task.setCannedAcl(v)
+            case None =>
+                val unsupported: String = task.getCannedAclString
+                val supported: String = CannedAccessControlList.values().map(v => s"'${v.toString}'").mkString(", ")
+                throw new ConfigException(s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported].")
+        }
     }

-
-
-
-
-
+    override def resume(taskSource: TaskSource,
+                        schema: Schema,
+                        taskCount: Int,
+                        control: OutputPlugin.Control): ConfigDiff =
+    {
+        throw new UnsupportedOperationException("s3_parquet output plugin does not support resuming")
     }

-
-
-
-
-
-
-
+    override def cleanup(taskSource: TaskSource,
+                         schema: Schema,
+                         taskCount: Int,
+                         successTaskReports: JList[TaskReport]): Unit =
+    {
+        successTaskReports.forEach { tr =>
+            logger.info(
+                s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, "
+                    + s"version_id: ${tr.get(classOf[String], "version_id", null)}, "
+                    + s"etag: ${tr.get(classOf[String], "etag", null)}")
+        }
     }
-  }

-
+    override def open(taskSource: TaskSource,
                       schema: Schema,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                      taskIndex: Int): TransactionalPageOutput =
+    {
+        val task = taskSource.loadTask(classOf[PluginTask])
+        val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
+        val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
+        val destS3bucket: String = task.getBucket
+        val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
+
+
+        val pageReader: PageReader = new PageReader(schema)
+        val aws: Aws = Aws(task)
+        val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
+            .withPath(bufferFile)
+            .withSchema(schema)
+            .withTimestampFormatters(timestampFormatters)
+            .withCompressionCodec(task.getCompressionCodec)
+            .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
+            .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
+            .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
+            .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
+            .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
+            .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
+            .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
+            .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
+            .build()
+
+        logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
+
+        S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
     }
-  }
-
-  override def open(taskSource: TaskSource,
-                    schema: Schema,
-                    taskIndex: Int): TransactionalPageOutput = {
-    val task = taskSource.loadTask(classOf[PluginTask])
-    val bufferDir: String = task.getBufferDir.orElse(Files.createTempDirectory("embulk-output-s3_parquet-").toString)
-    val bufferFile: String = Paths.get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet").toString
-    val destS3bucket: String = task.getBucket
-    val destS3Key: String = task.getPathPrefix + String.format(task.getSequenceFormat, taskIndex: Integer, 0: Integer) + task.getFileExt
-
-
-    val pageReader: PageReader = new PageReader(schema)
-    val aws: Aws = Aws(task)
-    val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
-    val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
-      .withPath(bufferFile)
-      .withSchema(schema)
-      .withTimestampFormatters(timestampFormatters)
-      .withCompressionCodec(task.getCompressionCodec)
-      .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))
-      .withDictionaryPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE))
-      .withMaxPaddingSize(task.getMaxPaddingSize.orElse(ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
-      .withPageSize(task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE))
-      .withRowGroupSize(task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE))
-      .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED)
-      .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE)
-      .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION)
-      .build()
-
-    logger.info(s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key")
-
-    S3ParquetPageOutput(bufferFile, pageReader, parquetWriter, aws, destS3bucket, destS3Key)
-  }

 }