embulk-output-s3_parquet 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
4
- data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
3
+ metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
4
+ data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
5
5
  SHA512:
6
- metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
7
- data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
6
+ metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
7
+ data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
@@ -0,0 +1 @@
1
+ github: civitaspo
@@ -0,0 +1,40 @@
1
+ name: Release CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - master
7
+ types:
8
+ - closed
9
+
10
+ jobs:
11
+ release:
12
+
13
+ runs-on: ubuntu-latest
14
+ services:
15
+ localstack:
16
+ image: localstack/localstack
17
+ ports:
18
+ - 4572:4572
19
+ env:
20
+ SERVICES: s3
21
+
22
+ steps:
23
+ - uses: actions/checkout@v1
24
+ - name: Set up JDK 1.8
25
+ uses: actions/setup-java@v1
26
+ with:
27
+ java-version: 1.8
28
+ - name: Test with Gradle
29
+ if: github.event.pull_request.merged == true
30
+ run: ./gradlew test
31
+ - name: Release the new gem
32
+ if: github.event.pull_request.merged == true
33
+ run: |
34
+ mkdir -p $HOME/.gem
35
+ touch $HOME/.gem/credentials
36
+ chmod 0600 $HOME/.gem/credentials
37
+ printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials
38
+ ./gradlew gemPush
39
+ env:
40
+ RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}}
@@ -0,0 +1,26 @@
1
+ name: Test CI
2
+
3
+ on:
4
+ - push
5
+
6
+ jobs:
7
+ test:
8
+
9
+ runs-on: ubuntu-latest
10
+ services:
11
+ localstack:
12
+ image: localstack/localstack
13
+ ports:
14
+ - 4572:4572
15
+ env:
16
+ SERVICES: s3
17
+
18
+ steps:
19
+ - uses: actions/checkout@v1
20
+ - name: Set up JDK 1.8
21
+ uses: actions/setup-java@v1
22
+ with:
23
+ java-version: 1.8
24
+ - name: Test with Gradle
25
+ run: ./gradlew test
26
+
@@ -1,3 +1,12 @@
1
+ 0.1.0 (2019-11-17)
2
+ ==================
3
+
4
+ * [New Feature] Support Logical Types' older representations (OriginalTypes) #12
5
+ * [Enhancement] Add Github Actions CI settings #13
6
+ * [Enhancement] Support LogicalTypes for Glue Data Catalog #14
7
+ * [Enhancement] Update dependencies #15
8
+ * [New Feature] Support `auth_method: web_identity_token` #15
9
+
1
10
  0.0.3 (2019-07-17)
2
11
  ==================
3
12
 
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # S3 Parquet output plugin for Embulk
2
2
 
3
+ ![Release CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Release%20CI/badge.svg) ![Test CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Test%20CI/badge.svg)
4
+
3
5
  [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3.
4
6
 
5
7
  ## Overview
@@ -22,12 +24,13 @@
22
24
  - **column_options**: a map whose keys are names of columns, and values are configurations with the following parameters (optional)
23
25
  - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional)
24
26
  - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format**: is used. (string, optional)
27
+ - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
25
28
  - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`)
26
29
  - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB))
27
30
  - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB))
28
31
  - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB))
29
32
  - **enable_dictionary_encoding**: The boolean value is to enable/disable dictionary encoding. (boolean, default: `true`)
30
- - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, or `"session"`, default: `"default"`)
33
+ - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, or `"web_identity_token"`, default: `"default"`)
31
34
  - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
32
35
  - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables.
33
36
  - `"instance"`: uses EC2 instance profile or attached ECS task role.
@@ -44,6 +47,7 @@
44
47
  - `"anonymous"`: uses anonymous access. This auth method can access only public files.
45
48
  - `"session"`: uses temporary-generated **access_key_id**, **secret_access_key** and **session_token**.
46
49
  - `"assume_role"`: uses temporary-generated credentials by assuming **role_arn** role.
50
+ - `"web_identity_token"`: uses temporary-generated credentials by assuming **role_arn** role with web identity.
47
51
  - `"default"`: uses AWS SDK's default strategy to look up available credentials from runtime environment. This method behaves like the combination of the following methods.
48
52
  1. `"env"`
49
53
  1. `"properties"`
@@ -54,17 +58,42 @@
54
58
  - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
55
59
  - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
56
60
  - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional)
57
- - **role_arn**: arn of the role to assume. this is required for **auth_method** is `"assume_role"`. (string, optional)
58
- - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"`. (string, optional)
61
+ - **role_arn**: arn of the role to assume. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
62
+ - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
59
63
  - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
60
- - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
64
+ - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
65
+ - **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional)
61
66
  - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
62
67
  - **catalog**: Register a table if this option is specified (optional)
63
68
  - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
64
69
  - **database**: The name of the database (string, required)
65
70
  - **table**: The name of the table (string, required)
66
71
  - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
67
- - **type**: type of a column when this plugin creates new tables (e.g. `STRING`, `BIGINT`) (string, default: depends on input column type. `BIGINT` if input column type is `long`, `BOOLEAN` if boolean, `DOUBLE` if `double`, `STRING` if `string`, `STRING` if `timestamp`, `STRING` if `json`)
72
+ - **type**: type of a column when this plugin creates new tables (e.g. `string`, `bigint`) (string, default: depends on the input embulk column type or the parquet logical type; see the tables below)
73
+
74
+ |embulk column type|glue data type|
75
+ |:---|:---|
76
+ |long|bigint|
77
+ |boolean|boolean|
78
+ |double|double|
79
+ |string|string|
80
+ |timestamp|string|
81
+ |json|string|
82
+
83
+ |parquet logical type|glue data type|note|
84
+ |:---|:---|:---|
85
+ |timestamp-millis|timestamp||
86
+ |timestamp-micros|bigint|Glue cannot recognize timestamp-micros.|
87
+ |int8|tinyint||
88
+ |int16|smallint||
89
+ |int32|int||
90
+ |int64|bigint||
91
+ |uint8|smallint|Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1.|
92
+ |uint16|int|Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1.|
93
+ |uint32|bigint|Glue int has a minimum value of -2^31 and a maximum value of 2^31-1.|
94
+ |uint64|ConfigException|Glue bigint supports only a 64-bit signed integer.|
95
+ |json|string||
96
+
68
97
  - **operation_if_exists**: operation if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
69
98
  - **endpoint**: The AWS Service endpoint (string, optional)
70
99
  - **region**: The AWS region (string, optional)
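As the logical-type table above notes, `uint64` has no automatic Glue mapping and raises a `ConfigException` during catalog registration. A hedged sketch of the suggested workaround, pinning the Glue type via `catalog.column_options` (bucket, database, table, and column names are placeholders):

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                  # placeholder
  path_prefix: path/to/my-obj.
  column_options:
    id:
      logical_type: "uint64"         # cannot be converted to a Glue type automatically
  catalog:
    database: example_db
    table: example_tbl
    column_options:
      id:
        type: "string"               # explicit Glue type, so the automatic conversion is skipped
```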
@@ -75,6 +104,8 @@
75
104
  - **user** proxy user (string, optional)
76
105
  - **password** proxy password (string, optional)
77
106
  - **buffer_dir**: buffer directory for parquet files to be uploaded on S3 (string, default: Create a Temporary Directory)
107
+ - **type_options**: a map whose keys are names of embulk types (`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configurations with the following parameters (optional)
108
+ - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
78
109
 
79
110
 
80
111
  ## Example
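For completeness, a small sketch of how **type_options** and **column_options** can combine (bucket and column names are placeholders): the type-level setting applies to every column of that embulk type, while the column-level `logical_type` appears to take precedence for the named column, mirroring the catalog-registration logic shown later in this diff.

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                        # placeholder
  path_prefix: path/to/my-obj.
  type_options:
    timestamp:
      logical_type: "timestamp-millis"     # every timestamp column defaults to this logical type
    json:
      logical_type: "json"
  column_options:
    created_at:                            # hypothetical column name
      logical_type: "timestamp-micros"     # per-column override for this one column
```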
@@ -92,7 +123,8 @@ out:
92
123
 
93
124
  ## Note
94
125
 
95
- * The current implementation does not support [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md). I will implement them later as **column_options**. So, currently **timestamp** type and **json** type are stored as UTF-8 String. Please be careful.
126
+ * The current Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md) implementation supports only the older representation (OriginalTypes).
127
+ * Some LogicalTypes may not be supported by your middleware, so be careful when specifying a logical type name.
96
128
 
97
129
  ## Development
98
130
 
@@ -106,6 +138,8 @@ $ embulk run example/config.yml -Ilib
106
138
  ### Run test:
107
139
 
108
140
  ```shell
141
+ ## Run fake S3 with localstack
142
+ $ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
109
143
  $ ./gradlew test
110
144
  ```
111
145
 
@@ -121,9 +155,12 @@ Fix [build.gradle](./build.gradle), then
121
155
 
122
156
  ```shell
123
157
  $ ./gradlew gemPush
124
-
125
158
  ```
126
159
 
127
160
  ## ChangeLog
128
161
 
129
162
  [CHANGELOG.md](./CHANGELOG.md)
163
+
164
+ ## Contributors
165
+
166
+ - @syucream
@@ -13,18 +13,18 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.0.3"
16
+ version = "0.1.0"
17
17
 
18
18
  sourceCompatibility = 1.8
19
19
  targetCompatibility = 1.8
20
20
 
21
21
  dependencies {
22
- compile "org.embulk:embulk-core:0.9.17"
23
- provided "org.embulk:embulk-core:0.9.17"
22
+ compile "org.embulk:embulk-core:0.9.20"
23
+ provided "org.embulk:embulk-core:0.9.20"
24
24
 
25
- compile 'org.scala-lang:scala-library:2.13.0'
25
+ compile 'org.scala-lang:scala-library:2.13.1'
26
26
  ['glue', 's3', 'sts'].each { v ->
27
- compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
27
+ compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
28
28
  }
29
29
  ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
30
30
  compile "org.apache.parquet:parquet-${v}:1.10.1"
@@ -33,9 +33,8 @@ dependencies {
33
33
  compile 'org.xerial.snappy:snappy-java:1.1.7.3'
34
34
 
35
35
  testCompile 'org.scalatest:scalatest_2.13:3.0.8'
36
- testCompile 'org.embulk:embulk-test:0.9.17'
37
- testCompile 'org.embulk:embulk-standards:0.9.17'
38
- testCompile 'cloud.localstack:localstack-utils:0.1.15'
36
+ testCompile 'org.embulk:embulk-test:0.9.20'
37
+ testCompile 'org.embulk:embulk-standards:0.9.20'
39
38
  testCompile 'org.apache.parquet:parquet-tools:1.10.1'
40
39
  testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
41
40
  }
@@ -0,0 +1,36 @@
1
+
2
+ in:
3
+ type: file
4
+ path_prefix: ./example/data.tsv
5
+ parser:
6
+ type: csv
7
+ delimiter: "\t"
8
+ skip_header_lines: 0
9
+ null_string: ""
10
+ columns:
11
+ - { name: id, type: long }
12
+ - { name: description, type: string }
13
+ - { name: name, type: string }
14
+ - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
15
+ - { name: payload, type: json}
16
+ stop_on_invalid_record: true
17
+
18
+ out:
19
+ type: s3_parquet
20
+ bucket: dev-baikal-workspace
21
+ path_prefix: path/to/my-obj-2.
22
+ file_ext: snappy.parquet
23
+ compression_codec: snappy
24
+ default_timezone: Asia/Tokyo
25
+ canned_acl: bucket-owner-full-control
26
+ column_options:
27
+ id:
28
+ logical_type: "int64"
29
+ payload:
30
+ logical_type: "json"
31
+ type_options:
32
+ timestamp:
33
+ logical_type: "timestamp-millis"
34
+ catalog:
35
+ database: example_db
36
+ table: example_tbl
@@ -0,0 +1,31 @@
1
+
2
+ in:
3
+ type: file
4
+ path_prefix: ./example/data.tsv
5
+ parser:
6
+ type: csv
7
+ delimiter: "\t"
8
+ skip_header_lines: 0
9
+ null_string: ""
10
+ columns:
11
+ - { name: id, type: long }
12
+ - { name: description, type: string }
13
+ - { name: name, type: string }
14
+ - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
15
+ - { name: payload, type: json}
16
+ stop_on_invalid_record: true
17
+
18
+ out:
19
+ type: s3_parquet
20
+ bucket: my-bucket
21
+ path_prefix: path/to/my-obj-2.
22
+ file_ext: snappy.parquet
23
+ compression_codec: snappy
24
+ default_timezone: Asia/Tokyo
25
+ canned_acl: bucket-owner-full-control
26
+ column_options:
27
+ id:
28
+ logical_type: "uint64"
29
+ type_options:
30
+ timestamp:
31
+ logical_type: "timestamp-millis"
@@ -51,9 +51,10 @@ object CatalogRegistrator
51
51
  schema: Schema,
52
52
  location: String,
53
53
  compressionCodec: CompressionCodecName,
54
- loggerOption: Option[Logger] = None): CatalogRegistrator =
54
+ loggerOption: Option[Logger] = None,
55
+ parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
55
56
  {
56
- new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
57
+ new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
57
58
  }
58
59
  }
59
60
 
@@ -62,7 +63,8 @@ class CatalogRegistrator(aws: Aws,
62
63
  schema: Schema,
63
64
  location: String,
64
65
  compressionCodec: CompressionCodecName,
65
- loggerOption: Option[Logger] = None)
66
+ loggerOption: Option[Logger] = None,
67
+ parquetColumnLogicalTypes: Map[String, String] = Map.empty)
66
68
  {
67
69
  val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
68
70
 
@@ -150,14 +152,36 @@ class CatalogRegistrator(aws: Aws,
150
152
  schema.getColumns.asScala.toSeq.map { c =>
151
153
  val cType: String =
152
154
  if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
153
- else convertEmbulkType2GlueType(c.getType)
155
+ else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
156
+ else convertEmbulkTypeToGlueType(c.getType)
154
157
  new Column()
155
158
  .withName(c.getName)
156
159
  .withType(cType)
157
160
  }
158
161
  }
159
162
 
160
- private def convertEmbulkType2GlueType(t: Type): String =
163
+ private def convertParquetLogicalTypeToGlueType(t: String): String =
164
+ {
165
+ t match {
166
+ case "timestamp-millis" => "timestamp"
167
+ case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
168
+ case "int8" => "tinyint"
169
+ case "int16" => "smallint"
170
+ case "int32" => "int"
171
+ case "int64" => "bigint"
172
+ case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
173
+ case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
174
+ case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
175
+ case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
176
+ " because the Glue bigint supports a 64-bit signed integer." +
177
+ " Please use `catalog.column_options` to define the type.")
178
+ case "json" => "string"
179
+ case _ => throw new ConfigException(s"Unsupported parquet logical type: $t. Please use `catalog.column_options` to define the type.")
180
+ }
181
+
182
+ }
183
+
184
+ private def convertEmbulkTypeToGlueType(t: Type): String =
161
185
  {
162
186
  t match {
163
187
  case _: BooleanType => "boolean"
@@ -9,15 +9,18 @@ import org.apache.parquet.column.ParquetProperties
9
9
  import org.apache.parquet.hadoop.ParquetWriter
10
10
  import org.apache.parquet.hadoop.metadata.CompressionCodecName
11
11
  import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
12
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.PluginTask
12
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
13
13
  import org.embulk.output.s3_parquet.aws.Aws
14
- import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
14
+ import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
15
15
  import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
16
16
  import org.embulk.spi.time.TimestampFormatter
17
17
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
18
18
  import org.embulk.spi.util.Timestamps
19
19
  import org.slf4j.{Logger, LoggerFactory}
20
20
 
21
+ import scala.jdk.CollectionConverters._
22
+ import scala.util.chaining._
23
+
21
24
 
22
25
  object S3ParquetOutputPlugin
23
26
  {
@@ -53,7 +56,7 @@ object S3ParquetOutputPlugin
53
56
 
54
57
  @Config("column_options")
55
58
  @ConfigDefault("{}")
56
- def getColumnOptions: JMap[String, TimestampColumnOption]
59
+ def getColumnOptions: JMap[String, ColumnOptionTask]
57
60
 
58
61
  @Config("canned_acl")
59
62
  @ConfigDefault("\"private\"")
@@ -86,8 +89,23 @@ object S3ParquetOutputPlugin
86
89
  @Config("catalog")
87
90
  @ConfigDefault("null")
88
91
  def getCatalog: Optional[CatalogRegistrator.Task]
92
+
93
+ @Config("type_options")
94
+ @ConfigDefault("{}")
95
+ def getTypeOptions: JMap[String, TypeOptionTask]
89
96
  }
90
97
 
98
+ trait ColumnOptionTask
99
+ extends Task with TimestampColumnOption with LogicalTypeOption
100
+
101
+ trait TypeOptionTask
102
+ extends Task with LogicalTypeOption
103
+
104
+ trait LogicalTypeOption
105
+ {
106
+ @Config("logical_type")
107
+ def getLogicalType: Optional[String]
108
+ }
91
109
  }
92
110
 
93
111
  class S3ParquetOutputPlugin
@@ -117,11 +135,26 @@ class S3ParquetOutputPlugin
117
135
  }
118
136
  task.getCatalog.ifPresent { catalog =>
119
137
  val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
138
+ val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe {builder =>
139
+ val cOptions = task.getColumnOptions.asScala
140
+ val tOptions = task.getTypeOptions.asScala
141
+ schema.getColumns.asScala.foreach {c =>
142
+ cOptions.get(c.getName)
143
+ if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
144
+ builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
145
+ }
146
+ else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
147
+ builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
148
+ }
149
+ }
150
+ builder.result()
151
+ }
120
152
  val cr = CatalogRegistrator(aws = Aws(task),
121
153
  task = catalog,
122
154
  schema = schema,
123
155
  location = location,
124
- compressionCodec = task.getCompressionCodec)
156
+ compressionCodec = task.getCompressionCodec,
157
+ parquetColumnLogicalTypes = parquetColumnLogicalTypes)
125
158
  cr.run()
126
159
  }
127
160
 
@@ -148,9 +181,12 @@ class S3ParquetOutputPlugin
148
181
 
149
182
  // column_options
150
183
  task.getColumnOptions.forEach { (k: String,
151
- _) =>
184
+ opt: ColumnOptionTask) =>
152
185
  val c = schema.lookupColumn(k)
153
- if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
186
+ val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
187
+ if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
188
+ throw new ConfigException(s"column:$k is not 'timestamp' type.")
189
+ }
154
190
  }
155
191
 
156
192
  // canned_acl
@@ -198,9 +234,11 @@ class S3ParquetOutputPlugin
198
234
  val pageReader: PageReader = new PageReader(schema)
199
235
  val aws: Aws = Aws(task)
200
236
  val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
237
+ val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
201
238
  val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
202
239
  .withPath(bufferFile)
203
240
  .withSchema(schema)
241
+ .withLogicalTypeHandlers(logicalTypeHandlers)
204
242
  .withTimestampFormatters(timestampFormatters)
205
243
  .withCompressionCodec(task.getCompressionCodec)
206
244
  .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))