embulk-output-s3_parquet 0.0.3 → 0.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 78723c6d1d8313e303e20b2fba7256b90104a7c7
- data.tar.gz: f7e9a77930b81c492a332fbd83de699eb546ec29
+ metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
+ data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
  SHA512:
- metadata.gz: bbf79cdc13a4e80a4ab622cfe3ab3a411e0c34cf3dd4ea4511bcbddd8165942f92529c3334524b92d2b087595fffb4bc10aa70e58aec442965ad1e0d2d431beb
- data.tar.gz: 7bed866b9beed2810888c128fc2ac185de5cf10fb571dfaa8df0fd6571ff97c03d4d4934948fdb82173981295c8c6fedf8caef59e4e8c9f26ba93a0d57b561d3
+ metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
+ data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
@@ -0,0 +1 @@
+ github: civitaspo
@@ -0,0 +1,40 @@
+ name: Release CI
+
+ on:
+   pull_request:
+     branches:
+       - master
+     types:
+       - closed
+
+ jobs:
+   release:
+
+     runs-on: ubuntu-latest
+     services:
+       localstack:
+         image: localstack/localstack
+         ports:
+           - 4572:4572
+         env:
+           SERVICES: s3
+
+     steps:
+       - uses: actions/checkout@v1
+       - name: Set up JDK 1.8
+         uses: actions/setup-java@v1
+         with:
+           java-version: 1.8
+       - name: Test with Gradle
+         if: github.event.pull_request.merged == true
+         run: ./gradlew test
+       - name: Release the new gem
+         if: github.event.pull_request.merged == true
+         run: |
+           mkdir -p $HOME/.gem
+           touch $HOME/.gem/credentials
+           chmod 0600 $HOME/.gem/credentials
+           printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials
+           ./gradlew gemPush
+         env:
+           RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}}
@@ -0,0 +1,26 @@
+ name: Test CI
+
+ on:
+   - push
+
+ jobs:
+   test:
+
+     runs-on: ubuntu-latest
+     services:
+       localstack:
+         image: localstack/localstack
+         ports:
+           - 4572:4572
+         env:
+           SERVICES: s3
+
+     steps:
+       - uses: actions/checkout@v1
+       - name: Set up JDK 1.8
+         uses: actions/setup-java@v1
+         with:
+           java-version: 1.8
+       - name: Test with Gradle
+         run: ./gradlew test
+
@@ -1,3 +1,12 @@
+ 0.1.0 (2019-11-17)
+ ==================
+
+ * [New Feature] Support Logical Types' older representations (OriginalTypes) #12
+ * [Enhancement] Add GitHub Actions CI settings #13
+ * [Enhancement] Support LogicalTypes for Glue Data Catalog #14
+ * [Enhancement] Update dependencies #15
+ * [New Feature] Support `auth_method: web_identity_token` #15
+
  0.0.3 (2019-07-17)
  ==================
 
data/README.md CHANGED
@@ -1,5 +1,7 @@
  # S3 Parquet output plugin for Embulk
 
+ ![Release CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Release%20CI/badge.svg) ![Test CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Test%20CI/badge.svg)
+
  [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3.
 
  ## Overview
@@ -22,12 +24,13 @@
  - **column_options**: a map whose keys are names of columns, and values are configurations with the following parameters (optional)
    - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional)
    - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format** is used. (string, optional)
+   - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
  - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`)
  - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB))
  - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB))
  - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB))
  - **enable_dictionary_encoding**: The boolean value is to enable/disable dictionary encoding. (boolean, default: `true`)
- - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, or `"session"`, default: `"default"`)
+ - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, `"web_identity_token"`, default: `"default"`)
    - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
    - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables.
    - `"instance"`: uses EC2 instance profile or attached ECS task role.
@@ -44,6 +47,7 @@
    - `"anonymous"`: uses anonymous access. This auth method can access only public files.
    - `"session"`: uses temporary-generated **access_key_id**, **secret_access_key** and **session_token**.
    - `"assume_role"`: uses temporary-generated credentials by assuming **role_arn** role.
+   - `"web_identity_token"`: uses temporary-generated credentials by assuming **role_arn** role with web identity.
    - `"default"`: uses AWS SDK's default strategy to look up available credentials from runtime environment. This method behaves like the combination of the following methods.
      1. `"env"`
      1. `"properties"`
@@ -54,17 +58,42 @@
  - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
  - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
  - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional)
- - **role_arn**: arn of the role to assume. this is required for **auth_method** is `"assume_role"`. (string, optional)
- - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"`. (string, optional)
+ - **role_arn**: arn of the role to assume. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
+ - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
  - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
- - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+ - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+ - **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional)
  - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
  - **catalog**: Register a table if this option is specified (optional)
    - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
    - **database**: The name of the database (string, required)
    - **table**: The name of the table (string, required)
    - **column_options**: key-value pairs where the key is a column name and the value is options for the column. (string to options map, default: `{}`)
-     - **type**: type of a column when this plugin creates new tables (e.g. `STRING`, `BIGINT`) (string, default: depends on input column type. `BIGINT` if input column type is `long`, `BOOLEAN` if boolean, `DOUBLE` if `double`, `STRING` if `string`, `STRING` if `timestamp`, `STRING` if `json`)
+     - **type**: type of a column when this plugin creates new tables (e.g. `string`, `bigint`) (string, default: depends on the input embulk column type or the parquet logical type. See the tables below.)
+
+ |embulk column type|glue data type|
+ |:---|:---|
+ |long|bigint|
+ |boolean|boolean|
+ |double|double|
+ |string|string|
+ |timestamp|string|
+ |json|string|
+
+ |parquet logical type|glue data type|note|
+ |:---|:---|:---|
+ |timestamp-millis|timestamp||
+ |timestamp-micros|bigint|Glue cannot recognize timestamp-micros.|
+ |int8|tinyint||
+ |int16|smallint||
+ |int32|int||
+ |int64|bigint||
+ |uint8|smallint|Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1, so it cannot hold uint8.|
+ |uint16|int|Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1, so it cannot hold uint16.|
+ |uint32|bigint|Glue int has a minimum value of -2^31 and a maximum value of 2^31-1, so it cannot hold uint32.|
+ |uint64|ConfigException|Glue bigint supports only 64-bit signed integers.|
+ |json|string||
+
  - **operation_if_exists**: operation if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
  - **endpoint**: The AWS Service endpoint (string, optional)
  - **region**: The AWS region (string, optional)
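To tie the new auth options together, here is a minimal sketch of `web_identity_token` authentication (an editorial addition, not part of the diff; the bucket, role ARN, and token path are hypothetical placeholders, while `role_arn`, `role_session_name`, and `web_identity_token_file` are the options documented above):

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                                          # hypothetical bucket
  path_prefix: path/to/obj.
  auth_method: web_identity_token
  role_arn: arn:aws:iam::123456789012:role/my-embulk-role    # hypothetical role to assume
  role_session_name: embulk-output-s3_parquet
  web_identity_token_file: /var/run/secrets/tokens/my-token  # hypothetical token file path
```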
@@ -75,6 +104,8 @@
    - **user** proxy user (string, optional)
    - **password** proxy password (string, optional)
  - **buffer_dir**: buffer directory for parquet files to be uploaded on S3 (string, default: Create a Temporary Directory)
+ - **type_options**: a map whose keys are names of embulk types (`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configurations with the following parameters (optional)
+   - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
 
 
  ## Example
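Before the plugin's own example, a minimal sketch of the new **type_options** map, which applies a logical type to every column of a given Embulk type (an editorial addition with hypothetical names; per the plugin code in this diff, a per-column **column_options** entry takes precedence when both are set):

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                    # hypothetical bucket
  path_prefix: path/to/obj.
  type_options:
    timestamp:                         # every timestamp column -> timestamp-millis
      logical_type: "timestamp-millis"
    json:                              # every json column -> json logical type
      logical_type: "json"
```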
@@ -92,7 +123,8 @@ out:
 
  ## Note
 
- * The current implementation does not support [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md). I will implement them later as **column_options**. So, currently **timestamp** type and **json** type are stored as UTF-8 String. Please be careful.
+ * The current implementation supports only the older representation of Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md) (OriginalTypes).
+ * Some LogicalTypes may not be supported by your middleware. Be careful when specifying a logical type name.
 
  ## Development
 
@@ -106,6 +138,8 @@ $ embulk run example/config.yml -Ilib
  ### Run test:
 
  ```shell
+ ## Run fake S3 with localstack
+ $ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
  $ ./gradlew test
  ```
 
@@ -121,9 +155,12 @@ Fix [build.gradle](./build.gradle), then
 
  ```shell
  $ ./gradlew gemPush
-
  ```
 
  ## ChangeLog
 
  [CHANGELOG.md](./CHANGELOG.md)
+
+ ## Contributors
+
+ - @syucream
@@ -13,18 +13,18 @@ configurations {
      provided
  }
 
- version = "0.0.3"
+ version = "0.1.0"
 
  sourceCompatibility = 1.8
  targetCompatibility = 1.8
 
  dependencies {
-     compile "org.embulk:embulk-core:0.9.17"
-     provided "org.embulk:embulk-core:0.9.17"
+     compile "org.embulk:embulk-core:0.9.20"
+     provided "org.embulk:embulk-core:0.9.20"
 
-     compile 'org.scala-lang:scala-library:2.13.0'
+     compile 'org.scala-lang:scala-library:2.13.1'
      ['glue', 's3', 'sts'].each { v ->
-         compile "com.amazonaws:aws-java-sdk-${v}:1.11.592"
+         compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
      }
      ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
          compile "org.apache.parquet:parquet-${v}:1.10.1"
@@ -33,9 +33,8 @@ dependencies {
      compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
      testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-     testCompile 'org.embulk:embulk-test:0.9.17'
-     testCompile 'org.embulk:embulk-standards:0.9.17'
-     testCompile 'cloud.localstack:localstack-utils:0.1.15'
+     testCompile 'org.embulk:embulk-test:0.9.20'
+     testCompile 'org.embulk:embulk-standards:0.9.20'
      testCompile 'org.apache.parquet:parquet-tools:1.10.1'
      testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
  }
@@ -0,0 +1,36 @@
+
+ in:
+   type: file
+   path_prefix: ./example/data.tsv
+   parser:
+     type: csv
+     delimiter: "\t"
+     skip_header_lines: 0
+     null_string: ""
+     columns:
+       - { name: id, type: long }
+       - { name: description, type: string }
+       - { name: name, type: string }
+       - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z" }
+       - { name: payload, type: json }
+     stop_on_invalid_record: true
+
+ out:
+   type: s3_parquet
+   bucket: dev-baikal-workspace
+   path_prefix: path/to/my-obj-2.
+   file_ext: snappy.parquet
+   compression_codec: snappy
+   default_timezone: Asia/Tokyo
+   canned_acl: bucket-owner-full-control
+   column_options:
+     id:
+       logical_type: "int64"
+     payload:
+       logical_type: "json"
+   type_options:
+     timestamp:
+       logical_type: "timestamp-millis"
+   catalog:
+     database: example_db
+     table: example_tbl
@@ -0,0 +1,31 @@
+
+ in:
+   type: file
+   path_prefix: ./example/data.tsv
+   parser:
+     type: csv
+     delimiter: "\t"
+     skip_header_lines: 0
+     null_string: ""
+     columns:
+       - { name: id, type: long }
+       - { name: description, type: string }
+       - { name: name, type: string }
+       - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z" }
+       - { name: payload, type: json }
+     stop_on_invalid_record: true
+
+ out:
+   type: s3_parquet
+   bucket: my-bucket
+   path_prefix: path/to/my-obj-2.
+   file_ext: snappy.parquet
+   compression_codec: snappy
+   default_timezone: Asia/Tokyo
+   canned_acl: bucket-owner-full-control
+   column_options:
+     id:
+       logical_type: "uint64"
+   type_options:
+     timestamp:
+       logical_type: "timestamp-millis"
@@ -51,9 +51,10 @@ object CatalogRegistrator
              schema: Schema,
              location: String,
              compressionCodec: CompressionCodecName,
-             loggerOption: Option[Logger] = None): CatalogRegistrator =
+             loggerOption: Option[Logger] = None,
+             parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
      {
-         new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+         new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
      }
  }
 
@@ -62,7 +63,8 @@ class CatalogRegistrator(aws: Aws,
                          schema: Schema,
                          location: String,
                          compressionCodec: CompressionCodecName,
-                         loggerOption: Option[Logger] = None)
+                         loggerOption: Option[Logger] = None,
+                         parquetColumnLogicalTypes: Map[String, String] = Map.empty)
  {
      val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
 
@@ -150,14 +152,36 @@ class CatalogRegistrator(aws: Aws,
      schema.getColumns.asScala.toSeq.map { c =>
          val cType: String =
              if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
-             else convertEmbulkType2GlueType(c.getType)
+             else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
+             else convertEmbulkTypeToGlueType(c.getType)
          new Column()
              .withName(c.getName)
              .withType(cType)
      }
  }
 
- private def convertEmbulkType2GlueType(t: Type): String =
+ private def convertParquetLogicalTypeToGlueType(t: String): String =
+ {
+     t match {
+         case "timestamp-millis" => "timestamp"
+         case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
+         case "int8" => "tinyint"
+         case "int16" => "smallint"
+         case "int32" => "int"
+         case "int64" => "bigint"
+         case "uint8" => "smallint" // Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1.
+         case "uint16" => "int" // Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1.
+         case "uint32" => "bigint" // Glue int has a minimum value of -2^31 and a maximum value of 2^31-1.
+         case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
+             " because the Glue bigint supports a 64-bit signed integer." +
+             " Please use `catalog.column_options` to define the type.")
+         case "json" => "string"
+         case _ => throw new ConfigException(s"Unsupported parquet logical type: $t. Please use `catalog.column_options` to define the type.")
+     }
+
+ }
+
+ private def convertEmbulkTypeToGlueType(t: Type): String =
  {
      t match {
          case _: BooleanType => "boolean"
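As the `uint64` branch above suggests, a column whose logical type has no automatic Glue mapping can still be registered by declaring the Glue type explicitly via `catalog.column_options`. A minimal sketch (an editorial addition; the bucket, database, table, and column names are hypothetical, and `string` is just an illustrative choice of Glue type):

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                  # hypothetical bucket
  path_prefix: path/to/obj.
  column_options:
    id:                              # hypothetical unsigned 64-bit column
      logical_type: "uint64"
  catalog:
    database: example_db             # hypothetical Glue database
    table: example_tbl               # hypothetical Glue table
    column_options:
      id:
        type: "string"               # explicit Glue type; uint64 cannot be mapped automatically
```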
@@ -9,15 +9,18 @@ import org.apache.parquet.column.ParquetProperties
  import org.apache.parquet.hadoop.ParquetWriter
  import org.apache.parquet.hadoop.metadata.CompressionCodecName
  import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.PluginTask
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
  import org.embulk.output.s3_parquet.aws.Aws
- import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
+ import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
  import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
  import org.embulk.spi.time.TimestampFormatter
  import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
  import org.embulk.spi.util.Timestamps
  import org.slf4j.{Logger, LoggerFactory}
 
+ import scala.jdk.CollectionConverters._
+ import scala.util.chaining._
+
 
  object S3ParquetOutputPlugin
  {
@@ -53,7 +56,7 @@ object S3ParquetOutputPlugin
 
          @Config("column_options")
          @ConfigDefault("{}")
-         def getColumnOptions: JMap[String, TimestampColumnOption]
+         def getColumnOptions: JMap[String, ColumnOptionTask]
 
          @Config("canned_acl")
          @ConfigDefault("\"private\"")
@@ -86,8 +89,23 @@ object S3ParquetOutputPlugin
          @Config("catalog")
          @ConfigDefault("null")
          def getCatalog: Optional[CatalogRegistrator.Task]
+
+         @Config("type_options")
+         @ConfigDefault("{}")
+         def getTypeOptions: JMap[String, TypeOptionTask]
      }
 
+     trait ColumnOptionTask
+         extends Task with TimestampColumnOption with LogicalTypeOption
+
+     trait TypeOptionTask
+         extends Task with LogicalTypeOption
+
+     trait LogicalTypeOption
+     {
+         @Config("logical_type")
+         def getLogicalType: Optional[String]
+     }
  }
 
  class S3ParquetOutputPlugin
@@ -117,11 +135,26 @@ class S3ParquetOutputPlugin
      }
      task.getCatalog.ifPresent { catalog =>
          val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+         val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe { builder =>
+             val cOptions = task.getColumnOptions.asScala
+             val tOptions = task.getTypeOptions.asScala
+             schema.getColumns.asScala.foreach { c =>
+                 cOptions.get(c.getName)
+                 if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+                     builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
+                 }
+                 else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
+                     builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
+                 }
+             }
+             builder.result()
+         }
          val cr = CatalogRegistrator(aws = Aws(task),
                                      task = catalog,
                                      schema = schema,
                                      location = location,
-                                     compressionCodec = task.getCompressionCodec)
+                                     compressionCodec = task.getCompressionCodec,
+                                     parquetColumnLogicalTypes = parquetColumnLogicalTypes)
          cr.run()
      }
 
@@ -148,9 +181,12 @@ class S3ParquetOutputPlugin
 
      // column_options
      task.getColumnOptions.forEach { (k: String,
-                                      _) =>
+                                      opt: ColumnOptionTask) =>
          val c = schema.lookupColumn(k)
-         if (!c.getType.getName.equals("timestamp")) throw new ConfigException(s"column:$k is not 'timestamp' type.")
+         val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+         if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+             throw new ConfigException(s"column:$k is not 'timestamp' type.")
+         }
      }
 
      // canned_acl
@@ -198,9 +234,11 @@ class S3ParquetOutputPlugin
      val pageReader: PageReader = new PageReader(schema)
      val aws: Aws = Aws(task)
      val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+     val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
      val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
          .withPath(bufferFile)
          .withSchema(schema)
+         .withLogicalTypeHandlers(logicalTypeHandlers)
          .withTimestampFormatters(timestampFormatters)
          .withCompressionCodec(task.getCompressionCodec)
          .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))