embulk-output-s3_parquet 0.0.3 → 0.1.0
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +1 -0
- data/.github/workflows/release.yml +40 -0
- data/.github/workflows/test.yml +26 -0
- data/CHANGELOG.md +9 -0
- data/README.md +44 -7
- data/build.gradle +7 -8
- data/example/with_catalog.yml +36 -0
- data/example/with_logicaltypes.yml +31 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +29 -5
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +44 -6
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +11 -1
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +39 -11
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +145 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +107 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +4 -2
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +51 -34
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +34 -29
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +78 -0
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +162 -0
- metadata +23 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
+  data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
+  data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
data/.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
+github: civitaspo
data/.github/workflows/release.yml
ADDED
@@ -0,0 +1,40 @@
+name: Release CI
+
+on:
+  pull_request:
+    branches:
+      - master
+    types:
+      - closed
+
+jobs:
+  release:
+
+    runs-on: ubuntu-latest
+    services:
+      localstack:
+        image: localstack/localstack
+        ports:
+          - 4572:4572
+        env:
+          SERVICES: s3
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Test with Gradle
+      if: github.event.pull_request.merged == true
+      run: ./gradlew test
+    - name: Release the new gem
+      if: github.event.pull_request.merged == true
+      run: |
+        mkdir -p $HOME/.gem
+        touch $HOME/.gem/credentials
+        chmod 0600 $HOME/.gem/credentials
+        printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials
+        ./gradlew gemPush
+      env:
+        RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}}
data/.github/workflows/test.yml
ADDED
@@ -0,0 +1,26 @@
+name: Test CI
+
+on:
+  - push
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    services:
+      localstack:
+        image: localstack/localstack
+        ports:
+          - 4572:4572
+        env:
+          SERVICES: s3
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Test with Gradle
+      run: ./gradlew test
+
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+0.1.0 (2019-11-17)
+==================
+
+* [New Feature] Support Logical Types' older representations (OriginalTypes) #12
+* [Enhancement] Add GitHub Actions CI settings #13
+* [Enhancement] Support LogicalTypes for Glue Data Catalog #14
+* [Enhancement] Update dependencies #15
+* [New Feature] Support `auth_method: web_identity_token` #15
+
 0.0.3 (2019-07-17)
 ==================
 
data/README.md
CHANGED
@@ -1,5 +1,7 @@
 # S3 Parquet output plugin for Embulk
 
+![Release CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Release%20CI/badge.svg) ![Test CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Test%20CI/badge.svg)
+
 [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3.
 
 ## Overview
@@ -22,12 +24,13 @@
 - **column_options**: a map whose keys are name of columns, and values are configuration with following parameters (optional)
   - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional)
   - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format**: is used. (string, optional)
+  - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
 - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`)
 - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB))
 - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB))
 - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB))
 - **enable_dictionary_encoding**: The boolean value is to enable/disable dictionary encoding. (boolean, default: `true`)
-- **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`,
+- **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, `"web_identity_token"`, default: `"default"`)
   - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
   - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables.
   - `"instance"`: uses EC2 instance profile or attached ECS task role.
@@ -44,6 +47,7 @@
   - `"anonymous"`: uses anonymous access. This auth method can access only public files.
   - `"session"`: uses temporary-generated **access_key_id**, **secret_access_key** and **session_token**.
   - `"assume_role"`: uses temporary-generated credentials by assuming **role_arn** role.
+  - `"web_identity_token"`: uses temporary-generated credentials by assuming **role_arn** role with web identity.
   - `"default"`: uses AWS SDK's default strategy to look up available credentials from runtime environment. This method behaves like the combination of the following methods.
     1. `"env"`
     1. `"properties"`
@@ -54,17 +58,42 @@
 - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
 - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
 - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional)
-- **role_arn**: arn of the role to assume. this is required for **auth_method** is `"assume_role"`. (string, optional)
-- **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"`. (string, optional)
+- **role_arn**: arn of the role to assume. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
+- **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
-- **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+- **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+- **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **catalog**: Register a table if this option is specified (optional)
   - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
   - **database**: The name of the database (string, required)
   - **table**: The name of the table (string, required)
   - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
-    - **type**: type of
+    - **type**: type of column when this plugin creates new tables (e.g. `string`, `bigint`) (string, default: depends on the input embulk column type, or the parquet logical type. See the tables below.)
+
+|embulk column type|glue data type|
+|:---|:---|
+|long|bigint|
+|boolean|boolean|
+|double|double|
+|string|string|
+|timestamp|string|
+|json|string|
+
+|parquet logical type|glue data type|note|
+|:---|:---|:---|
+|timestamp-millis|timestamp||
+|timestamp-micros|bigint|Glue cannot recognize timestamp-micros.|
+|int8|tinyint||
+|int16|smallint||
+|int32|int||
+|int64|bigint||
+|uint8|smallint|Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1.|
+|uint16|int|Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1.|
+|uint32|bigint|Glue int has a minimum value of -2^31 and a maximum value of 2^31-1.|
+|uint64|ConfigException|Glue bigint supports only a 64-bit signed integer.|
+|json|string||
+
   - **operation_if_exists**: operation if the table already exist. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
@@ -75,6 +104,8 @@
   - **user** proxy user (string, optional)
   - **password** proxy password (string, optional)
 - **buffer_dir**: buffer directory for parquet files to be uploaded on S3 (string, default: Create a Temporary Directory)
+- **type_options**: a map whose keys are names of Embulk types (`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configuration with following parameters (optional)
+  - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
 
 
 ## Example
@@ -92,7 +123,8 @@ out:
 
 ## Note
 
-* The current
+* The current Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md) implementation supports only the old representation (OriginalTypes).
+* Some kinds of LogicalTypes may not be supported by your middleware. Be careful when choosing a logical type name.
 
 ## Development
 
@@ -106,6 +138,8 @@ $ embulk run example/config.yml -Ilib
 ### Run test:
 
 ```shell
+## Run fake S3 with localstack
+$ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
 $ ./gradlew test
 ```
 
@@ -121,9 +155,12 @@ Fix [build.gradle](./build.gradle), then
 
 ```shell
 $ ./gradlew gemPush
-
 ```
 
 ## ChangeLog
 
 [CHANGELOG.md](./CHANGELOG.md)
+
+## Contributors
+
+- @syucream
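For readers wiring up the options added in this release, here is a minimal, hypothetical `out` section that combines the new `web_identity_token` auth method with per-column and per-type logical types as documented above. The bucket, path, role ARN, session name, and token file path are placeholders, not values taken from this release.

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                                  # placeholder
  path_prefix: path/to/my-obj.
  file_ext: snappy.parquet
  compression_codec: snappy
  auth_method: web_identity_token
  role_arn: arn:aws:iam::123456789012:role/my-role   # placeholder
  role_session_name: embulk-s3-parquet               # placeholder
  web_identity_token_file: /path/to/token            # placeholder
  column_options:
    id:
      logical_type: "int64"
  type_options:
    timestamp:
      logical_type: "timestamp-millis"
```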
data/build.gradle
CHANGED
@@ -13,18 +13,18 @@ configurations {
     provided
 }
 
-version = "0.0
+version = "0.1.0"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.20"
+    provided "org.embulk:embulk-core:0.9.20"
 
-    compile 'org.scala-lang:scala-library:2.13.
+    compile 'org.scala-lang:scala-library:2.13.1'
     ['glue', 's3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
         compile "org.apache.parquet:parquet-${v}:1.10.1"
@@ -33,9 +33,8 @@ dependencies {
     compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
     testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-    testCompile 'org.embulk:embulk-test:0.9.
-    testCompile 'org.embulk:embulk-standards:0.9.
-    testCompile 'cloud.localstack:localstack-utils:0.1.15'
+    testCompile 'org.embulk:embulk-test:0.9.20'
+    testCompile 'org.embulk:embulk-standards:0.9.20'
     testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
data/example/with_catalog.yml
ADDED
@@ -0,0 +1,36 @@
+
+in:
+  type: file
+  path_prefix: ./example/data.tsv
+  parser:
+    type: csv
+    delimiter: "\t"
+    skip_header_lines: 0
+    null_string: ""
+    columns:
+      - { name: id, type: long }
+      - { name: description, type: string }
+      - { name: name, type: string }
+      - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
+      - { name: payload, type: json}
+    stop_on_invalid_record: true
+
+out:
+  type: s3_parquet
+  bucket: dev-baikal-workspace
+  path_prefix: path/to/my-obj-2.
+  file_ext: snappy.parquet
+  compression_codec: snappy
+  default_timezone: Asia/Tokyo
+  canned_acl: bucket-owner-full-control
+  column_options:
+    id:
+      logical_type: "int64"
+    payload:
+      logical_type: "json"
+  type_options:
+    timestamp:
+      logical_type: "timestamp-millis"
+  catalog:
+    database: example_db
+    table: example_tbl
data/example/with_logicaltypes.yml
ADDED
@@ -0,0 +1,31 @@
+
+in:
+  type: file
+  path_prefix: ./example/data.tsv
+  parser:
+    type: csv
+    delimiter: "\t"
+    skip_header_lines: 0
+    null_string: ""
+    columns:
+      - { name: id, type: long }
+      - { name: description, type: string }
+      - { name: name, type: string }
+      - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
+      - { name: payload, type: json}
+    stop_on_invalid_record: true
+
+out:
+  type: s3_parquet
+  bucket: my-bucket
+  path_prefix: path/to/my-obj-2.
+  file_ext: snappy.parquet
+  compression_codec: snappy
+  default_timezone: Asia/Tokyo
+  canned_acl: bucket-owner-full-control
+  column_options:
+    id:
+      logical_type: "uint64"
+  type_options:
+    timestamp:
+      logical_type: "timestamp-millis"
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
CHANGED
@@ -51,9 +51,10 @@ object CatalogRegistrator
               schema: Schema,
               location: String,
               compressionCodec: CompressionCodecName,
-              loggerOption: Option[Logger] = None
+              loggerOption: Option[Logger] = None,
+              parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
     {
-        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
     }
 }
 
@@ -62,7 +63,8 @@ class CatalogRegistrator(aws: Aws,
                          schema: Schema,
                          location: String,
                          compressionCodec: CompressionCodecName,
-                         loggerOption: Option[Logger] = None
+                         loggerOption: Option[Logger] = None,
+                         parquetColumnLogicalTypes: Map[String, String] = Map.empty)
 {
     val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
 
@@ -150,14 +152,36 @@ class CatalogRegistrator(aws: Aws,
         schema.getColumns.asScala.toSeq.map { c =>
             val cType: String =
                 if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
-                else
+                else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
+                else convertEmbulkTypeToGlueType(c.getType)
             new Column()
                 .withName(c.getName)
                 .withType(cType)
         }
     }
 
-    private def
+    private def convertParquetLogicalTypeToGlueType(t: String): String =
+    {
+        t match {
+            case "timestamp-millis" => "timestamp"
+            case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
+            case "int8" => "tinyint"
+            case "int16" => "smallint"
+            case "int32" => "int"
+            case "int64" => "bigint"
+            case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+            case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+            case "uint32" => "bigint" // Glue int is a minimum value of -2^31 and a maximum value of 2^31-1.
+            case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
+                " because the Glue bigint supports a 64-bit signed integer." +
+                " Please use `catalog.column_options` to define the type.")
+            case "json" => "string"
+            case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
+        }
+
+    }
+
+    private def convertEmbulkTypeToGlueType(t: Type): String =
     {
         t match {
             case _: BooleanType => "boolean"
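As the new `convertParquetLogicalTypeToGlueType` above shows, `uint64` has no automatic Glue mapping and raises a `ConfigException`; the documented escape hatch is `catalog.column_options`. A hypothetical config sketch follows; the `decimal(20,0)` Glue type and the bucket/path are only illustrations, not values prescribed by this release.

```yaml
out:
  type: s3_parquet
  bucket: my-bucket              # placeholder
  path_prefix: path/to/my-obj.
  column_options:
    id:
      logical_type: "uint64"     # has no automatic Glue equivalent
  catalog:
    database: example_db
    table: example_tbl
    column_options:
      id:
        type: "decimal(20,0)"    # hypothetical explicit Glue type; avoids the ConfigException
```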
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
CHANGED
@@ -9,15 +9,18 @@ import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.PluginTask
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
 import org.embulk.output.s3_parquet.aws.Aws
-import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
+import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
 import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
 import org.slf4j.{Logger, LoggerFactory}
 
+import scala.jdk.CollectionConverters._
+import scala.util.chaining._
+
 
 object S3ParquetOutputPlugin
 {
@@ -53,7 +56,7 @@ object S3ParquetOutputPlugin
 
         @Config("column_options")
         @ConfigDefault("{}")
-        def getColumnOptions: JMap[String,
+        def getColumnOptions: JMap[String, ColumnOptionTask]
 
         @Config("canned_acl")
         @ConfigDefault("\"private\"")
@@ -86,8 +89,23 @@ object S3ParquetOutputPlugin
         @Config("catalog")
         @ConfigDefault("null")
         def getCatalog: Optional[CatalogRegistrator.Task]
+
+        @Config("type_options")
+        @ConfigDefault("{}")
+        def getTypeOptions: JMap[String, TypeOptionTask]
     }
 
+    trait ColumnOptionTask
+        extends Task with TimestampColumnOption with LogicalTypeOption
+
+    trait TypeOptionTask
+        extends Task with LogicalTypeOption
+
+    trait LogicalTypeOption
+    {
+        @Config("logical_type")
+        def getLogicalType: Optional[String]
+    }
 }
 
 class S3ParquetOutputPlugin
@@ -117,11 +135,26 @@ class S3ParquetOutputPlugin
         }
         task.getCatalog.ifPresent { catalog =>
             val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe { builder =>
+                val cOptions = task.getColumnOptions.asScala
+                val tOptions = task.getTypeOptions.asScala
+                schema.getColumns.asScala.foreach { c =>
+                    cOptions.get(c.getName)
+                    if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+                        builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
+                    }
+                    else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
+                        builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
+                    }
+                }
+                builder.result()
+            }
             val cr = CatalogRegistrator(aws = Aws(task),
                                         task = catalog,
                                         schema = schema,
                                         location = location,
-                                        compressionCodec = task.getCompressionCodec
+                                        compressionCodec = task.getCompressionCodec,
+                                        parquetColumnLogicalTypes = parquetColumnLogicalTypes)
             cr.run()
         }
 
@@ -148,9 +181,12 @@ class S3ParquetOutputPlugin
 
         // column_options
         task.getColumnOptions.forEach { (k: String,
-
+                                         opt: ColumnOptionTask) =>
             val c = schema.lookupColumn(k)
-
+            val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+            if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+                throw new ConfigException(s"column:$k is not 'timestamp' type.")
+            }
         }
 
         // canned_acl
@@ -198,9 +234,11 @@ class S3ParquetOutputPlugin
         val pageReader: PageReader = new PageReader(schema)
         val aws: Aws = Aws(task)
         val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
         val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
             .withPath(bufferFile)
             .withSchema(schema)
+            .withLogicalTypeHandlers(logicalTypeHandlers)
             .withTimestampFormatters(timestampFormatters)
             .withCompressionCodec(task.getCompressionCodec)
             .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))