embulk-output-s3_parquet 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +1 -0
- data/.github/workflows/release.yml +40 -0
- data/.github/workflows/test.yml +26 -0
- data/CHANGELOG.md +9 -0
- data/README.md +44 -7
- data/build.gradle +7 -8
- data/example/with_catalog.yml +36 -0
- data/example/with_logicaltypes.yml +31 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +29 -5
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +44 -6
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +11 -1
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +39 -11
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +145 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +107 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +4 -2
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +51 -34
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +34 -29
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +78 -0
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +162 -0
- metadata +23 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a5fcc051188467ff067c7542e3c12d32b9ce57e
+  data.tar.gz: c93d01c345e6e3a8b43f335f0467ee47532cc32d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 510bf2837f6c57e225b53084790dc0e79feef60247b73d8aee7cd268725676e0783e9ecee0cb2db4a3235969634802b2b8005208f36a4f1a86f1d15777ea9bb9
+  data.tar.gz: a1c23cbf8e5bc1c5414e4e906b0060cb5a7da9085767f319df57763724cfc07cda3925ba085015951dd3a5e40a9dba6a211d777474f00383b9d3f08d9f1d706a
data/.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
+github: civitaspo
data/.github/workflows/release.yml
ADDED
@@ -0,0 +1,40 @@
+name: Release CI
+
+on:
+  pull_request:
+    branches:
+    - master
+    types:
+    - closed
+
+jobs:
+  release:
+
+    runs-on: ubuntu-latest
+    services:
+      localstack:
+        image: localstack/localstack
+        ports:
+        - 4572:4572
+        env:
+          SERVICES: s3
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Test with Gradle
+      if: github.event.pull_request.merged == true
+      run: ./gradlew test
+    - name: Release the new gem
+      if: github.event.pull_request.merged == true
+      run: |
+        mkdir -p $HOME/.gem
+        touch $HOME/.gem/credentials
+        chmod 0600 $HOME/.gem/credentials
+        printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials
+        ./gradlew gemPush
+      env:
+        RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}}
data/.github/workflows/test.yml
ADDED
@@ -0,0 +1,26 @@
+name: Test CI
+
+on:
+- push
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    services:
+      localstack:
+        image: localstack/localstack
+        ports:
+        - 4572:4572
+        env:
+          SERVICES: s3
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Test with Gradle
+      run: ./gradlew test
+
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+0.1.0 (2019-11-17)
+==================
+
+* [New Feature] Support Logical Types older representations(OriginalTypes) #12
+* [Enhancement] Add Github Actions CI settings #13
+* [Enhancement] Support LogicalTypes for Glue Data Catalog #14
+* [Enhancement] Update dependencies #15
+* [New Feature] Support `auth_method: web_identity_token` #15
+
 0.0.3 (2019-07-17)
 ==================
 
data/README.md
CHANGED
@@ -1,5 +1,7 @@
 # S3 Parquet output plugin for Embulk
 
+
+
 [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3.
 
 ## Overview
@@ -22,12 +24,13 @@
 - **column_options**: a map whose keys are name of columns, and values are configuration with following parameters (optional)
   - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional)
   - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format**: is used. (string, optional)
+  - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
 - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`)
 - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB))
 - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB))
 - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB))
 - **enable_dictionary_encoding**: The boolean value is to enable/disable dictionary encoding. (boolean, default: `true`)
-- **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`,
+- **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, `"web_identity_token"`, default: `"default"`)
   - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
   - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables.
   - `"instance"`: uses EC2 instance profile or attached ECS task role.
@@ -44,6 +47,7 @@
   - `"anonymous"`: uses anonymous access. This auth method can access only public files.
   - `"session"`: uses temporary-generated **access_key_id**, **secret_access_key** and **session_token**.
   - `"assume_role"`: uses temporary-generated credentials by assuming **role_arn** role.
+  - `"web_identity_token"`: uses temporary-generated credentials by assuming **role_arn** role with web identity.
   - `"default"`: uses AWS SDK's default strategy to look up available credentials from runtime environment. This method behaves like the combination of the following methods.
     1. `"env"`
     1. `"properties"`
@@ -54,17 +58,42 @@
 - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
 - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional)
 - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional)
-- **role_arn**: arn of the role to assume. this is required for **auth_method** is `"assume_role"`. (string, optional)
-- **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"`. (string, optional)
+- **role_arn**: arn of the role to assume. this is required for **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
+- **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional)
 - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
-- **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+- **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional)
+- **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional)
 - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional)
 - **catalog**: Register a table if this option is specified (optional)
   - **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
   - **database**: The name of the database (string, required)
   - **table**: The name of the table (string, required)
   - **column_options**: a key-value pairs where key is a column name and value is options for the column. (string to options map, default: `{}`)
-    - **type**: type of
+    - **type**: type of column when this plugin creates new tables (e.g. `string`, `bigint`) (string, default: depends on the input embulk column type, or the parquet logical type. See the below table)
+
+|embulk column type|glue data type|
+|:---|:---|
+|long|bigint|
+|boolean|boolean|
+|double|double|
+|string|string|
+|timestamp|string|
+|json|string|
+
+|parquet logical type|glue data type|note|
+|:---|:---|:---|
+|timestamp-millis|timestamp||
+|timestamp-micros|long|Glue cannot recognize timestamp-micros.|
+|int8|tinyint||
+|int16|smallint||
+|int32|int||
+|int64|bigint||
+|uint8|smallint|Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1|
+|uint16|int|Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.|
+|uint32|bigint|Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.|
+|uint64|ConfigException|Glue bigint supports only a 64-bit signed integer.|
+|json|string||
+
 - **operation_if_exists**: operation if the table already exist. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`)
 - **endpoint**: The AWS Service endpoint (string, optional)
 - **region**: The AWS region (string, optional)
@@ -75,6 +104,8 @@
 - **user** proxy user (string, optional)
 - **password** proxy password (string, optional)
 - **buffer_dir**: buffer directory for parquet files to be uploaded on S3 (string, default: Create a Temporary Directory)
+- **type_options**: a map whose keys are name of embulk type(`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configuration with following parameters (optional)
+  - **logical_type**: a Parquet logical type name (`timestamp-millis`, `timestamp-micros`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional)
 
 
 ## Example
@@ -92,7 +123,8 @@ out:
 
 ## Note
 
-* The current
+* The current Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/2b38663/LogicalTypes.md) implementation does only old representation.
+* Some kind of LogicalTypes are sometimes not supported on your middleware. Be careful to giving logical type name.
 
 ## Development
 
@@ -106,6 +138,8 @@ $ embulk run example/config.yml -Ilib
 ### Run test:
 
 ```shell
+## Run fake S3 with localstack
+$ docker run -it --rm -p 4572:4572 -e SERVICES=s3 localstack/localstack
 $ ./gradlew test
 ```
 
@@ -121,9 +155,12 @@ Fix [build.gradle](./build.gradle), then
 
 ```shell
 $ ./gradlew gemPush
-
 ```
 
 ## ChangeLog
 
 [CHANGELOG.md](./CHANGELOG.md)
+
+## Contributors
+
+- @syucream
data/build.gradle
CHANGED
@@ -13,18 +13,18 @@ configurations {
     provided
 }
 
-version = "0.0.3"
+version = "0.1.0"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.
-    provided "org.embulk:embulk-core:0.9.
+    compile "org.embulk:embulk-core:0.9.20"
+    provided "org.embulk:embulk-core:0.9.20"
 
-    compile 'org.scala-lang:scala-library:2.13.
+    compile 'org.scala-lang:scala-library:2.13.1'
     ['glue', 's3', 'sts'].each { v ->
-        compile "com.amazonaws:aws-java-sdk-${v}:1.11.
+        compile "com.amazonaws:aws-java-sdk-${v}:1.11.676"
     }
     ['column', 'common', 'encoding', 'format', 'hadoop', 'jackson'].each { v ->
        compile "org.apache.parquet:parquet-${v}:1.10.1"
@@ -33,9 +33,8 @@ dependencies {
     compile 'org.xerial.snappy:snappy-java:1.1.7.3'
 
     testCompile 'org.scalatest:scalatest_2.13:3.0.8'
-    testCompile 'org.embulk:embulk-test:0.9.
-    testCompile 'org.embulk:embulk-standards:0.9.
-    testCompile 'cloud.localstack:localstack-utils:0.1.15'
+    testCompile 'org.embulk:embulk-test:0.9.20'
+    testCompile 'org.embulk:embulk-standards:0.9.20'
     testCompile 'org.apache.parquet:parquet-tools:1.10.1'
     testCompile 'org.apache.hadoop:hadoop-client:2.9.2'
 }
data/example/with_catalog.yml
ADDED
@@ -0,0 +1,36 @@
+
+in:
+  type: file
+  path_prefix: ./example/data.tsv
+  parser:
+    type: csv
+    delimiter: "\t"
+    skip_header_lines: 0
+    null_string: ""
+    columns:
+    - { name: id, type: long }
+    - { name: description, type: string }
+    - { name: name, type: string }
+    - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
+    - { name: payload, type: json}
+    stop_on_invalid_record: true
+
+out:
+  type: s3_parquet
+  bucket: dev-baikal-workspace
+  path_prefix: path/to/my-obj-2.
+  file_ext: snappy.parquet
+  compression_codec: snappy
+  default_timezone: Asia/Tokyo
+  canned_acl: bucket-owner-full-control
+  column_options:
+    id:
+      logical_type: "int64"
+    payload:
+      logical_type: "json"
+  type_options:
+    timestamp:
+      logical_type: "timestamp-millis"
+  catalog:
+    database: example_db
+    table: example_tbl
data/example/with_logicaltypes.yml
ADDED
@@ -0,0 +1,31 @@
+
+in:
+  type: file
+  path_prefix: ./example/data.tsv
+  parser:
+    type: csv
+    delimiter: "\t"
+    skip_header_lines: 0
+    null_string: ""
+    columns:
+    - { name: id, type: long }
+    - { name: description, type: string }
+    - { name: name, type: string }
+    - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"}
+    - { name: payload, type: json}
+    stop_on_invalid_record: true
+
+out:
+  type: s3_parquet
+  bucket: my-bucket
+  path_prefix: path/to/my-obj-2.
+  file_ext: snappy.parquet
+  compression_codec: snappy
+  default_timezone: Asia/Tokyo
+  canned_acl: bucket-owner-full-control
+  column_options:
+    id:
+      logical_type: "uint64"
+  type_options:
+    timestamp:
+      logical_type: "timestamp-millis"
data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
CHANGED
@@ -51,9 +51,10 @@ object CatalogRegistrator
               schema: Schema,
               location: String,
               compressionCodec: CompressionCodecName,
-              loggerOption: Option[Logger] = None
+              loggerOption: Option[Logger] = None,
+              parquetColumnLogicalTypes: Map[String, String] = Map.empty): CatalogRegistrator =
     {
-        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption)
+        new CatalogRegistrator(aws, task, schema, location, compressionCodec, loggerOption, parquetColumnLogicalTypes)
     }
 }
 
@@ -62,7 +63,8 @@ class CatalogRegistrator(aws: Aws,
                          schema: Schema,
                          location: String,
                          compressionCodec: CompressionCodecName,
-                         loggerOption: Option[Logger] = None
+                         loggerOption: Option[Logger] = None,
+                         parquetColumnLogicalTypes: Map[String, String] = Map.empty)
 {
     val logger: Logger = loggerOption.getOrElse(LoggerFactory.getLogger(classOf[CatalogRegistrator]))
 
@@ -150,14 +152,36 @@ class CatalogRegistrator(aws: Aws,
         schema.getColumns.asScala.toSeq.map { c =>
             val cType: String =
                 if (columnOptions.contains(c.getName)) columnOptions(c.getName).getType
-                else
+                else if (parquetColumnLogicalTypes.contains(c.getName)) convertParquetLogicalTypeToGlueType(parquetColumnLogicalTypes(c.getName))
+                else convertEmbulkTypeToGlueType(c.getType)
             new Column()
                 .withName(c.getName)
                 .withType(cType)
         }
     }
 
-    private def
+    private def convertParquetLogicalTypeToGlueType(t: String): String =
+    {
+        t match {
+            case "timestamp-millis" => "timestamp"
+            case "timestamp-micros" => "bigint" // Glue cannot recognize timestamp-micros.
+            case "int8" => "tinyint"
+            case "int16" => "smallint"
+            case "int32" => "int"
+            case "int64" => "bigint"
+            case "uint8" => "smallint" // Glue tinyint is a minimum value of -2^7 and a maximum value of 2^7-1
+            case "uint16" => "int" // Glue smallint is a minimum value of -2^15 and a maximum value of 2^15-1.
+            case "uint32" => "bigint" // Glue int is a minimum value of-2^31 and a maximum value of 2^31-1.
+            case "uint64" => throw new ConfigException("Cannot convert uint64 to Glue data types automatically" +
+                " because the Glue bigint supports a 64-bit signed integer." +
+                " Please use `catalog.column_options` to define the type.")
+            case "json" => "string"
+            case _ => throw new ConfigException(s"Unsupported a parquet logical type: $t. Please use `catalog.column_options` to define the type.")
+        }
+
+    }
+
+    private def convertEmbulkTypeToGlueType(t: Type): String =
     {
         t match {
             case _: BooleanType => "boolean"
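The `uint64` branch above raises a `ConfigException` because Glue has no unsigned 64-bit integer type; as the error message suggests, `catalog.column_options` can set the Glue column type explicitly. Below is a hedged sketch of such a configuration, with a hypothetical `decimal(20,0)` type choice and placeholder names; it is not taken from the release's example files.

```yaml
out:
  type: s3_parquet
  bucket: my-bucket                  # placeholder bucket
  path_prefix: path/to/my-obj.
  file_ext: snappy.parquet
  compression_codec: snappy
  column_options:
    id:
      logical_type: "uint64"         # column is written with the uint64 logical type in Parquet
  catalog:
    database: example_db
    table: example_tbl
    column_options:
      id:
        type: "decimal(20,0)"        # hypothetical Glue type chosen by hand; avoids the uint64 ConfigException
```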
data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
CHANGED
@@ -9,15 +9,18 @@ import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.embulk.config.{Config, ConfigDefault, ConfigDiff, ConfigException, ConfigSource, Task, TaskReport, TaskSource}
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.PluginTask
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, PluginTask}
 import org.embulk.output.s3_parquet.aws.Aws
-import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
+import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
 import org.embulk.spi.{Exec, OutputPlugin, PageReader, Schema, TransactionalPageOutput}
 import org.embulk.spi.time.TimestampFormatter
 import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.embulk.spi.util.Timestamps
 import org.slf4j.{Logger, LoggerFactory}
 
+import scala.jdk.CollectionConverters._
+import scala.util.chaining._
+
 
 object S3ParquetOutputPlugin
 {
@@ -53,7 +56,7 @@ object S3ParquetOutputPlugin
 
         @Config("column_options")
         @ConfigDefault("{}")
-        def getColumnOptions: JMap[String,
+        def getColumnOptions: JMap[String, ColumnOptionTask]
 
         @Config("canned_acl")
         @ConfigDefault("\"private\"")
@@ -86,8 +89,23 @@ object S3ParquetOutputPlugin
         @Config("catalog")
         @ConfigDefault("null")
         def getCatalog: Optional[CatalogRegistrator.Task]
+
+        @Config("type_options")
+        @ConfigDefault("{}")
+        def getTypeOptions: JMap[String, TypeOptionTask]
     }
 
+    trait ColumnOptionTask
+        extends Task with TimestampColumnOption with LogicalTypeOption
+
+    trait TypeOptionTask
+        extends Task with LogicalTypeOption
+
+    trait LogicalTypeOption
+    {
+        @Config("logical_type")
+        def getLogicalType: Optional[String]
+    }
 }
 
 class S3ParquetOutputPlugin
@@ -117,11 +135,26 @@ class S3ParquetOutputPlugin
         }
         task.getCatalog.ifPresent { catalog =>
             val location = s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}"
+            val parquetColumnLogicalTypes: Map[String, String] = Map.newBuilder[String, String].pipe {builder =>
+                val cOptions = task.getColumnOptions.asScala
+                val tOptions = task.getTypeOptions.asScala
+                schema.getColumns.asScala.foreach {c =>
+                    cOptions.get(c.getName)
+                    if (cOptions.contains(c.getName) && cOptions(c.getName).getLogicalType.isPresent) {
+                        builder.addOne(c.getName -> cOptions(c.getName).getLogicalType.get())
+                    }
+                    else if (tOptions.contains(c.getType.getName) && tOptions(c.getType.getName).getLogicalType.isPresent) {
+                        builder.addOne(c.getName -> tOptions(c.getType.getName).getLogicalType.get())
+                    }
+                }
+                builder.result()
+            }
             val cr = CatalogRegistrator(aws = Aws(task),
                 task = catalog,
                 schema = schema,
                 location = location,
-                compressionCodec = task.getCompressionCodec
+                compressionCodec = task.getCompressionCodec,
+                parquetColumnLogicalTypes = parquetColumnLogicalTypes)
             cr.run()
         }
 
@@ -148,9 +181,12 @@ class S3ParquetOutputPlugin
 
         // column_options
         task.getColumnOptions.forEach { (k: String,
-
+                                         opt: ColumnOptionTask) =>
             val c = schema.lookupColumn(k)
-
+            val useTimestampOption = opt.getFormat.isPresent || opt.getTimeZoneId.isPresent
+            if (!c.getType.getName.equals("timestamp") && useTimestampOption) {
+                throw new ConfigException(s"column:$k is not 'timestamp' type.")
+            }
         }
 
         // canned_acl
@@ -198,9 +234,11 @@ class S3ParquetOutputPlugin
         val pageReader: PageReader = new PageReader(schema)
         val aws: Aws = Aws(task)
         val timestampFormatters: Seq[TimestampFormatter] = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions).toSeq
+        val logicalTypeHandlers = LogicalTypeHandlerStore.fromEmbulkOptions(task.getTypeOptions, task.getColumnOptions)
         val parquetWriter: ParquetWriter[PageReader] = ParquetFileWriter.builder()
             .withPath(bufferFile)
             .withSchema(schema)
+            .withLogicalTypeHandlers(logicalTypeHandlers)
             .withTimestampFormatters(timestampFormatters)
             .withCompressionCodec(task.getCompressionCodec)
             .withDictionaryEncoding(task.getEnableDictionaryEncoding.orElse(ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED))