embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.output.s3_parquet.parquet
|
2
2
|
|
3
|
-
|
4
3
|
import org.embulk.spi.DataException
|
5
4
|
import org.embulk.spi.`type`.Types
|
6
5
|
import org.junit.runner.RunWith
|
@@ -9,70 +8,77 @@ import org.scalatestplus.junit.JUnitRunner
|
|
9
8
|
|
10
9
|
import scala.util.Try
|
11
10
|
|
12
|
-
|
13
11
|
@RunWith(classOf[JUnitRunner])
|
14
|
-
class TestLogicalTypeHandler
|
15
|
-
|
16
|
-
{
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
12
|
+
class TestLogicalTypeHandler extends FunSuite {
|
13
|
+
|
14
|
+
test("IntLogicalTypeHandler.isConvertible() returns true for long") {
|
15
|
+
val h = Int8LogicalTypeHandler
|
16
|
+
|
17
|
+
assert(h.isConvertible(Types.LONG))
|
18
|
+
assert(!h.isConvertible(Types.BOOLEAN))
|
19
|
+
}
|
20
|
+
|
21
|
+
test(
|
22
|
+
"IntLogicalTypeHandler.consume() raises DataException if given type is not long"
|
23
|
+
) {
|
24
|
+
val h = Int8LogicalTypeHandler
|
25
|
+
val actual = Try(h.consume("invalid", null))
|
26
|
+
|
27
|
+
assert(actual.isFailure)
|
28
|
+
assert(actual.failed.get.isInstanceOf[DataException])
|
29
|
+
}
|
30
|
+
|
31
|
+
test(
|
32
|
+
"TimestampMillisLogicalTypeHandler.isConvertible() returns true for timestamp"
|
33
|
+
) {
|
34
|
+
val h = TimestampMillisLogicalTypeHandler
|
35
|
+
|
36
|
+
assert(h.isConvertible(Types.TIMESTAMP))
|
37
|
+
assert(!h.isConvertible(Types.BOOLEAN))
|
38
|
+
}
|
39
|
+
|
40
|
+
test(
|
41
|
+
"TimestampMillisLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
|
42
|
+
) {
|
43
|
+
val h = TimestampMillisLogicalTypeHandler
|
44
|
+
val actual = Try(h.consume("invalid", null))
|
45
|
+
|
46
|
+
assert(actual.isFailure)
|
47
|
+
assert(actual.failed.get.isInstanceOf[DataException])
|
48
|
+
}
|
49
|
+
|
50
|
+
test(
|
51
|
+
"TimestampMicrosLogicalTypeHandler.isConvertible() returns true for timestamp"
|
52
|
+
) {
|
53
|
+
val h = TimestampMicrosLogicalTypeHandler
|
54
|
+
|
55
|
+
assert(h.isConvertible(Types.TIMESTAMP))
|
56
|
+
assert(!h.isConvertible(Types.BOOLEAN))
|
57
|
+
}
|
58
|
+
|
59
|
+
test(
|
60
|
+
"TimestampMicrosLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
|
61
|
+
) {
|
62
|
+
val h = TimestampMicrosLogicalTypeHandler
|
63
|
+
val actual = Try(h.consume("invalid", null))
|
64
|
+
|
65
|
+
assert(actual.isFailure)
|
66
|
+
assert(actual.failed.get.isInstanceOf[DataException])
|
67
|
+
}
|
68
|
+
|
69
|
+
test("JsonLogicalTypeHandler.isConvertible() returns true for json") {
|
70
|
+
val h = JsonLogicalTypeHandler
|
71
|
+
|
72
|
+
assert(h.isConvertible(Types.JSON))
|
73
|
+
assert(!h.isConvertible(Types.BOOLEAN))
|
74
|
+
}
|
75
|
+
|
76
|
+
test(
|
77
|
+
"JsonLogicalTypeHandler.consume() raises DataException if given type is not json"
|
78
|
+
) {
|
79
|
+
val h = JsonLogicalTypeHandler
|
80
|
+
val actual = Try(h.consume("invalid", null))
|
81
|
+
assert(actual.isFailure)
|
82
|
+
assert(actual.failed.get.isInstanceOf[DataException])
|
83
|
+
}
|
78
84
|
}
|
@@ -1,11 +1,13 @@
|
|
1
1
|
package org.embulk.output.s3_parquet.parquet
|
2
2
|
|
3
|
-
|
4
3
|
import java.util.Optional
|
5
4
|
|
6
5
|
import com.google.common.base.{Optional => GOptional}
|
7
6
|
import org.embulk.config.{ConfigException, TaskSource}
|
8
|
-
import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
|
7
|
+
import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
|
8
|
+
ColumnOptionTask,
|
9
|
+
TypeOptionTask
|
10
|
+
}
|
9
11
|
import org.embulk.spi.`type`.{Types, Type => EType}
|
10
12
|
import org.junit.runner.RunWith
|
11
13
|
import org.scalatest.FunSuite
|
@@ -14,149 +16,164 @@ import org.scalatestplus.junit.JUnitRunner
|
|
14
16
|
import scala.jdk.CollectionConverters._
|
15
17
|
import scala.util.Try
|
16
18
|
|
17
|
-
|
18
19
|
@RunWith(classOf[JUnitRunner])
|
19
|
-
class TestLogicalTypeHandlerStore
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
20
|
+
class TestLogicalTypeHandlerStore extends FunSuite {
|
21
|
+
test("empty() returns empty maps") {
|
22
|
+
val rv = LogicalTypeHandlerStore.empty
|
23
|
+
|
24
|
+
assert(rv.fromColumnName.isEmpty)
|
25
|
+
assert(rv.fromEmbulkType.isEmpty)
|
26
|
+
}
|
27
|
+
|
28
|
+
test("fromEmbulkOptions() returns handlers for valid option tasks") {
|
29
|
+
val typeOpts = Map[String, TypeOptionTask](
|
30
|
+
"timestamp" -> DummyTypeOptionTask(
|
31
|
+
Optional.of[String]("timestamp-millis")
|
32
|
+
)
|
33
|
+
).asJava
|
34
|
+
val columnOpts = Map[String, ColumnOptionTask](
|
35
|
+
"col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
|
36
|
+
).asJava
|
37
|
+
|
38
|
+
val expected1 = Map[EType, LogicalTypeHandler](
|
39
|
+
Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler
|
40
|
+
)
|
41
|
+
val expected2 = Map[String, LogicalTypeHandler](
|
42
|
+
"col1" -> TimestampMicrosLogicalTypeHandler
|
43
|
+
)
|
44
|
+
|
45
|
+
val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
46
|
+
|
47
|
+
assert(rv.fromEmbulkType == expected1)
|
48
|
+
assert(rv.fromColumnName == expected2)
|
49
|
+
}
|
50
|
+
|
51
|
+
test(
|
52
|
+
"fromEmbulkOptions() raises ConfigException if invalid option tasks given"
|
53
|
+
) {
|
54
|
+
val emptyTypeOpts = Map.empty[String, TypeOptionTask].asJava
|
55
|
+
val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
|
56
|
+
|
57
|
+
val invalidTypeOpts = Map[String, TypeOptionTask](
|
58
|
+
"unknown-embulk-type-name" -> DummyTypeOptionTask(
|
59
|
+
Optional.of[String]("timestamp-millis")
|
60
|
+
),
|
61
|
+
"timestamp" -> DummyTypeOptionTask(
|
62
|
+
Optional.of[String]("unknown-parquet-logical-type-name")
|
63
|
+
)
|
64
|
+
).asJava
|
65
|
+
val invalidColumnOpts = Map[String, ColumnOptionTask](
|
66
|
+
"col1" -> DummyColumnOptionTask(
|
67
|
+
Optional.of[String]("unknown-parquet-logical-type-name")
|
68
|
+
)
|
69
|
+
).asJava
|
70
|
+
|
71
|
+
val try1 = Try(
|
72
|
+
LogicalTypeHandlerStore
|
73
|
+
.fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts)
|
74
|
+
)
|
75
|
+
assert(try1.isFailure)
|
76
|
+
assert(try1.failed.get.isInstanceOf[ConfigException])
|
77
|
+
|
78
|
+
val try2 = Try(
|
79
|
+
LogicalTypeHandlerStore
|
80
|
+
.fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts)
|
81
|
+
)
|
82
|
+
assert(try2.isFailure)
|
83
|
+
assert(try2.failed.get.isInstanceOf[ConfigException])
|
84
|
+
|
85
|
+
val try3 = Try(
|
86
|
+
LogicalTypeHandlerStore
|
87
|
+
.fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts)
|
88
|
+
)
|
89
|
+
assert(try3.isFailure)
|
90
|
+
assert(try3.failed.get.isInstanceOf[ConfigException])
|
91
|
+
}
|
92
|
+
|
93
|
+
test("get() returns a handler matched with primary column name condition") {
|
94
|
+
val typeOpts = Map[String, TypeOptionTask](
|
95
|
+
"timestamp" -> DummyTypeOptionTask(
|
96
|
+
Optional.of[String]("timestamp-millis")
|
97
|
+
)
|
98
|
+
).asJava
|
99
|
+
val columnOpts = Map[String, ColumnOptionTask](
|
100
|
+
"col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
|
101
|
+
).asJava
|
102
|
+
|
103
|
+
val handlers =
|
104
|
+
LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
105
|
+
|
106
|
+
// It matches both of column name and embulk type, and column name should be primary
|
107
|
+
val expected = Some(TimestampMicrosLogicalTypeHandler)
|
108
|
+
val actual = handlers.get("col1", Types.TIMESTAMP)
|
109
|
+
|
110
|
+
assert(actual == expected)
|
111
|
+
}
|
112
|
+
|
113
|
+
test("get() returns a handler matched with type name condition") {
|
114
|
+
val typeOpts = Map[String, TypeOptionTask](
|
115
|
+
"timestamp" -> DummyTypeOptionTask(
|
116
|
+
Optional.of[String]("timestamp-millis")
|
117
|
+
)
|
118
|
+
).asJava
|
119
|
+
val columnOpts = Map.empty[String, ColumnOptionTask].asJava
|
120
|
+
|
121
|
+
val handlers =
|
122
|
+
LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
123
|
+
|
124
|
+
// It matches column name
|
125
|
+
val expected = Some(TimestampMillisLogicalTypeHandler)
|
126
|
+
val actual = handlers.get("col1", Types.TIMESTAMP)
|
127
|
+
|
128
|
+
assert(actual == expected)
|
129
|
+
}
|
130
|
+
|
131
|
+
test("get() returns None if not matched") {
|
132
|
+
val typeOpts = Map.empty[String, TypeOptionTask].asJava
|
133
|
+
val columnOpts = Map.empty[String, ColumnOptionTask].asJava
|
134
|
+
|
135
|
+
val handlers =
|
136
|
+
LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
137
|
+
|
138
|
+
// It matches embulk type
|
139
|
+
val actual = handlers.get("col1", Types.TIMESTAMP)
|
140
|
+
|
141
|
+
assert(actual.isEmpty)
|
142
|
+
}
|
143
|
+
|
144
|
+
private case class DummyTypeOptionTask(lt: Optional[String])
|
145
|
+
extends TypeOptionTask {
|
146
|
+
|
147
|
+
override def getLogicalType: Optional[String] = {
|
148
|
+
lt
|
27
149
|
}
|
28
150
|
|
29
|
-
|
30
|
-
val typeOpts = Map[String, TypeOptionTask](
|
31
|
-
"timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
|
32
|
-
).asJava
|
33
|
-
val columnOpts = Map[String, ColumnOptionTask](
|
34
|
-
"col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
|
35
|
-
).asJava
|
36
|
-
|
37
|
-
val expected1 = Map[EType, LogicalTypeHandler](
|
38
|
-
Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler,
|
39
|
-
)
|
40
|
-
val expected2 = Map[String, LogicalTypeHandler](
|
41
|
-
"col1" -> TimestampMicrosLogicalTypeHandler,
|
42
|
-
)
|
43
|
-
|
44
|
-
val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
45
|
-
|
46
|
-
assert(rv.fromEmbulkType == expected1)
|
47
|
-
assert(rv.fromColumnName == expected2)
|
48
|
-
}
|
151
|
+
override def validate(): Unit = {}
|
49
152
|
|
50
|
-
|
51
|
-
|
52
|
-
val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
|
53
|
-
|
54
|
-
val invalidTypeOpts = Map[String, TypeOptionTask](
|
55
|
-
"unknown-embulk-type-name" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
|
56
|
-
"timestamp" -> DummyTypeOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
|
57
|
-
).asJava
|
58
|
-
val invalidColumnOpts = Map[String, ColumnOptionTask](
|
59
|
-
"col1" -> DummyColumnOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
|
60
|
-
).asJava
|
61
|
-
|
62
|
-
val try1 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts))
|
63
|
-
assert(try1.isFailure)
|
64
|
-
assert(try1.failed.get.isInstanceOf[ConfigException])
|
65
|
-
|
66
|
-
val try2 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts))
|
67
|
-
assert(try2.isFailure)
|
68
|
-
assert(try2.failed.get.isInstanceOf[ConfigException])
|
69
|
-
|
70
|
-
val try3 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts))
|
71
|
-
assert(try3.isFailure)
|
72
|
-
assert(try3.failed.get.isInstanceOf[ConfigException])
|
153
|
+
override def dump(): TaskSource = {
|
154
|
+
null
|
73
155
|
}
|
156
|
+
}
|
74
157
|
|
75
|
-
|
76
|
-
|
77
|
-
"timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
|
78
|
-
).asJava
|
79
|
-
val columnOpts = Map[String, ColumnOptionTask](
|
80
|
-
"col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
|
81
|
-
).asJava
|
158
|
+
private case class DummyColumnOptionTask(lt: Optional[String])
|
159
|
+
extends ColumnOptionTask {
|
82
160
|
|
83
|
-
|
84
|
-
|
85
|
-
// It matches both of column name and embulk type, and column name should be primary
|
86
|
-
val expected = Some(TimestampMicrosLogicalTypeHandler)
|
87
|
-
val actual = handlers.get("col1", Types.TIMESTAMP)
|
88
|
-
|
89
|
-
assert(actual == expected)
|
161
|
+
override def getTimeZoneId: GOptional[String] = {
|
162
|
+
GOptional.absent[String]
|
90
163
|
}
|
91
164
|
|
92
|
-
|
93
|
-
|
94
|
-
"timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
|
95
|
-
).asJava
|
96
|
-
val columnOpts = Map.empty[String, ColumnOptionTask].asJava
|
97
|
-
|
98
|
-
val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
99
|
-
|
100
|
-
// It matches column name
|
101
|
-
val expected = Some(TimestampMillisLogicalTypeHandler)
|
102
|
-
val actual = handlers.get("col1", Types.TIMESTAMP)
|
103
|
-
|
104
|
-
assert(actual == expected)
|
165
|
+
override def getFormat: GOptional[String] = {
|
166
|
+
GOptional.absent[String]
|
105
167
|
}
|
106
168
|
|
107
|
-
|
108
|
-
|
109
|
-
val columnOpts = Map.empty[String, ColumnOptionTask].asJava
|
110
|
-
|
111
|
-
val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
|
112
|
-
|
113
|
-
// It matches embulk type
|
114
|
-
val actual = handlers.get("col1", Types.TIMESTAMP)
|
115
|
-
|
116
|
-
assert(actual.isEmpty)
|
169
|
+
override def getLogicalType: Optional[String] = {
|
170
|
+
lt
|
117
171
|
}
|
118
172
|
|
119
|
-
|
120
|
-
extends TypeOptionTask
|
121
|
-
{
|
122
|
-
override def getLogicalType: Optional[String] =
|
123
|
-
{
|
124
|
-
lt
|
125
|
-
}
|
126
|
-
|
127
|
-
override def validate(): Unit =
|
128
|
-
{}
|
129
|
-
|
130
|
-
override def dump(): TaskSource =
|
131
|
-
{
|
132
|
-
null
|
133
|
-
}
|
134
|
-
}
|
173
|
+
override def validate(): Unit = {}
|
135
174
|
|
136
|
-
|
137
|
-
|
138
|
-
{
|
139
|
-
override def getTimeZoneId: GOptional[String] =
|
140
|
-
{
|
141
|
-
GOptional.absent[String]
|
142
|
-
}
|
143
|
-
|
144
|
-
override def getFormat: GOptional[String] =
|
145
|
-
{
|
146
|
-
GOptional.absent[String]
|
147
|
-
}
|
148
|
-
|
149
|
-
override def getLogicalType: Optional[String] =
|
150
|
-
{
|
151
|
-
lt
|
152
|
-
}
|
153
|
-
|
154
|
-
override def validate(): Unit =
|
155
|
-
{}
|
156
|
-
|
157
|
-
override def dump(): TaskSource =
|
158
|
-
{
|
159
|
-
null
|
160
|
-
}
|
175
|
+
override def dump(): TaskSource = {
|
176
|
+
null
|
161
177
|
}
|
178
|
+
}
|
162
179
|
}
|