embulk-output-s3_parquet 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
@@ -1,6 +1,5 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import org.embulk.spi.DataException
5
4
  import org.embulk.spi.`type`.Types
6
5
  import org.junit.runner.RunWith
@@ -9,70 +8,77 @@ import org.scalatestplus.junit.JUnitRunner
9
8
 
10
9
  import scala.util.Try
11
10
 
12
-
13
11
  @RunWith(classOf[JUnitRunner])
14
- class TestLogicalTypeHandler
15
- extends FunSuite
16
- {
17
-
18
- test("IntLogicalTypeHandler.isConvertible() returns true for long") {
19
- val h = Int8LogicalTypeHandler
20
-
21
- assert(h.isConvertible(Types.LONG))
22
- assert(!h.isConvertible(Types.BOOLEAN))
23
- }
24
-
25
- test("IntLogicalTypeHandler.consume() raises DataException if given type is not long") {
26
- val h = Int8LogicalTypeHandler
27
- val actual = Try(h.consume("invalid", null))
28
-
29
- assert(actual.isFailure)
30
- assert(actual.failed.get.isInstanceOf[DataException])
31
- }
32
-
33
-
34
- test("TimestampMillisLogicalTypeHandler.isConvertible() returns true for timestamp") {
35
- val h = TimestampMillisLogicalTypeHandler
36
-
37
- assert(h.isConvertible(Types.TIMESTAMP))
38
- assert(!h.isConvertible(Types.BOOLEAN))
39
- }
40
-
41
- test("TimestampMillisLogicalTypeHandler.consume() raises DataException if given type is not timestamp") {
42
- val h = TimestampMillisLogicalTypeHandler
43
- val actual = Try(h.consume("invalid", null))
44
-
45
- assert(actual.isFailure)
46
- assert(actual.failed.get.isInstanceOf[DataException])
47
- }
48
-
49
-
50
- test("TimestampMicrosLogicalTypeHandler.isConvertible() returns true for timestamp") {
51
- val h = TimestampMicrosLogicalTypeHandler
52
-
53
- assert(h.isConvertible(Types.TIMESTAMP))
54
- assert(!h.isConvertible(Types.BOOLEAN))
55
- }
56
-
57
- test("TimestampMicrosLogicalTypeHandler.consume() raises DataException if given type is not timestamp") {
58
- val h = TimestampMicrosLogicalTypeHandler
59
- val actual = Try(h.consume("invalid", null))
60
-
61
- assert(actual.isFailure)
62
- assert(actual.failed.get.isInstanceOf[DataException])
63
- }
64
-
65
- test("JsonLogicalTypeHandler.isConvertible() returns true for json") {
66
- val h = JsonLogicalTypeHandler
67
-
68
- assert(h.isConvertible(Types.JSON))
69
- assert(!h.isConvertible(Types.BOOLEAN))
70
- }
71
-
72
- test("JsonLogicalTypeHandler.consume() raises DataException if given type is not json") {
73
- val h = JsonLogicalTypeHandler
74
- val actual = Try(h.consume("invalid", null))
75
- assert(actual.isFailure)
76
- assert(actual.failed.get.isInstanceOf[DataException])
77
- }
12
+ class TestLogicalTypeHandler extends FunSuite {
13
+
14
+ test("IntLogicalTypeHandler.isConvertible() returns true for long") {
15
+ val h = Int8LogicalTypeHandler
16
+
17
+ assert(h.isConvertible(Types.LONG))
18
+ assert(!h.isConvertible(Types.BOOLEAN))
19
+ }
20
+
21
+ test(
22
+ "IntLogicalTypeHandler.consume() raises DataException if given type is not long"
23
+ ) {
24
+ val h = Int8LogicalTypeHandler
25
+ val actual = Try(h.consume("invalid", null))
26
+
27
+ assert(actual.isFailure)
28
+ assert(actual.failed.get.isInstanceOf[DataException])
29
+ }
30
+
31
+ test(
32
+ "TimestampMillisLogicalTypeHandler.isConvertible() returns true for timestamp"
33
+ ) {
34
+ val h = TimestampMillisLogicalTypeHandler
35
+
36
+ assert(h.isConvertible(Types.TIMESTAMP))
37
+ assert(!h.isConvertible(Types.BOOLEAN))
38
+ }
39
+
40
+ test(
41
+ "TimestampMillisLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
42
+ ) {
43
+ val h = TimestampMillisLogicalTypeHandler
44
+ val actual = Try(h.consume("invalid", null))
45
+
46
+ assert(actual.isFailure)
47
+ assert(actual.failed.get.isInstanceOf[DataException])
48
+ }
49
+
50
+ test(
51
+ "TimestampMicrosLogicalTypeHandler.isConvertible() returns true for timestamp"
52
+ ) {
53
+ val h = TimestampMicrosLogicalTypeHandler
54
+
55
+ assert(h.isConvertible(Types.TIMESTAMP))
56
+ assert(!h.isConvertible(Types.BOOLEAN))
57
+ }
58
+
59
+ test(
60
+ "TimestampMicrosLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
61
+ ) {
62
+ val h = TimestampMicrosLogicalTypeHandler
63
+ val actual = Try(h.consume("invalid", null))
64
+
65
+ assert(actual.isFailure)
66
+ assert(actual.failed.get.isInstanceOf[DataException])
67
+ }
68
+
69
+ test("JsonLogicalTypeHandler.isConvertible() returns true for json") {
70
+ val h = JsonLogicalTypeHandler
71
+
72
+ assert(h.isConvertible(Types.JSON))
73
+ assert(!h.isConvertible(Types.BOOLEAN))
74
+ }
75
+
76
+ test(
77
+ "JsonLogicalTypeHandler.consume() raises DataException if given type is not json"
78
+ ) {
79
+ val h = JsonLogicalTypeHandler
80
+ val actual = Try(h.consume("invalid", null))
81
+ assert(actual.isFailure)
82
+ assert(actual.failed.get.isInstanceOf[DataException])
83
+ }
78
84
  }
@@ -1,11 +1,13 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.google.common.base.{Optional => GOptional}
7
6
  import org.embulk.config.{ConfigException, TaskSource}
8
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, TypeOptionTask}
7
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
8
+ ColumnOptionTask,
9
+ TypeOptionTask
10
+ }
9
11
  import org.embulk.spi.`type`.{Types, Type => EType}
10
12
  import org.junit.runner.RunWith
11
13
  import org.scalatest.FunSuite
@@ -14,149 +16,164 @@ import org.scalatestplus.junit.JUnitRunner
14
16
  import scala.jdk.CollectionConverters._
15
17
  import scala.util.Try
16
18
 
17
-
18
19
  @RunWith(classOf[JUnitRunner])
19
- class TestLogicalTypeHandlerStore
20
- extends FunSuite
21
- {
22
- test("empty() returns empty maps") {
23
- val rv = LogicalTypeHandlerStore.empty
24
-
25
- assert(rv.fromColumnName.isEmpty)
26
- assert(rv.fromEmbulkType.isEmpty)
20
+ class TestLogicalTypeHandlerStore extends FunSuite {
21
+ test("empty() returns empty maps") {
22
+ val rv = LogicalTypeHandlerStore.empty
23
+
24
+ assert(rv.fromColumnName.isEmpty)
25
+ assert(rv.fromEmbulkType.isEmpty)
26
+ }
27
+
28
+ test("fromEmbulkOptions() returns handlers for valid option tasks") {
29
+ val typeOpts = Map[String, TypeOptionTask](
30
+ "timestamp" -> DummyTypeOptionTask(
31
+ Optional.of[String]("timestamp-millis")
32
+ )
33
+ ).asJava
34
+ val columnOpts = Map[String, ColumnOptionTask](
35
+ "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
36
+ ).asJava
37
+
38
+ val expected1 = Map[EType, LogicalTypeHandler](
39
+ Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler
40
+ )
41
+ val expected2 = Map[String, LogicalTypeHandler](
42
+ "col1" -> TimestampMicrosLogicalTypeHandler
43
+ )
44
+
45
+ val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
46
+
47
+ assert(rv.fromEmbulkType == expected1)
48
+ assert(rv.fromColumnName == expected2)
49
+ }
50
+
51
+ test(
52
+ "fromEmbulkOptions() raises ConfigException if invalid option tasks given"
53
+ ) {
54
+ val emptyTypeOpts = Map.empty[String, TypeOptionTask].asJava
55
+ val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
56
+
57
+ val invalidTypeOpts = Map[String, TypeOptionTask](
58
+ "unknown-embulk-type-name" -> DummyTypeOptionTask(
59
+ Optional.of[String]("timestamp-millis")
60
+ ),
61
+ "timestamp" -> DummyTypeOptionTask(
62
+ Optional.of[String]("unknown-parquet-logical-type-name")
63
+ )
64
+ ).asJava
65
+ val invalidColumnOpts = Map[String, ColumnOptionTask](
66
+ "col1" -> DummyColumnOptionTask(
67
+ Optional.of[String]("unknown-parquet-logical-type-name")
68
+ )
69
+ ).asJava
70
+
71
+ val try1 = Try(
72
+ LogicalTypeHandlerStore
73
+ .fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts)
74
+ )
75
+ assert(try1.isFailure)
76
+ assert(try1.failed.get.isInstanceOf[ConfigException])
77
+
78
+ val try2 = Try(
79
+ LogicalTypeHandlerStore
80
+ .fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts)
81
+ )
82
+ assert(try2.isFailure)
83
+ assert(try2.failed.get.isInstanceOf[ConfigException])
84
+
85
+ val try3 = Try(
86
+ LogicalTypeHandlerStore
87
+ .fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts)
88
+ )
89
+ assert(try3.isFailure)
90
+ assert(try3.failed.get.isInstanceOf[ConfigException])
91
+ }
92
+
93
+ test("get() returns a handler matched with primary column name condition") {
94
+ val typeOpts = Map[String, TypeOptionTask](
95
+ "timestamp" -> DummyTypeOptionTask(
96
+ Optional.of[String]("timestamp-millis")
97
+ )
98
+ ).asJava
99
+ val columnOpts = Map[String, ColumnOptionTask](
100
+ "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
101
+ ).asJava
102
+
103
+ val handlers =
104
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
105
+
106
+ // It matches both of column name and embulk type, and column name should be primary
107
+ val expected = Some(TimestampMicrosLogicalTypeHandler)
108
+ val actual = handlers.get("col1", Types.TIMESTAMP)
109
+
110
+ assert(actual == expected)
111
+ }
112
+
113
+ test("get() returns a handler matched with type name condition") {
114
+ val typeOpts = Map[String, TypeOptionTask](
115
+ "timestamp" -> DummyTypeOptionTask(
116
+ Optional.of[String]("timestamp-millis")
117
+ )
118
+ ).asJava
119
+ val columnOpts = Map.empty[String, ColumnOptionTask].asJava
120
+
121
+ val handlers =
122
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
123
+
124
+ // It matches column name
125
+ val expected = Some(TimestampMillisLogicalTypeHandler)
126
+ val actual = handlers.get("col1", Types.TIMESTAMP)
127
+
128
+ assert(actual == expected)
129
+ }
130
+
131
+ test("get() returns None if not matched") {
132
+ val typeOpts = Map.empty[String, TypeOptionTask].asJava
133
+ val columnOpts = Map.empty[String, ColumnOptionTask].asJava
134
+
135
+ val handlers =
136
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
137
+
138
+ // It matches embulk type
139
+ val actual = handlers.get("col1", Types.TIMESTAMP)
140
+
141
+ assert(actual.isEmpty)
142
+ }
143
+
144
+ private case class DummyTypeOptionTask(lt: Optional[String])
145
+ extends TypeOptionTask {
146
+
147
+ override def getLogicalType: Optional[String] = {
148
+ lt
27
149
  }
28
150
 
29
- test("fromEmbulkOptions() returns handlers for valid option tasks") {
30
- val typeOpts = Map[String, TypeOptionTask](
31
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
32
- ).asJava
33
- val columnOpts = Map[String, ColumnOptionTask](
34
- "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
35
- ).asJava
36
-
37
- val expected1 = Map[EType, LogicalTypeHandler](
38
- Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler,
39
- )
40
- val expected2 = Map[String, LogicalTypeHandler](
41
- "col1" -> TimestampMicrosLogicalTypeHandler,
42
- )
43
-
44
- val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
45
-
46
- assert(rv.fromEmbulkType == expected1)
47
- assert(rv.fromColumnName == expected2)
48
- }
151
+ override def validate(): Unit = {}
49
152
 
50
- test("fromEmbulkOptions() raises ConfigException if invalid option tasks given") {
51
- val emptyTypeOpts = Map.empty[String, TypeOptionTask].asJava
52
- val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
53
-
54
- val invalidTypeOpts = Map[String, TypeOptionTask](
55
- "unknown-embulk-type-name" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
56
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
57
- ).asJava
58
- val invalidColumnOpts = Map[String, ColumnOptionTask](
59
- "col1" -> DummyColumnOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
60
- ).asJava
61
-
62
- val try1 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts))
63
- assert(try1.isFailure)
64
- assert(try1.failed.get.isInstanceOf[ConfigException])
65
-
66
- val try2 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts))
67
- assert(try2.isFailure)
68
- assert(try2.failed.get.isInstanceOf[ConfigException])
69
-
70
- val try3 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts))
71
- assert(try3.isFailure)
72
- assert(try3.failed.get.isInstanceOf[ConfigException])
153
+ override def dump(): TaskSource = {
154
+ null
73
155
  }
156
+ }
74
157
 
75
- test("get() returns a handler matched with primary column name condition") {
76
- val typeOpts = Map[String, TypeOptionTask](
77
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
78
- ).asJava
79
- val columnOpts = Map[String, ColumnOptionTask](
80
- "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
81
- ).asJava
158
+ private case class DummyColumnOptionTask(lt: Optional[String])
159
+ extends ColumnOptionTask {
82
160
 
83
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
84
-
85
- // It matches both of column name and embulk type, and column name should be primary
86
- val expected = Some(TimestampMicrosLogicalTypeHandler)
87
- val actual = handlers.get("col1", Types.TIMESTAMP)
88
-
89
- assert(actual == expected)
161
+ override def getTimeZoneId: GOptional[String] = {
162
+ GOptional.absent[String]
90
163
  }
91
164
 
92
- test("get() returns a handler matched with type name condition") {
93
- val typeOpts = Map[String, TypeOptionTask](
94
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
95
- ).asJava
96
- val columnOpts = Map.empty[String, ColumnOptionTask].asJava
97
-
98
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
99
-
100
- // It matches column name
101
- val expected = Some(TimestampMillisLogicalTypeHandler)
102
- val actual = handlers.get("col1", Types.TIMESTAMP)
103
-
104
- assert(actual == expected)
165
+ override def getFormat: GOptional[String] = {
166
+ GOptional.absent[String]
105
167
  }
106
168
 
107
- test("get() returns None if not matched") {
108
- val typeOpts = Map.empty[String, TypeOptionTask].asJava
109
- val columnOpts = Map.empty[String, ColumnOptionTask].asJava
110
-
111
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
112
-
113
- // It matches embulk type
114
- val actual = handlers.get("col1", Types.TIMESTAMP)
115
-
116
- assert(actual.isEmpty)
169
+ override def getLogicalType: Optional[String] = {
170
+ lt
117
171
  }
118
172
 
119
- private case class DummyTypeOptionTask(lt: Optional[String])
120
- extends TypeOptionTask
121
- {
122
- override def getLogicalType: Optional[String] =
123
- {
124
- lt
125
- }
126
-
127
- override def validate(): Unit =
128
- {}
129
-
130
- override def dump(): TaskSource =
131
- {
132
- null
133
- }
134
- }
173
+ override def validate(): Unit = {}
135
174
 
136
- private case class DummyColumnOptionTask(lt: Optional[String])
137
- extends ColumnOptionTask
138
- {
139
- override def getTimeZoneId: GOptional[String] =
140
- {
141
- GOptional.absent[String]
142
- }
143
-
144
- override def getFormat: GOptional[String] =
145
- {
146
- GOptional.absent[String]
147
- }
148
-
149
- override def getLogicalType: Optional[String] =
150
- {
151
- lt
152
- }
153
-
154
- override def validate(): Unit =
155
- {}
156
-
157
- override def dump(): TaskSource =
158
- {
159
- null
160
- }
175
+ override def dump(): TaskSource = {
176
+ null
161
177
  }
178
+ }
162
179
  }