embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
@@ -1,6 +1,5 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import org.embulk.spi.DataException
5
4
  import org.embulk.spi.`type`.Types
6
5
  import org.junit.runner.RunWith
@@ -9,70 +8,77 @@ import org.scalatestplus.junit.JUnitRunner
9
8
 
10
9
  import scala.util.Try
11
10
 
12
-
13
11
  @RunWith(classOf[JUnitRunner])
14
- class TestLogicalTypeHandler
15
- extends FunSuite
16
- {
17
-
18
- test("IntLogicalTypeHandler.isConvertible() returns true for long") {
19
- val h = Int8LogicalTypeHandler
20
-
21
- assert(h.isConvertible(Types.LONG))
22
- assert(!h.isConvertible(Types.BOOLEAN))
23
- }
24
-
25
- test("IntLogicalTypeHandler.consume() raises DataException if given type is not long") {
26
- val h = Int8LogicalTypeHandler
27
- val actual = Try(h.consume("invalid", null))
28
-
29
- assert(actual.isFailure)
30
- assert(actual.failed.get.isInstanceOf[DataException])
31
- }
32
-
33
-
34
- test("TimestampMillisLogicalTypeHandler.isConvertible() returns true for timestamp") {
35
- val h = TimestampMillisLogicalTypeHandler
36
-
37
- assert(h.isConvertible(Types.TIMESTAMP))
38
- assert(!h.isConvertible(Types.BOOLEAN))
39
- }
40
-
41
- test("TimestampMillisLogicalTypeHandler.consume() raises DataException if given type is not timestamp") {
42
- val h = TimestampMillisLogicalTypeHandler
43
- val actual = Try(h.consume("invalid", null))
44
-
45
- assert(actual.isFailure)
46
- assert(actual.failed.get.isInstanceOf[DataException])
47
- }
48
-
49
-
50
- test("TimestampMicrosLogicalTypeHandler.isConvertible() returns true for timestamp") {
51
- val h = TimestampMicrosLogicalTypeHandler
52
-
53
- assert(h.isConvertible(Types.TIMESTAMP))
54
- assert(!h.isConvertible(Types.BOOLEAN))
55
- }
56
-
57
- test("TimestampMicrosLogicalTypeHandler.consume() raises DataException if given type is not timestamp") {
58
- val h = TimestampMicrosLogicalTypeHandler
59
- val actual = Try(h.consume("invalid", null))
60
-
61
- assert(actual.isFailure)
62
- assert(actual.failed.get.isInstanceOf[DataException])
63
- }
64
-
65
- test("JsonLogicalTypeHandler.isConvertible() returns true for json") {
66
- val h = JsonLogicalTypeHandler
67
-
68
- assert(h.isConvertible(Types.JSON))
69
- assert(!h.isConvertible(Types.BOOLEAN))
70
- }
71
-
72
- test("JsonLogicalTypeHandler.consume() raises DataException if given type is not json") {
73
- val h = JsonLogicalTypeHandler
74
- val actual = Try(h.consume("invalid", null))
75
- assert(actual.isFailure)
76
- assert(actual.failed.get.isInstanceOf[DataException])
77
- }
12
+ class TestLogicalTypeHandler extends FunSuite {
13
+
14
+ test("IntLogicalTypeHandler.isConvertible() returns true for long") {
15
+ val h = Int8LogicalTypeHandler
16
+
17
+ assert(h.isConvertible(Types.LONG))
18
+ assert(!h.isConvertible(Types.BOOLEAN))
19
+ }
20
+
21
+ test(
22
+ "IntLogicalTypeHandler.consume() raises DataException if given type is not long"
23
+ ) {
24
+ val h = Int8LogicalTypeHandler
25
+ val actual = Try(h.consume("invalid", null))
26
+
27
+ assert(actual.isFailure)
28
+ assert(actual.failed.get.isInstanceOf[DataException])
29
+ }
30
+
31
+ test(
32
+ "TimestampMillisLogicalTypeHandler.isConvertible() returns true for timestamp"
33
+ ) {
34
+ val h = TimestampMillisLogicalTypeHandler
35
+
36
+ assert(h.isConvertible(Types.TIMESTAMP))
37
+ assert(!h.isConvertible(Types.BOOLEAN))
38
+ }
39
+
40
+ test(
41
+ "TimestampMillisLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
42
+ ) {
43
+ val h = TimestampMillisLogicalTypeHandler
44
+ val actual = Try(h.consume("invalid", null))
45
+
46
+ assert(actual.isFailure)
47
+ assert(actual.failed.get.isInstanceOf[DataException])
48
+ }
49
+
50
+ test(
51
+ "TimestampMicrosLogicalTypeHandler.isConvertible() returns true for timestamp"
52
+ ) {
53
+ val h = TimestampMicrosLogicalTypeHandler
54
+
55
+ assert(h.isConvertible(Types.TIMESTAMP))
56
+ assert(!h.isConvertible(Types.BOOLEAN))
57
+ }
58
+
59
+ test(
60
+ "TimestampMicrosLogicalTypeHandler.consume() raises DataException if given type is not timestamp"
61
+ ) {
62
+ val h = TimestampMicrosLogicalTypeHandler
63
+ val actual = Try(h.consume("invalid", null))
64
+
65
+ assert(actual.isFailure)
66
+ assert(actual.failed.get.isInstanceOf[DataException])
67
+ }
68
+
69
+ test("JsonLogicalTypeHandler.isConvertible() returns true for json") {
70
+ val h = JsonLogicalTypeHandler
71
+
72
+ assert(h.isConvertible(Types.JSON))
73
+ assert(!h.isConvertible(Types.BOOLEAN))
74
+ }
75
+
76
+ test(
77
+ "JsonLogicalTypeHandler.consume() raises DataException if given type is not json"
78
+ ) {
79
+ val h = JsonLogicalTypeHandler
80
+ val actual = Try(h.consume("invalid", null))
81
+ assert(actual.isFailure)
82
+ assert(actual.failed.get.isInstanceOf[DataException])
83
+ }
78
84
  }
@@ -1,11 +1,13 @@
1
1
  package org.embulk.output.s3_parquet.parquet
2
2
 
3
-
4
3
  import java.util.Optional
5
4
 
6
5
  import com.google.common.base.{Optional => GOptional}
7
6
  import org.embulk.config.{ConfigException, TaskSource}
8
- import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, TypeOptionTask}
7
+ import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
8
+ ColumnOptionTask,
9
+ TypeOptionTask
10
+ }
9
11
  import org.embulk.spi.`type`.{Types, Type => EType}
10
12
  import org.junit.runner.RunWith
11
13
  import org.scalatest.FunSuite
@@ -14,149 +16,164 @@ import org.scalatestplus.junit.JUnitRunner
14
16
  import scala.jdk.CollectionConverters._
15
17
  import scala.util.Try
16
18
 
17
-
18
19
  @RunWith(classOf[JUnitRunner])
19
- class TestLogicalTypeHandlerStore
20
- extends FunSuite
21
- {
22
- test("empty() returns empty maps") {
23
- val rv = LogicalTypeHandlerStore.empty
24
-
25
- assert(rv.fromColumnName.isEmpty)
26
- assert(rv.fromEmbulkType.isEmpty)
20
+ class TestLogicalTypeHandlerStore extends FunSuite {
21
+ test("empty() returns empty maps") {
22
+ val rv = LogicalTypeHandlerStore.empty
23
+
24
+ assert(rv.fromColumnName.isEmpty)
25
+ assert(rv.fromEmbulkType.isEmpty)
26
+ }
27
+
28
+ test("fromEmbulkOptions() returns handlers for valid option tasks") {
29
+ val typeOpts = Map[String, TypeOptionTask](
30
+ "timestamp" -> DummyTypeOptionTask(
31
+ Optional.of[String]("timestamp-millis")
32
+ )
33
+ ).asJava
34
+ val columnOpts = Map[String, ColumnOptionTask](
35
+ "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
36
+ ).asJava
37
+
38
+ val expected1 = Map[EType, LogicalTypeHandler](
39
+ Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler
40
+ )
41
+ val expected2 = Map[String, LogicalTypeHandler](
42
+ "col1" -> TimestampMicrosLogicalTypeHandler
43
+ )
44
+
45
+ val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
46
+
47
+ assert(rv.fromEmbulkType == expected1)
48
+ assert(rv.fromColumnName == expected2)
49
+ }
50
+
51
+ test(
52
+ "fromEmbulkOptions() raises ConfigException if invalid option tasks given"
53
+ ) {
54
+ val emptyTypeOpts = Map.empty[String, TypeOptionTask].asJava
55
+ val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
56
+
57
+ val invalidTypeOpts = Map[String, TypeOptionTask](
58
+ "unknown-embulk-type-name" -> DummyTypeOptionTask(
59
+ Optional.of[String]("timestamp-millis")
60
+ ),
61
+ "timestamp" -> DummyTypeOptionTask(
62
+ Optional.of[String]("unknown-parquet-logical-type-name")
63
+ )
64
+ ).asJava
65
+ val invalidColumnOpts = Map[String, ColumnOptionTask](
66
+ "col1" -> DummyColumnOptionTask(
67
+ Optional.of[String]("unknown-parquet-logical-type-name")
68
+ )
69
+ ).asJava
70
+
71
+ val try1 = Try(
72
+ LogicalTypeHandlerStore
73
+ .fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts)
74
+ )
75
+ assert(try1.isFailure)
76
+ assert(try1.failed.get.isInstanceOf[ConfigException])
77
+
78
+ val try2 = Try(
79
+ LogicalTypeHandlerStore
80
+ .fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts)
81
+ )
82
+ assert(try2.isFailure)
83
+ assert(try2.failed.get.isInstanceOf[ConfigException])
84
+
85
+ val try3 = Try(
86
+ LogicalTypeHandlerStore
87
+ .fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts)
88
+ )
89
+ assert(try3.isFailure)
90
+ assert(try3.failed.get.isInstanceOf[ConfigException])
91
+ }
92
+
93
+ test("get() returns a handler matched with primary column name condition") {
94
+ val typeOpts = Map[String, TypeOptionTask](
95
+ "timestamp" -> DummyTypeOptionTask(
96
+ Optional.of[String]("timestamp-millis")
97
+ )
98
+ ).asJava
99
+ val columnOpts = Map[String, ColumnOptionTask](
100
+ "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros"))
101
+ ).asJava
102
+
103
+ val handlers =
104
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
105
+
106
+ // It matches both of column name and embulk type, and column name should be primary
107
+ val expected = Some(TimestampMicrosLogicalTypeHandler)
108
+ val actual = handlers.get("col1", Types.TIMESTAMP)
109
+
110
+ assert(actual == expected)
111
+ }
112
+
113
+ test("get() returns a handler matched with type name condition") {
114
+ val typeOpts = Map[String, TypeOptionTask](
115
+ "timestamp" -> DummyTypeOptionTask(
116
+ Optional.of[String]("timestamp-millis")
117
+ )
118
+ ).asJava
119
+ val columnOpts = Map.empty[String, ColumnOptionTask].asJava
120
+
121
+ val handlers =
122
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
123
+
124
+ // It matches column name
125
+ val expected = Some(TimestampMillisLogicalTypeHandler)
126
+ val actual = handlers.get("col1", Types.TIMESTAMP)
127
+
128
+ assert(actual == expected)
129
+ }
130
+
131
+ test("get() returns None if not matched") {
132
+ val typeOpts = Map.empty[String, TypeOptionTask].asJava
133
+ val columnOpts = Map.empty[String, ColumnOptionTask].asJava
134
+
135
+ val handlers =
136
+ LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
137
+
138
+ // It matches embulk type
139
+ val actual = handlers.get("col1", Types.TIMESTAMP)
140
+
141
+ assert(actual.isEmpty)
142
+ }
143
+
144
+ private case class DummyTypeOptionTask(lt: Optional[String])
145
+ extends TypeOptionTask {
146
+
147
+ override def getLogicalType: Optional[String] = {
148
+ lt
27
149
  }
28
150
 
29
- test("fromEmbulkOptions() returns handlers for valid option tasks") {
30
- val typeOpts = Map[String, TypeOptionTask](
31
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
32
- ).asJava
33
- val columnOpts = Map[String, ColumnOptionTask](
34
- "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
35
- ).asJava
36
-
37
- val expected1 = Map[EType, LogicalTypeHandler](
38
- Types.TIMESTAMP -> TimestampMillisLogicalTypeHandler,
39
- )
40
- val expected2 = Map[String, LogicalTypeHandler](
41
- "col1" -> TimestampMicrosLogicalTypeHandler,
42
- )
43
-
44
- val rv = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
45
-
46
- assert(rv.fromEmbulkType == expected1)
47
- assert(rv.fromColumnName == expected2)
48
- }
151
+ override def validate(): Unit = {}
49
152
 
50
- test("fromEmbulkOptions() raises ConfigException if invalid option tasks given") {
51
- val emptyTypeOpts = Map.empty[String, TypeOptionTask].asJava
52
- val emptyColumnOpts = Map.empty[String, ColumnOptionTask].asJava
53
-
54
- val invalidTypeOpts = Map[String, TypeOptionTask](
55
- "unknown-embulk-type-name" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
56
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
57
- ).asJava
58
- val invalidColumnOpts = Map[String, ColumnOptionTask](
59
- "col1" -> DummyColumnOptionTask(Optional.of[String]("unknown-parquet-logical-type-name")),
60
- ).asJava
61
-
62
- val try1 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, emptyColumnOpts))
63
- assert(try1.isFailure)
64
- assert(try1.failed.get.isInstanceOf[ConfigException])
65
-
66
- val try2 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(emptyTypeOpts, invalidColumnOpts))
67
- assert(try2.isFailure)
68
- assert(try2.failed.get.isInstanceOf[ConfigException])
69
-
70
- val try3 = Try(LogicalTypeHandlerStore.fromEmbulkOptions(invalidTypeOpts, invalidColumnOpts))
71
- assert(try3.isFailure)
72
- assert(try3.failed.get.isInstanceOf[ConfigException])
153
+ override def dump(): TaskSource = {
154
+ null
73
155
  }
156
+ }
74
157
 
75
- test("get() returns a handler matched with primary column name condition") {
76
- val typeOpts = Map[String, TypeOptionTask](
77
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
78
- ).asJava
79
- val columnOpts = Map[String, ColumnOptionTask](
80
- "col1" -> DummyColumnOptionTask(Optional.of[String]("timestamp-micros")),
81
- ).asJava
158
+ private case class DummyColumnOptionTask(lt: Optional[String])
159
+ extends ColumnOptionTask {
82
160
 
83
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
84
-
85
- // It matches both of column name and embulk type, and column name should be primary
86
- val expected = Some(TimestampMicrosLogicalTypeHandler)
87
- val actual = handlers.get("col1", Types.TIMESTAMP)
88
-
89
- assert(actual == expected)
161
+ override def getTimeZoneId: GOptional[String] = {
162
+ GOptional.absent[String]
90
163
  }
91
164
 
92
- test("get() returns a handler matched with type name condition") {
93
- val typeOpts = Map[String, TypeOptionTask](
94
- "timestamp" -> DummyTypeOptionTask(Optional.of[String]("timestamp-millis")),
95
- ).asJava
96
- val columnOpts = Map.empty[String, ColumnOptionTask].asJava
97
-
98
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
99
-
100
- // It matches column name
101
- val expected = Some(TimestampMillisLogicalTypeHandler)
102
- val actual = handlers.get("col1", Types.TIMESTAMP)
103
-
104
- assert(actual == expected)
165
+ override def getFormat: GOptional[String] = {
166
+ GOptional.absent[String]
105
167
  }
106
168
 
107
- test("get() returns None if not matched") {
108
- val typeOpts = Map.empty[String, TypeOptionTask].asJava
109
- val columnOpts = Map.empty[String, ColumnOptionTask].asJava
110
-
111
- val handlers = LogicalTypeHandlerStore.fromEmbulkOptions(typeOpts, columnOpts)
112
-
113
- // It matches embulk type
114
- val actual = handlers.get("col1", Types.TIMESTAMP)
115
-
116
- assert(actual.isEmpty)
169
+ override def getLogicalType: Optional[String] = {
170
+ lt
117
171
  }
118
172
 
119
- private case class DummyTypeOptionTask(lt: Optional[String])
120
- extends TypeOptionTask
121
- {
122
- override def getLogicalType: Optional[String] =
123
- {
124
- lt
125
- }
126
-
127
- override def validate(): Unit =
128
- {}
129
-
130
- override def dump(): TaskSource =
131
- {
132
- null
133
- }
134
- }
173
+ override def validate(): Unit = {}
135
174
 
136
- private case class DummyColumnOptionTask(lt: Optional[String])
137
- extends ColumnOptionTask
138
- {
139
- override def getTimeZoneId: GOptional[String] =
140
- {
141
- GOptional.absent[String]
142
- }
143
-
144
- override def getFormat: GOptional[String] =
145
- {
146
- GOptional.absent[String]
147
- }
148
-
149
- override def getLogicalType: Optional[String] =
150
- {
151
- lt
152
- }
153
-
154
- override def validate(): Unit =
155
- {}
156
-
157
- override def dump(): TaskSource =
158
- {
159
- null
160
- }
175
+ override def dump(): TaskSource = {
176
+ null
161
177
  }
178
+ }
162
179
  }