embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet

-
 import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.apache.parquet.schema.{Type => PType}
@@ -11,135 +10,151 @@ import org.embulk.spi.`type`.Types
 import org.embulk.spi.time.Timestamp
 import org.msgpack.value.Value

-
 /**
-sealed trait LogicalTypeHandler
-  def consume(orig: Any,
-              recordConsumer: RecordConsumer): Unit
+ * Handle Apache Parquet 'Logical Types' on schema/value conversion.
+ * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ *
+ * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
+ * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
+ *
+ */
+sealed trait LogicalTypeHandler {
+  def isConvertible(t: EType): Boolean
+
+  def newSchemaFieldType(name: String): PrimitiveType
+
+  def consume(orig: Any, recordConsumer: RecordConsumer): Unit
 }

 abstract class IntLogicalTypeHandler(ot: OriginalType)
-    extends LogicalTypeHandler
+    extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.LONG
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      ot
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case v: Long => recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is long"
+        )
     }
+  }
 }

-object TimestampMillisLogicalTypeHandler
-{
+object TimestampMillisLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MILLIS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }

-object TimestampMicrosLogicalTypeHandler
-{
+object TimestampMicrosLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MICROS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp =>
+        val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano
+          .asInstanceOf[Long] / 1_000L)
+        recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }

-object Int8LogicalTypeHandler
+object Int8LogicalTypeHandler extends IntLogicalTypeHandler(OriginalType.INT_8)
+
 object Int16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_16)
+
 object Int32LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_32)
+
 object Int64LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_64)

 object Uint8LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_8)
+
 object Uint16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_16)
+
 object Uint32LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_32)
+
 object Uint64LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_64)

-object JsonLogicalTypeHandler
-{
+object JsonLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.JSON
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.BINARY,
+      name,
+      OriginalType.JSON
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case msgPack: Value =>
+        val bin = Binary.fromString(msgPack.toJson)
+        recordConsumer.addBinary(bin)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is json"
+        )
     }
+  }
 }
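For context on what the reworked handlers emit, the sketch below mirrors the microsecond arithmetic that TimestampMicrosLogicalTypeHandler performs before calling RecordConsumer.addLong. It is standalone example code, not part of the plugin: java.time.Instant stands in for org.embulk.spi.time.Timestamp (both expose getEpochSecond/getNano), and TimestampMicrosSketch/toEpochMicros are illustrative names.

import java.time.Instant

// Standalone sketch (not plugin code): the same epoch-microsecond
// conversion used for the TIMESTAMP_MICROS logical type. Instant is a
// stand-in for the Embulk Timestamp, which has the same accessors.
object TimestampMicrosSketch {

  def toEpochMicros(ts: Instant): Long =
    (ts.getEpochSecond * 1_000_000L) + (ts.getNano.toLong / 1_000L)

  def main(args: Array[String]): Unit = {
    val ts = Instant.parse("2019-07-04T12:34:56.789012Z")
    // Prints the epoch seconds scaled by 1,000,000 plus the microsecond
    // fraction of the nanosecond field (789012 here).
    println(toEpochMicros(ts))
  }
}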
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala

@@ -1,107 +1,114 @@
 package org.embulk.output.s3_parquet.parquet

-
 import org.embulk.spi.`type`.{Type, Types}
 import java.util.{Map => JMap}

 import org.embulk.config.ConfigException
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  TypeOptionTask
+}

 import scala.jdk.CollectionConverters._

-
 /**
-case class LogicalTypeHandlerStore private(
+ * A storage has mapping from logical type query (column name, type) to handler.
+ *
+ * @param fromEmbulkType
+ * @param fromColumnName
+ */
+case class LogicalTypeHandlerStore private (
+    fromEmbulkType: Map[Type, LogicalTypeHandler],
+    fromColumnName: Map[String, LogicalTypeHandler]
+) {

-      case Some(h) => Some(h)
-      case _ => None
-    }
+  // Try column name lookup, then column type
+  def get(n: String, t: Type): Option[LogicalTypeHandler] = {
+    get(n) match {
+      case Some(h) => Some(h)
+      case _ =>
+        get(t) match {
+          case Some(h) => Some(h)
+          case _ => None
         }
     }
+  }

-  }
+  def get(t: Type): Option[LogicalTypeHandler] = {
+    fromEmbulkType.get(t)
+  }

-  }
+  def get(n: String): Option[LogicalTypeHandler] = {
+    fromColumnName.get(n)
+  }
 }

-object LogicalTypeHandlerStore
-{
-  private val STRING_TO_EMBULK_TYPE = Map[String, Type](
-    "boolean" -> Types.BOOLEAN,
-    "long" -> Types.LONG,
-    "double" -> Types.DOUBLE,
-    "string" -> Types.STRING,
-    "timestamp" -> Types.TIMESTAMP,
-    "json" -> Types.JSON
-  )
+object LogicalTypeHandlerStore {

-    "uint8" -> Uint8LogicalTypeHandler,
-    "uint16" -> Uint16LogicalTypeHandler,
-    "uint32" -> Uint32LogicalTypeHandler,
-    "uint64" -> Uint64LogicalTypeHandler,
-    "json" -> JsonLogicalTypeHandler
-  )
+  private val STRING_TO_EMBULK_TYPE = Map[String, Type](
+    "boolean" -> Types.BOOLEAN,
+    "long" -> Types.LONG,
+    "double" -> Types.DOUBLE,
+    "string" -> Types.STRING,
+    "timestamp" -> Types.TIMESTAMP,
+    "json" -> Types.JSON
+  )

+  // Listed only older logical types that we can convert from embulk type
+  private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
+    "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
+    "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
+    "int8" -> Int8LogicalTypeHandler,
+    "int16" -> Int16LogicalTypeHandler,
+    "int32" -> Int32LogicalTypeHandler,
+    "int64" -> Int64LogicalTypeHandler,
+    "uint8" -> Uint8LogicalTypeHandler,
+    "uint16" -> Uint16LogicalTypeHandler,
+    "uint32" -> Uint32LogicalTypeHandler,
+    "uint64" -> Uint64LogicalTypeHandler,
+    "json" -> JsonLogicalTypeHandler
+  )

-        val t = STRING_TO_EMBULK_TYPE.get(k)
-        val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-        (t, h) match {
-          case (Some(tt), Some(hh)) => (tt, hh)
-          case _ => throw new ConfigException("invalid logical types in type_options")
-        }
-      }
-      .toMap
+  def empty: LogicalTypeHandlerStore = {
+    LogicalTypeHandlerStore(
+      Map.empty[Type, LogicalTypeHandler],
+      Map.empty[String, LogicalTypeHandler]
+    )
+  }

+  def fromEmbulkOptions(
+      typeOpts: JMap[String, TypeOptionTask],
+      columnOpts: JMap[String, ColumnOptionTask]
+  ): LogicalTypeHandlerStore = {
+    val fromEmbulkType = typeOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[Type, LogicalTypeHandler] {
+        case (k, v) =>
+          val t = STRING_TO_EMBULK_TYPE.get(k)
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          (t, h) match {
+            case (Some(tt), Some(hh)) => (tt, hh)
+            case _ =>
+              throw new ConfigException("invalid logical types in type_options")
+          }
+      }
+      .toMap

+    val fromColumnName = columnOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[String, LogicalTypeHandler] {
+        case (k, v) =>
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          h match {
+            case Some(hh) => (k, hh)
+            case _ =>
+              throw new ConfigException(
+                "invalid logical types in column_options"
+              )
+          }
+      }
+      .toMap
+
+    LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
+  }
 }
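The lookup order implemented by LogicalTypeHandlerStore.get(n, t) above is: a per-column entry built from column_options wins over a per-type entry built from type_options. Below is a minimal standalone sketch of that precedence using plain Maps with string stand-ins for the handler objects; the resolve helper, the column names, and the example maps are illustrative, while the logical-type strings come from STRING_TO_LOGICAL_TYPE in the diff.

// Standalone sketch (not plugin code) of the column-name-first,
// then-column-type resolution order used by LogicalTypeHandlerStore.
object HandlerLookupSketch {

  def resolve[V](
      byColumn: Map[String, V],
      byType: Map[String, V]
  )(column: String, tpe: String): Option[V] =
    byColumn.get(column).orElse(byType.get(tpe))

  def main(args: Array[String]): Unit = {
    val byColumn = Map("created_at" -> "timestamp-micros") // from column_options
    val byType = Map("timestamp" -> "timestamp-millis")    // from type_options

    // The column-level option takes precedence: Some(timestamp-micros)
    println(resolve(byColumn, byType)("created_at", "timestamp"))
    // No column-level entry, so it falls back to the type-level option: Some(timestamp-millis)
    println(resolve(byColumn, byType)("updated_at", "timestamp"))
  }
}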