embulk-output-s3_parquet 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.apache.parquet.schema.{Type => PType}
@@ -11,135 +10,151 @@ import org.embulk.spi.`type`.Types
 import org.embulk.spi.time.Timestamp
 import org.msgpack.value.Value
 
-
 /**
-sealed trait LogicalTypeHandler
-    def consume(orig: Any,
-                recordConsumer: RecordConsumer): Unit
+  * Handle Apache Parquet 'Logical Types' on schema/value conversion.
+  * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+  *
+  * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
+  * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
+  *
+  */
+sealed trait LogicalTypeHandler {
+  def isConvertible(t: EType): Boolean
+
+  def newSchemaFieldType(name: String): PrimitiveType
+
+  def consume(orig: Any, recordConsumer: RecordConsumer): Unit
 }
 
 abstract class IntLogicalTypeHandler(ot: OriginalType)
-    extends LogicalTypeHandler
+    extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.LONG
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      ot
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case v: Long => recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is long"
+        )
     }
+  }
 }
 
-object TimestampMillisLogicalTypeHandler
-{
+object TimestampMillisLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MILLIS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object TimestampMicrosLogicalTypeHandler
-{
+object TimestampMicrosLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MICROS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp =>
+        val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano
+          .asInstanceOf[Long] / 1_000L)
+        recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object Int8LogicalTypeHandler
+object Int8LogicalTypeHandler extends IntLogicalTypeHandler(OriginalType.INT_8)
+
 object Int16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_16)
+
 object Int32LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.INT_32)
+
 object Int64LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.INT_64)
 
 object Uint8LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_8)
+
 object Uint16LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_16)
+
 object Uint32LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_32)
+
 object Uint64LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_64)
 
-object JsonLogicalTypeHandler
-{
+object JsonLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.JSON
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.BINARY,
+      name,
+      OriginalType.JSON
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case msgPack: Value =>
+        val bin = Binary.fromString(msgPack.toJson)
+        recordConsumer.addBinary(bin)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is json"
+        )
     }
+  }
 }
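Note on the reworked TimestampMicrosLogicalTypeHandler above: it derives epoch microseconds from the Embulk Timestamp's epoch seconds and nano-of-second. Below is a minimal standalone sketch of the same arithmetic, using java.time.Instant in place of org.embulk.spi.time.Timestamp; the object name EpochMicrosExample and the sample instant are illustrative only, not part of the plugin.

import java.time.Instant

object EpochMicrosExample {

  // Epoch seconds scaled to microseconds, plus nano-of-second truncated to microseconds,
  // mirroring the conversion performed in TimestampMicrosLogicalTypeHandler.consume above.
  def toEpochMicros(ts: Instant): Long =
    (ts.getEpochSecond * 1000000L) + (ts.getNano.toLong / 1000L)

  def main(args: Array[String]): Unit = {
    val ts = Instant.parse("2020-01-23T04:56:07.890123456Z")
    println(toEpochMicros(ts)) // 1579755367890123
  }
}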
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala

@@ -1,107 +1,114 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.embulk.spi.`type`.{Type, Types}
 import java.util.{Map => JMap}
 
 import org.embulk.config.ConfigException
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  TypeOptionTask
+}
 
 import scala.jdk.CollectionConverters._
 
-
 /**
-case class LogicalTypeHandlerStore private(
+  * A storage has mapping from logical type query (column name, type) to handler.
+  *
+  * @param fromEmbulkType
+  * @param fromColumnName
+  */
+case class LogicalTypeHandlerStore private (
+    fromEmbulkType: Map[Type, LogicalTypeHandler],
+    fromColumnName: Map[String, LogicalTypeHandler]
+) {
 
-        case Some(h) => Some(h)
-        case _ => None
-    }
+  // Try column name lookup, then column type
+  def get(n: String, t: Type): Option[LogicalTypeHandler] = {
+    get(n) match {
+      case Some(h) => Some(h)
+      case _ =>
+        get(t) match {
+          case Some(h) => Some(h)
+          case _ => None
        }
    }
+  }
 
-    }
+  def get(t: Type): Option[LogicalTypeHandler] = {
+    fromEmbulkType.get(t)
+  }
 
-    }
+  def get(n: String): Option[LogicalTypeHandler] = {
+    fromColumnName.get(n)
+  }
 }
 
-object LogicalTypeHandlerStore
-{
-    private val STRING_TO_EMBULK_TYPE = Map[String, Type](
-        "boolean" -> Types.BOOLEAN,
-        "long" -> Types.LONG,
-        "double" -> Types.DOUBLE,
-        "string" -> Types.STRING,
-        "timestamp" -> Types.TIMESTAMP,
-        "json" -> Types.JSON
-    )
+object LogicalTypeHandlerStore {
 
-        "uint8" -> Uint8LogicalTypeHandler,
-        "uint16" -> Uint16LogicalTypeHandler,
-        "uint32" -> Uint32LogicalTypeHandler,
-        "uint64" -> Uint64LogicalTypeHandler,
-        "json" -> JsonLogicalTypeHandler
-    )
+  private val STRING_TO_EMBULK_TYPE = Map[String, Type](
+    "boolean" -> Types.BOOLEAN,
+    "long" -> Types.LONG,
+    "double" -> Types.DOUBLE,
+    "string" -> Types.STRING,
+    "timestamp" -> Types.TIMESTAMP,
+    "json" -> Types.JSON
+  )
 
+  // Listed only older logical types that we can convert from embulk type
+  private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
+    "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
+    "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
+    "int8" -> Int8LogicalTypeHandler,
+    "int16" -> Int16LogicalTypeHandler,
+    "int32" -> Int32LogicalTypeHandler,
+    "int64" -> Int64LogicalTypeHandler,
+    "uint8" -> Uint8LogicalTypeHandler,
+    "uint16" -> Uint16LogicalTypeHandler,
+    "uint32" -> Uint32LogicalTypeHandler,
+    "uint64" -> Uint64LogicalTypeHandler,
+    "json" -> JsonLogicalTypeHandler
+  )
 
-        val t = STRING_TO_EMBULK_TYPE.get(k)
-        val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-        (t, h) match {
-            case (Some(tt), Some(hh)) => (tt, hh)
-            case _ => throw new ConfigException("invalid logical types in type_options")
-        }
-    }
-    .toMap
+  def empty: LogicalTypeHandlerStore = {
+    LogicalTypeHandlerStore(
+      Map.empty[Type, LogicalTypeHandler],
+      Map.empty[String, LogicalTypeHandler]
+    )
+  }
 
+  def fromEmbulkOptions(
+      typeOpts: JMap[String, TypeOptionTask],
+      columnOpts: JMap[String, ColumnOptionTask]
+  ): LogicalTypeHandlerStore = {
+    val fromEmbulkType = typeOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[Type, LogicalTypeHandler] {
+        case (k, v) =>
+          val t = STRING_TO_EMBULK_TYPE.get(k)
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          (t, h) match {
+            case (Some(tt), Some(hh)) => (tt, hh)
+            case _ =>
+              throw new ConfigException("invalid logical types in type_options")
+          }
+      }
+      .toMap
 
+    val fromColumnName = columnOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[String, LogicalTypeHandler] {
+        case (k, v) =>
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          h match {
+            case Some(hh) => (k, hh)
+            case _ =>
+              throw new ConfigException(
+                "invalid logical types in column_options"
+              )
+          }
+      }
+      .toMap
+
+    LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
+  }
 }
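The reworked LogicalTypeHandlerStore above resolves a handler by column name first and falls back to the Embulk column type (see get(n: String, t: Type)); unknown logical type names in type_options or column_options raise a ConfigException. Below is a minimal sketch of that lookup precedence, using a simplified stand-in store with string labels rather than the plugin's handler objects; SimpleStore and the sample column names are illustrative only.

object HandlerLookupExample {

  // Simplified stand-in for LogicalTypeHandlerStore: maps to handler labels instead of handler objects.
  final case class SimpleStore(
      fromType: Map[String, String],  // Embulk type name -> handler label
      fromColumn: Map[String, String] // column name -> handler label
  ) {
    // Column-name mapping wins; otherwise fall back to the type mapping.
    def get(column: String, embulkType: String): Option[String] =
      fromColumn.get(column).orElse(fromType.get(embulkType))
  }

  def main(args: Array[String]): Unit = {
    val store = SimpleStore(
      fromType = Map("timestamp" -> "timestamp-millis"),
      fromColumn = Map("created_at" -> "timestamp-micros")
    )
    println(store.get("created_at", "timestamp")) // Some(timestamp-micros): column option wins
    println(store.get("updated_at", "timestamp")) // Some(timestamp-millis): falls back to type option
    println(store.get("id", "long"))              // None: no handler registered
  }
}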