embulk-output-s3_parquet 0.0.3 → 0.1.0
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +1 -0
- data/.github/workflows/release.yml +40 -0
- data/.github/workflows/test.yml +26 -0
- data/CHANGELOG.md +9 -0
- data/README.md +44 -7
- data/build.gradle +7 -8
- data/example/with_catalog.yml +36 -0
- data/example/with_logicaltypes.yml +31 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +29 -5
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +44 -6
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +11 -1
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +39 -11
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +145 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +107 -0
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +4 -2
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +51 -34
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +34 -29
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +78 -0
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +162 -0
- metadata +23 -14
data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala

@@ -3,7 +3,7 @@ package org.embulk.output.s3_parquet.aws

 import java.util.Optional

-import com.amazonaws.auth.{AnonymousAWSCredentials, AWSCredentialsProvider, AWSStaticCredentialsProvider, BasicAWSCredentials, BasicSessionCredentials, DefaultAWSCredentialsProviderChain, EC2ContainerCredentialsProviderWrapper, EnvironmentVariableCredentialsProvider, STSAssumeRoleSessionCredentialsProvider, SystemPropertiesCredentialsProvider}
+import com.amazonaws.auth.{AnonymousAWSCredentials, AWSCredentialsProvider, AWSStaticCredentialsProvider, BasicAWSCredentials, BasicSessionCredentials, DefaultAWSCredentialsProviderChain, EC2ContainerCredentialsProviderWrapper, EnvironmentVariableCredentialsProvider, STSAssumeRoleSessionCredentialsProvider, SystemPropertiesCredentialsProvider, WebIdentityTokenCredentialsProvider}
 import com.amazonaws.auth.profile.{ProfileCredentialsProvider, ProfilesConfigFile}
 import org.embulk.config.{Config, ConfigDefault, ConfigException}
 import org.embulk.output.s3_parquet.aws.AwsCredentials.Task

@@ -60,6 +60,9 @@ object AwsCredentials
         @ConfigDefault("null")
         def getScopeDownPolicy: Optional[String]

+        @Config("web_identity_token_file")
+        @ConfigDefault("null")
+        def getWebIdentityTokenFile: Optional[String]
     }

     def apply(task: Task): AwsCredentials =

@@ -119,6 +122,13 @@ class AwsCredentials(task: Task)

                 builder.build()

+            case "web_identity_token" =>
+                WebIdentityTokenCredentialsProvider.builder()
+                    .roleArn(getRequiredOption(task.getRoleArn, "role_arn"))
+                    .roleSessionName(getRequiredOption(task.getRoleSessionName, "role_session_name"))
+                    .webIdentityTokenFile(getRequiredOption(task.getWebIdentityTokenFile, "web_identity_token_file"))
+                    .build()
+
             case "default" =>
                 new DefaultAWSCredentialsProviderChain

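The hunks above add a `web_identity_token` auth method backed by the AWS SDK's WebIdentityTokenCredentialsProvider. A minimal sketch of the provider this branch constructs; the ARN, session name, and token file path are placeholder values standing in for the `role_arn`, `role_session_name`, and `web_identity_token_file` task options, and only the builder methods actually used in the diff appear here:

    import com.amazonaws.auth.WebIdentityTokenCredentialsProvider

    // Placeholder values; in the plugin they are read from the task configuration
    // via getRequiredOption(...), as shown in the hunk above.
    val provider = WebIdentityTokenCredentialsProvider.builder()
        .roleArn("arn:aws:iam::123456789012:role/example-role")                          // role_arn
        .roleSessionName("embulk-output-s3_parquet")                                     // role_session_name
        .webIdentityTokenFile("/var/run/secrets/eks.amazonaws.com/serviceaccount/token") // web_identity_token_file
        .build()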
data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala

@@ -16,30 +16,36 @@ object EmbulkMessageType
     }

     case class Builder(name: String = "embulk",
-                       schema: Schema = Schema.builder().build())
+                       schema: Schema = Schema.builder().build(),
+                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
     {

         def withName(name: String): Builder =
         {
-            Builder(name = name, schema = schema)
+            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
         }

         def withSchema(schema: Schema): Builder =
         {
-            Builder(name = name, schema = schema)
+            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
+        }
+
+        def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
+        {
+            Builder(name = name, schema = schema, logicalTypeHandlers = logicalTypeHandlers)
         }

         def build(): MessageType =
         {
             val builder: ImmutableList.Builder[Type] = ImmutableList.builder[Type]()
-            schema.visitColumns(EmbulkMessageTypeColumnVisitor(builder))
+            schema.visitColumns(EmbulkMessageTypeColumnVisitor(builder, logicalTypeHandlers))
             new MessageType("embulk", builder.build())
-
         }

     }

-    private case class EmbulkMessageTypeColumnVisitor(builder: ImmutableList.Builder[Type])
+    private case class EmbulkMessageTypeColumnVisitor(builder: ImmutableList.Builder[Type],
+                                                      logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
         extends ColumnVisitor
     {


@@ -50,7 +56,15 @@ object EmbulkMessageType

         override def longColumn(column: Column): Unit =
         {
-
+            val name = column.getName
+            val et = column.getType
+
+            val t = logicalTypeHandlers.get(name, et) match {
+                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, column.getName)
+            }
+
+            builder.add(t)
         }

         override def doubleColumn(column: Column): Unit =

@@ -65,14 +79,28 @@ object EmbulkMessageType

         override def timestampColumn(column: Column): Unit =
         {
-
-
+            val name = column.getName
+            val et = column.getType
+
+            val t = logicalTypeHandlers.get(name, et) match {
+                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
+            }
+
+            builder.add(t)
         }

         override def jsonColumn(column: Column): Unit =
         {
-
-
+            val name = column.getName
+            val et = column.getType
+
+            val t = logicalTypeHandlers.get(name, et) match {
+                case Some(h) if h.isConvertible(et) => h.newSchemaFieldType(name)
+                case _ => new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.UTF8)
+            }
+
+            builder.add(t)
         }
     }

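A short sketch of how the extended builder is meant to be driven end to end; `schema` and `handlers` are assumed inputs built elsewhere (for example from `type_options`/`column_options`), and the method names are exactly those added above. Each column visitor first asks the handler store for an override and otherwise falls back to the default primitive mapping.

    import org.apache.parquet.schema.MessageType
    import org.embulk.output.s3_parquet.parquet.{EmbulkMessageType, LogicalTypeHandlerStore}
    import org.embulk.spi.Schema

    // Build the Parquet schema for an Embulk schema, letting registered
    // logical-type handlers override the default column mapping.
    def buildMessageType(schema: Schema, handlers: LogicalTypeHandlerStore): MessageType =
        EmbulkMessageType.builder()
            .withName("embulk")
            .withSchema(schema)
            .withLogicalTypeHandlers(handlers)
            .build()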
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala (new file)

@@ -0,0 +1,145 @@
+package org.embulk.output.s3_parquet.parquet
+
+
+import org.apache.parquet.io.api.{Binary, RecordConsumer}
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
+import org.apache.parquet.schema.{Type => PType}
+import org.apache.parquet.schema.{OriginalType, PrimitiveType}
+import org.embulk.spi.DataException
+import org.embulk.spi.`type`.{Type => EType}
+import org.embulk.spi.`type`.Types
+import org.embulk.spi.time.Timestamp
+import org.msgpack.value.Value
+
+
+/**
+ * Handle Apache Parquet 'Logical Types' on schema/value conversion.
+ * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ *
+ * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
+ * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
+ *
+ */
+sealed trait LogicalTypeHandler
+{
+    def isConvertible(t: EType): Boolean
+
+    def newSchemaFieldType(name: String): PrimitiveType
+
+    def consume(orig: Any,
+                recordConsumer: RecordConsumer): Unit
+}
+
+abstract class IntLogicalTypeHandler(ot: OriginalType)
+    extends LogicalTypeHandler
+{
+    override def isConvertible(t: EType): Boolean =
+    {
+        t == Types.LONG
+    }
+
+    override def newSchemaFieldType(name: String): PrimitiveType =
+    {
+        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, ot)
+    }
+
+    override def consume(orig: Any,
+                         recordConsumer: RecordConsumer): Unit =
+    {
+        orig match {
+            case v: Long => recordConsumer.addLong(v)
+            case _ => throw new DataException("given mismatched type value; expected type is long")
+        }
+    }
+}
+
+object TimestampMillisLogicalTypeHandler
+    extends LogicalTypeHandler
+{
+    override def isConvertible(t: EType): Boolean =
+    {
+        t == Types.TIMESTAMP
+    }
+
+    override def newSchemaFieldType(name: String): PrimitiveType =
+    {
+        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MILLIS)
+    }
+
+    override def consume(orig: Any,
+                         recordConsumer: RecordConsumer): Unit =
+    {
+        orig match {
+            case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
+            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
+        }
+    }
+}
+
+object TimestampMicrosLogicalTypeHandler
+    extends LogicalTypeHandler
+{
+    override def isConvertible(t: EType): Boolean =
+    {
+        t == Types.TIMESTAMP
+    }
+
+    override def newSchemaFieldType(name: String): PrimitiveType =
+    {
+        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MICROS)
+    }
+
+    override def consume(orig: Any,
+                         recordConsumer: RecordConsumer): Unit =
+    {
+        orig match {
+            case ts: Timestamp =>
+                val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano.asInstanceOf[Long] / 1_000L)
+                recordConsumer.addLong(v)
+            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
+        }
+    }
+}
+
+object Int8LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.INT_8)
+object Int16LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.INT_16)
+object Int32LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.INT_32)
+object Int64LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.INT_64)
+
+object Uint8LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.UINT_8)
+object Uint16LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.UINT_16)
+object Uint32LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.UINT_32)
+object Uint64LogicalTypeHandler
+    extends IntLogicalTypeHandler(OriginalType.UINT_64)
+
+object JsonLogicalTypeHandler
+    extends LogicalTypeHandler
+{
+    override def isConvertible(t: EType): Boolean =
+    {
+        t == Types.JSON
+    }
+
+    override def newSchemaFieldType(name: String): PrimitiveType =
+    {
+        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.JSON)
+    }
+
+    override def consume(orig: Any,
+                         recordConsumer: RecordConsumer): Unit =
+    {
+        orig match {
+            case msgPack: Value =>
+                val bin = Binary.fromString(msgPack.toJson)
+                recordConsumer.addBinary(bin)
+            case _ => throw new DataException("given mismatched type value; expected type is json")
+        }
+    }
+}
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala (new file)

@@ -0,0 +1,107 @@
+package org.embulk.output.s3_parquet.parquet
+
+
+import org.embulk.spi.`type`.{Type, Types}
+import java.util.{Map => JMap}
+
+import org.embulk.config.ConfigException
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, TypeOptionTask}
+
+import scala.jdk.CollectionConverters._
+
+
+/**
+ * A storage has mapping from logical type query (column name, type) to handler.
+ *
+ * @param fromEmbulkType
+ * @param fromColumnName
+ */
+case class LogicalTypeHandlerStore private(fromEmbulkType: Map[Type, LogicalTypeHandler],
+                                           fromColumnName: Map[String, LogicalTypeHandler])
+{
+
+    // Try column name lookup, then column type
+    def get(n: String,
+            t: Type): Option[LogicalTypeHandler] =
+    {
+        get(n) match {
+            case Some(h) => Some(h)
+            case _ =>
+                get(t) match {
+                    case Some(h) => Some(h)
+                    case _ => None
+                }
+        }
+    }
+
+    def get(t: Type): Option[LogicalTypeHandler] =
+    {
+        fromEmbulkType.get(t)
+    }
+
+    def get(n: String): Option[LogicalTypeHandler] =
+    {
+        fromColumnName.get(n)
+    }
+}
+
+object LogicalTypeHandlerStore
+{
+    private val STRING_TO_EMBULK_TYPE = Map[String, Type](
+        "boolean" -> Types.BOOLEAN,
+        "long" -> Types.LONG,
+        "double" -> Types.DOUBLE,
+        "string" -> Types.STRING,
+        "timestamp" -> Types.TIMESTAMP,
+        "json" -> Types.JSON
+    )
+
+    // Listed only older logical types that we can convert from embulk type
+    private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
+        "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
+        "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
+        "int8" -> Int8LogicalTypeHandler,
+        "int16" -> Int16LogicalTypeHandler,
+        "int32" -> Int32LogicalTypeHandler,
+        "int64" -> Int64LogicalTypeHandler,
+        "uint8" -> Uint8LogicalTypeHandler,
+        "uint16" -> Uint16LogicalTypeHandler,
+        "uint32" -> Uint32LogicalTypeHandler,
+        "uint64" -> Uint64LogicalTypeHandler,
+        "json" -> JsonLogicalTypeHandler
+    )
+
+    def empty: LogicalTypeHandlerStore =
+    {
+        LogicalTypeHandlerStore(Map.empty[Type, LogicalTypeHandler], Map.empty[String, LogicalTypeHandler])
+    }
+
+    def fromEmbulkOptions(typeOpts: JMap[String, TypeOptionTask],
+                          columnOpts: JMap[String, ColumnOptionTask]): LogicalTypeHandlerStore =
+    {
+        val fromEmbulkType = typeOpts.asScala
+            .filter(_._2.getLogicalType.isPresent)
+            .map[Type, LogicalTypeHandler] { case (k, v) =>
+                val t = STRING_TO_EMBULK_TYPE.get(k)
+                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+                (t, h) match {
+                    case (Some(tt), Some(hh)) => (tt, hh)
+                    case _ => throw new ConfigException("invalid logical types in type_options")
+                }
+            }
+            .toMap
+
+        val fromColumnName = columnOpts.asScala
+            .filter(_._2.getLogicalType.isPresent)
+            .map[String, LogicalTypeHandler] { case (k, v) =>
+                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+                h match {
+                    case Some(hh) => (k, hh)
+                    case _ => throw new ConfigException("invalid logical types in column_options")
+                }
+            }
+            .toMap
+
+        LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
+    }
+}
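The nested match in `get(n, t)` above is simply name-first, type-second resolution; a sketch of the equivalent lookup, using only what the diff defines (a handler registered under a column name takes precedence over one registered under an Embulk type):

    import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandler, LogicalTypeHandlerStore}
    import org.embulk.spi.`type`.Type

    // Equivalent to LogicalTypeHandlerStore.get(n, t): column_options beats type_options.
    def resolve(store: LogicalTypeHandlerStore, name: String, t: Type): Option[LogicalTypeHandler] =
        store.get(name).orElse(store.get(t))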
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala

@@ -13,7 +13,8 @@ import scala.jdk.CollectionConverters._


 private[parquet] case class ParquetFileWriteSupport(schema: Schema,
-                                                    timestampFormatters: Seq[TimestampFormatter])
+                                                    timestampFormatters: Seq[TimestampFormatter],
+                                                    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
     extends WriteSupport[PageReader]
 {


@@ -23,6 +24,7 @@ private[parquet] case class ParquetFileWriteSupport(schema: Schema,
     {
         val messageType: MessageType = EmbulkMessageType.builder()
             .withSchema(schema)
+            .withLogicalTypeHandlers(logicalTypeHandlers)
             .build()
         val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
         new WriteContext(messageType, metadata.asJava)

@@ -30,7 +32,7 @@ private[parquet] case class ParquetFileWriteSupport(schema: Schema,

     override def prepareForWrite(recordConsumer: RecordConsumer): Unit =
     {
-        currentParquetFileWriter = ParquetFileWriter(recordConsumer, schema, timestampFormatters)
+        currentParquetFileWriter = ParquetFileWriter(recordConsumer, schema, timestampFormatters, logicalTypeHandlers)
     }

     override def write(record: PageReader): Unit =
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala

@@ -15,52 +15,59 @@ object ParquetFileWriter

     case class Builder(path: Path = null,
                        schema: Schema = null,
-                       timestampFormatters: Seq[TimestampFormatter] = null)
+                       timestampFormatters: Seq[TimestampFormatter] = null,
+                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
         extends ParquetWriter.Builder[PageReader, Builder](path)
     {

-
-
-
-
+        def withPath(path: Path): Builder =
+        {
+            copy(path = path)
+        }

-
-
-
-
+        def withPath(pathString: String): Builder =
+        {
+            copy(path = new Path(pathString))
+        }

-
-
-
-
+        def withSchema(schema: Schema): Builder =
+        {
+            copy(schema = schema)
+        }

-
-
-
-
+        def withTimestampFormatters(timestampFormatters: Seq[TimestampFormatter]): Builder =
+        {
+            copy(timestampFormatters = timestampFormatters)
+        }

-
-
-
-
+        def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
+        {
+            copy(logicalTypeHandlers = logicalTypeHandlers)
+        }
+
+        override def self(): Builder =
+        {
+            this
+        }

         override def getWriteSupport(conf: Configuration): WriteSupport[PageReader] =
         {
-            ParquetFileWriteSupport(schema, timestampFormatters)
+            ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
         }
     }

-
-
-
-
+    def builder(): Builder =
+    {
+        Builder()
+    }

 }


 private[parquet] case class ParquetFileWriter(recordConsumer: RecordConsumer,
                                               schema: Schema,
-                                              timestampFormatters: Seq[TimestampFormatter])
+                                              timestampFormatters: Seq[TimestampFormatter],
+                                              logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
 {

     def write(record: PageReader): Unit =

@@ -117,11 +124,16 @@ private[parquet] case class ParquetFileWriter(recordConsumer: RecordConsumer,
     {
         nullOr(column, {
             withWriteFieldContext(column, {
-                // TODO: is a correct way to convert for parquet ?
                 val t = record.getTimestamp(column)
-
-
-
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                    case Some(h) =>
+                        h.consume(t, recordConsumer)
+                    case _ =>
+                        val ft = timestampFormatters(column.getIndex).format(t)
+                        val bin = Binary.fromString(ft)
+                        recordConsumer.addBinary(bin)
+                }
             })
         })
     }

@@ -130,10 +142,15 @@ private[parquet] case class ParquetFileWriter(recordConsumer: RecordConsumer,
     {
         nullOr(column, {
             withWriteFieldContext(column, {
-                // TODO: is a correct way to convert for parquet ?
                 val msgPack = record.getJson(column)
-
-
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                    case Some(h) =>
+                        h.consume(msgPack, recordConsumer)
+                    case _ =>
+                        val bin = Binary.fromString(msgPack.toJson)
+                        recordConsumer.addBinary(bin)
+                }
             })
         })
     }
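Putting the new builder methods together, a sketch of how a writer could be wired up with a handler store; the path, schema, formatters, and handlers are assumed inputs, the `with*` methods are the ones added above, and a real configuration would typically chain additional ParquetWriter.Builder settings as well:

    import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
    import org.embulk.spi.Schema
    import org.embulk.spi.time.TimestampFormatter

    // Assumed inputs: an Embulk schema, per-column timestamp formatters,
    // and a handler store built from type_options/column_options.
    def openWriter(schema: Schema,
                   formatters: Seq[TimestampFormatter],
                   handlers: LogicalTypeHandlerStore) =
        ParquetFileWriter.builder()
            .withPath("/tmp/example.parquet")   // placeholder destination path
            .withSchema(schema)
            .withTimestampFormatters(formatters)
            .withLogicalTypeHandlers(handlers)
            .build()                            // inherited from ParquetWriter.Builder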