embulk-output-s3_parquet 0.1.0 → 0.2.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.apache.parquet.schema.{Type => PType}
@@ -11,135 +10,151 @@ import org.embulk.spi.`type`.Types
 import org.embulk.spi.time.Timestamp
 import org.msgpack.value.Value
 
-
 /**
- * Handle Apache Parquet 'Logical Types' on schema/value conversion.
- * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
- *
- * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
- * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
- *
- */
-sealed trait LogicalTypeHandler
-{
-    def isConvertible(t: EType): Boolean
-
-    def newSchemaFieldType(name: String): PrimitiveType
-
-    def consume(orig: Any,
-                recordConsumer: RecordConsumer): Unit
+  * Handle Apache Parquet 'Logical Types' on schema/value conversion.
+  * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+  *
+  * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
+  * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
+  *
+  */
+sealed trait LogicalTypeHandler {
+  def isConvertible(t: EType): Boolean
+
+  def newSchemaFieldType(name: String): PrimitiveType
+
+  def consume(orig: Any, recordConsumer: RecordConsumer): Unit
 }
 
 abstract class IntLogicalTypeHandler(ot: OriginalType)
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.LONG
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, ot)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case v: Long => recordConsumer.addLong(v)
-            case _ => throw new DataException("given mismatched type value; expected type is long")
-        }
+    extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.LONG
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      ot
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case v: Long => recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is long"
+        )
     }
+  }
 }
 
-object TimestampMillisLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.TIMESTAMP
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MILLIS)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
-            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
-        }
+object TimestampMillisLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MILLIS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object TimestampMicrosLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.TIMESTAMP
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MICROS)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case ts: Timestamp =>
-                val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano.asInstanceOf[Long] / 1_000L)
-                recordConsumer.addLong(v)
-            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
-        }
+object TimestampMicrosLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MICROS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp =>
+        val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano
+          .asInstanceOf[Long] / 1_000L)
+        recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object Int8LogicalTypeHandler
-    extends IntLogicalTypeHandler(OriginalType.INT_8)
+object Int8LogicalTypeHandler extends IntLogicalTypeHandler(OriginalType.INT_8)
+
 object Int16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_16)
+
 object Int32LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_32)
+
 object Int64LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_64)
 
 object Uint8LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_8)
+
 object Uint16LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_16)
+
 object Uint32LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_32)
+
 object Uint64LogicalTypeHandler
    extends IntLogicalTypeHandler(OriginalType.UINT_64)
 
-object JsonLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.JSON
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.JSON)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case msgPack: Value =>
-                val bin = Binary.fromString(msgPack.toJson)
-                recordConsumer.addBinary(bin)
-            case _ => throw new DataException("given mismatched type value; expected type is json")
-        }
+object JsonLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.JSON
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.BINARY,
+      name,
+      OriginalType.JSON
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case msgPack: Value =>
+        val bin = Binary.fromString(msgPack.toJson)
+        recordConsumer.addBinary(bin)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is json"
+        )
     }
+  }
 }
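
The subtle line in this file is the microsecond conversion in TimestampMicrosLogicalTypeHandler: epoch seconds are widened to microseconds, and nanoseconds are truncated (not rounded) down to microseconds. A minimal standalone sketch of the same arithmetic, using java.time.Instant as a stand-in for org.embulk.spi.time.Timestamp (both expose getEpochSecond and getNano; the object name and sample value are illustrative, not from the plugin):

import java.time.Instant

object EpochMicrosDemo extends App {
  // Same arithmetic as TimestampMicrosLogicalTypeHandler.consume:
  // widen seconds to micros, truncate nanos down to micros.
  def toEpochMicros(ts: Instant): Long =
    (ts.getEpochSecond * 1_000_000L) + (ts.getNano.toLong / 1_000L)

  // The trailing 500 ns are truncated rather than rounded,
  // so the result ends in ...999999.
  val ts = Instant.parse("2019-12-31T23:59:59.999999500Z")
  assert(toEpochMicros(ts) == 1577836799999999L)
}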
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala

@@ -1,107 +1,114 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.embulk.spi.`type`.{Type, Types}
 import java.util.{Map => JMap}
 
 import org.embulk.config.ConfigException
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, TypeOptionTask}
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  TypeOptionTask
+}
 
 import scala.jdk.CollectionConverters._
 
-
 /**
- * A storage has mapping from logical type query (column name, type) to handler.
- *
- * @param fromEmbulkType
- * @param fromColumnName
- */
-case class LogicalTypeHandlerStore private(fromEmbulkType: Map[Type, LogicalTypeHandler],
-                                           fromColumnName: Map[String, LogicalTypeHandler])
-{
+  * A storage has mapping from logical type query (column name, type) to handler.
+  *
+  * @param fromEmbulkType
+  * @param fromColumnName
+  */
+case class LogicalTypeHandlerStore private (
+    fromEmbulkType: Map[Type, LogicalTypeHandler],
+    fromColumnName: Map[String, LogicalTypeHandler]
+) {
 
-    // Try column name lookup, then column type
-    def get(n: String,
-            t: Type): Option[LogicalTypeHandler] =
-    {
-        get(n) match {
-            case Some(h) => Some(h)
-            case _ =>
-                get(t) match {
-                    case Some(h) => Some(h)
-                    case _ => None
-                }
+  // Try column name lookup, then column type
+  def get(n: String, t: Type): Option[LogicalTypeHandler] = {
+    get(n) match {
+      case Some(h) => Some(h)
+      case _ =>
+        get(t) match {
+          case Some(h) => Some(h)
+          case _       => None
        }
    }
+  }
 
-    def get(t: Type): Option[LogicalTypeHandler] =
-    {
-        fromEmbulkType.get(t)
-    }
+  def get(t: Type): Option[LogicalTypeHandler] = {
+    fromEmbulkType.get(t)
+  }
 
-    def get(n: String): Option[LogicalTypeHandler] =
-    {
-        fromColumnName.get(n)
-    }
+  def get(n: String): Option[LogicalTypeHandler] = {
+    fromColumnName.get(n)
+  }
 }
 
-object LogicalTypeHandlerStore
-{
-    private val STRING_TO_EMBULK_TYPE = Map[String, Type](
-        "boolean" -> Types.BOOLEAN,
-        "long" -> Types.LONG,
-        "double" -> Types.DOUBLE,
-        "string" -> Types.STRING,
-        "timestamp" -> Types.TIMESTAMP,
-        "json" -> Types.JSON
-    )
+object LogicalTypeHandlerStore {
 
-    // Listed only older logical types that we can convert from embulk type
-    private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
-        "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
-        "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
-        "int8" -> Int8LogicalTypeHandler,
-        "int16" -> Int16LogicalTypeHandler,
-        "int32" -> Int32LogicalTypeHandler,
-        "int64" -> Int64LogicalTypeHandler,
-        "uint8" -> Uint8LogicalTypeHandler,
-        "uint16" -> Uint16LogicalTypeHandler,
-        "uint32" -> Uint32LogicalTypeHandler,
-        "uint64" -> Uint64LogicalTypeHandler,
-        "json" -> JsonLogicalTypeHandler
-    )
+  private val STRING_TO_EMBULK_TYPE = Map[String, Type](
+    "boolean" -> Types.BOOLEAN,
+    "long" -> Types.LONG,
+    "double" -> Types.DOUBLE,
+    "string" -> Types.STRING,
+    "timestamp" -> Types.TIMESTAMP,
+    "json" -> Types.JSON
+  )
 
-    def empty: LogicalTypeHandlerStore =
-    {
-        LogicalTypeHandlerStore(Map.empty[Type, LogicalTypeHandler], Map.empty[String, LogicalTypeHandler])
-    }
+  // Listed only older logical types that we can convert from embulk type
+  private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
+    "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
+    "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
+    "int8" -> Int8LogicalTypeHandler,
+    "int16" -> Int16LogicalTypeHandler,
+    "int32" -> Int32LogicalTypeHandler,
+    "int64" -> Int64LogicalTypeHandler,
+    "uint8" -> Uint8LogicalTypeHandler,
+    "uint16" -> Uint16LogicalTypeHandler,
+    "uint32" -> Uint32LogicalTypeHandler,
+    "uint64" -> Uint64LogicalTypeHandler,
+    "json" -> JsonLogicalTypeHandler
+  )
 
-    def fromEmbulkOptions(typeOpts: JMap[String, TypeOptionTask],
-                          columnOpts: JMap[String, ColumnOptionTask]): LogicalTypeHandlerStore =
-    {
-        val fromEmbulkType = typeOpts.asScala
-            .filter(_._2.getLogicalType.isPresent)
-            .map[Type, LogicalTypeHandler] { case (k, v) =>
-                val t = STRING_TO_EMBULK_TYPE.get(k)
-                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-                (t, h) match {
-                    case (Some(tt), Some(hh)) => (tt, hh)
-                    case _ => throw new ConfigException("invalid logical types in type_options")
-                }
-            }
-            .toMap
+  def empty: LogicalTypeHandlerStore = {
+    LogicalTypeHandlerStore(
+      Map.empty[Type, LogicalTypeHandler],
+      Map.empty[String, LogicalTypeHandler]
+    )
+  }
 
-        val fromColumnName = columnOpts.asScala
-            .filter(_._2.getLogicalType.isPresent)
-            .map[String, LogicalTypeHandler] { case (k, v) =>
-                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-                h match {
-                    case Some(hh) => (k, hh)
-                    case _ => throw new ConfigException("invalid logical types in column_options")
-                }
-            }
-            .toMap
+  def fromEmbulkOptions(
+      typeOpts: JMap[String, TypeOptionTask],
+      columnOpts: JMap[String, ColumnOptionTask]
+  ): LogicalTypeHandlerStore = {
+    val fromEmbulkType = typeOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[Type, LogicalTypeHandler] {
+        case (k, v) =>
+          val t = STRING_TO_EMBULK_TYPE.get(k)
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          (t, h) match {
+            case (Some(tt), Some(hh)) => (tt, hh)
+            case _ =>
+              throw new ConfigException("invalid logical types in type_options")
+          }
+      }
+      .toMap
 
-        LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
-    }
+    val fromColumnName = columnOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[String, LogicalTypeHandler] {
+        case (k, v) =>
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          h match {
+            case Some(hh) => (k, hh)
+            case _ =>
+              throw new ConfigException(
+                "invalid logical types in column_options"
+              )
+          }
+      }
+      .toMap
+
+    LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
+  }
 }
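
The two-step lookup in get(n, t) above is the behavioral contract of this class: a handler registered for a column name (column_options) always shadows one registered for an Embulk type (type_options). A self-contained sketch of that precedence rule, with stand-in names that are illustrative rather than part of the plugin:

object PrecedenceDemo extends App {
  // Stand-ins for the plugin's handler objects.
  sealed trait Handler
  case object Millis extends Handler
  case object Micros extends Handler

  // Mirrors LogicalTypeHandlerStore.get(n, t): try the column-name map
  // first, then fall back to the Embulk-type map.
  case class Store(byType: Map[String, Handler], byColumn: Map[String, Handler]) {
    def get(column: String, tpe: String): Option[Handler] =
      byColumn.get(column).orElse(byType.get(tpe))
  }

  val store = Store(
    byType = Map("timestamp" -> Millis),   // like type_options
    byColumn = Map("created_at" -> Micros) // like column_options
  )

  assert(store.get("created_at", "timestamp").contains(Micros)) // column rule wins
  assert(store.get("updated_at", "timestamp").contains(Millis)) // type fallback
  assert(store.get("note", "string").isEmpty)                   // no mapping
}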