embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.apache.parquet.schema.{Type => PType}
@@ -11,135 +10,151 @@ import org.embulk.spi.`type`.Types
 import org.embulk.spi.time.Timestamp
 import org.msgpack.value.Value
 
-
 /**
- * Handle Apache Parquet 'Logical Types' on schema/value conversion.
- * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
- *
- * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
- * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
- *
- */
-sealed trait LogicalTypeHandler
-{
-    def isConvertible(t: EType): Boolean
-
-    def newSchemaFieldType(name: String): PrimitiveType
-
-    def consume(orig: Any,
-                recordConsumer: RecordConsumer): Unit
+  * Handle Apache Parquet 'Logical Types' on schema/value conversion.
+  * ref. https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+  *
+  * It focuses on only older representation because newer supported since 1.11 is not used actually yet.
+  * TODO Support both of older and newer representation after 1.11+ is published and other middleware supports it.
+  *
+  */
+sealed trait LogicalTypeHandler {
+  def isConvertible(t: EType): Boolean
+
+  def newSchemaFieldType(name: String): PrimitiveType
+
+  def consume(orig: Any, recordConsumer: RecordConsumer): Unit
 }
 
 abstract class IntLogicalTypeHandler(ot: OriginalType)
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.LONG
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, ot)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case v: Long => recordConsumer.addLong(v)
-            case _ => throw new DataException("given mismatched type value; expected type is long")
-        }
+    extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.LONG
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      ot
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case v: Long => recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is long"
+        )
     }
+  }
 }
 
-object TimestampMillisLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.TIMESTAMP
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MILLIS)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
-            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
-        }
+object TimestampMillisLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MILLIS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp => recordConsumer.addLong(ts.toEpochMilli)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object TimestampMicrosLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.TIMESTAMP
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.INT64, name, OriginalType.TIMESTAMP_MICROS)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case ts: Timestamp =>
-                val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano.asInstanceOf[Long] / 1_000L)
-                recordConsumer.addLong(v)
-            case _ => throw new DataException("given mismatched type value; expected type is timestamp")
-        }
+object TimestampMicrosLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.TIMESTAMP
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.INT64,
+      name,
+      OriginalType.TIMESTAMP_MICROS
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case ts: Timestamp =>
+        val v = (ts.getEpochSecond * 1_000_000L) + (ts.getNano
+          .asInstanceOf[Long] / 1_000L)
+        recordConsumer.addLong(v)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is timestamp"
+        )
     }
+  }
 }
 
-object Int8LogicalTypeHandler
-    extends IntLogicalTypeHandler(OriginalType.INT_8)
+object Int8LogicalTypeHandler extends IntLogicalTypeHandler(OriginalType.INT_8)
+
 object Int16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_16)
+
 object Int32LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_32)
+
 object Int64LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.INT_64)
 
 object Uint8LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_8)
+
 object Uint16LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_16)
+
 object Uint32LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_32)
+
 object Uint64LogicalTypeHandler
     extends IntLogicalTypeHandler(OriginalType.UINT_64)
 
-object JsonLogicalTypeHandler
-    extends LogicalTypeHandler
-{
-    override def isConvertible(t: EType): Boolean =
-    {
-        t == Types.JSON
-    }
-
-    override def newSchemaFieldType(name: String): PrimitiveType =
-    {
-        new PrimitiveType(PType.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name, OriginalType.JSON)
-    }
-
-    override def consume(orig: Any,
-                         recordConsumer: RecordConsumer): Unit =
-    {
-        orig match {
-            case msgPack: Value =>
-                val bin = Binary.fromString(msgPack.toJson)
-                recordConsumer.addBinary(bin)
-            case _ => throw new DataException("given mismatched type value; expected type is json")
-        }
+object JsonLogicalTypeHandler extends LogicalTypeHandler {
+
+  override def isConvertible(t: EType): Boolean = {
+    t == Types.JSON
+  }
+
+  override def newSchemaFieldType(name: String): PrimitiveType = {
+    new PrimitiveType(
+      PType.Repetition.OPTIONAL,
+      PrimitiveTypeName.BINARY,
+      name,
+      OriginalType.JSON
+    )
+  }
+
+  override def consume(orig: Any, recordConsumer: RecordConsumer): Unit = {
+    orig match {
+      case msgPack: Value =>
+        val bin = Binary.fromString(msgPack.toJson)
+        recordConsumer.addBinary(bin)
+      case _ =>
+        throw new DataException(
+          "given mismatched type value; expected type is json"
+        )
     }
+  }
 }
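
Note: the one behavioral subtlety in this file is the timestamp-micros encoding, which the scalafmt pass above merely rewraps: an Embulk Timestamp is flattened to a single INT64 of microseconds since the epoch, with sub-microsecond nanos truncated rather than rounded. A minimal standalone sketch of that arithmetic (plain Scala, no Embulk dependency; EpochMicrosSketch and toEpochMicros are hypothetical names for illustration, not part of the plugin):

object EpochMicrosSketch {

  // Same arithmetic as TimestampMicrosLogicalTypeHandler.consume:
  // seconds scaled to micros, plus nanos truncated to micros.
  def toEpochMicros(epochSecond: Long, nano: Int): Long =
    (epochSecond * 1000000L) + (nano.toLong / 1000L)

  def main(args: Array[String]): Unit = {
    // 2019-12-03T10:15:30.123456789Z: the trailing 789 ns truncate away.
    val micros = toEpochMicros(1575368130L, 123456789)
    assert(micros == 1575368130123456L)
    println(micros)
  }
}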
data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala

@@ -1,107 +1,114 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.embulk.spi.`type`.{Type, Types}
 import java.util.{Map => JMap}
 
 import org.embulk.config.ConfigException
-import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{ColumnOptionTask, TypeOptionTask}
+import org.embulk.output.s3_parquet.S3ParquetOutputPlugin.{
+  ColumnOptionTask,
+  TypeOptionTask
+}
 
 import scala.jdk.CollectionConverters._
 
-
 /**
- * A storage has mapping from logical type query (column name, type) to handler.
- *
- * @param fromEmbulkType
- * @param fromColumnName
- */
-case class LogicalTypeHandlerStore private(fromEmbulkType: Map[Type, LogicalTypeHandler],
-                                           fromColumnName: Map[String, LogicalTypeHandler])
-{
+  * A storage has mapping from logical type query (column name, type) to handler.
+  *
+  * @param fromEmbulkType
+  * @param fromColumnName
+  */
+case class LogicalTypeHandlerStore private (
+    fromEmbulkType: Map[Type, LogicalTypeHandler],
+    fromColumnName: Map[String, LogicalTypeHandler]
+) {
 
-    // Try column name lookup, then column type
-    def get(n: String,
-            t: Type): Option[LogicalTypeHandler] =
-    {
-        get(n) match {
-            case Some(h) => Some(h)
-            case _ =>
-                get(t) match {
-                    case Some(h) => Some(h)
-                    case _ => None
-                }
+  // Try column name lookup, then column type
+  def get(n: String, t: Type): Option[LogicalTypeHandler] = {
+    get(n) match {
+      case Some(h) => Some(h)
+      case _ =>
+        get(t) match {
+          case Some(h) => Some(h)
+          case _ => None
        }
    }
+  }
 
-    def get(t: Type): Option[LogicalTypeHandler] =
-    {
-        fromEmbulkType.get(t)
-    }
+  def get(t: Type): Option[LogicalTypeHandler] = {
+    fromEmbulkType.get(t)
+  }
 
-    def get(n: String): Option[LogicalTypeHandler] =
-    {
-        fromColumnName.get(n)
-    }
+  def get(n: String): Option[LogicalTypeHandler] = {
+    fromColumnName.get(n)
+  }
 }
 
-object LogicalTypeHandlerStore
-{
-    private val STRING_TO_EMBULK_TYPE = Map[String, Type](
-        "boolean" -> Types.BOOLEAN,
-        "long" -> Types.LONG,
-        "double" -> Types.DOUBLE,
-        "string" -> Types.STRING,
-        "timestamp" -> Types.TIMESTAMP,
-        "json" -> Types.JSON
-    )
+object LogicalTypeHandlerStore {
 
-    // Listed only older logical types that we can convert from embulk type
-    private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
-        "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
-        "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
-        "int8" -> Int8LogicalTypeHandler,
-        "int16" -> Int16LogicalTypeHandler,
-        "int32" -> Int32LogicalTypeHandler,
-        "int64" -> Int64LogicalTypeHandler,
-        "uint8" -> Uint8LogicalTypeHandler,
-        "uint16" -> Uint16LogicalTypeHandler,
-        "uint32" -> Uint32LogicalTypeHandler,
-        "uint64" -> Uint64LogicalTypeHandler,
-        "json" -> JsonLogicalTypeHandler
-    )
+  private val STRING_TO_EMBULK_TYPE = Map[String, Type](
+    "boolean" -> Types.BOOLEAN,
+    "long" -> Types.LONG,
+    "double" -> Types.DOUBLE,
+    "string" -> Types.STRING,
+    "timestamp" -> Types.TIMESTAMP,
+    "json" -> Types.JSON
+  )
 
-    def empty: LogicalTypeHandlerStore =
-    {
-        LogicalTypeHandlerStore(Map.empty[Type, LogicalTypeHandler], Map.empty[String, LogicalTypeHandler])
-    }
+  // Listed only older logical types that we can convert from embulk type
+  private val STRING_TO_LOGICAL_TYPE = Map[String, LogicalTypeHandler](
+    "timestamp-millis" -> TimestampMillisLogicalTypeHandler,
+    "timestamp-micros" -> TimestampMicrosLogicalTypeHandler,
+    "int8" -> Int8LogicalTypeHandler,
+    "int16" -> Int16LogicalTypeHandler,
+    "int32" -> Int32LogicalTypeHandler,
+    "int64" -> Int64LogicalTypeHandler,
+    "uint8" -> Uint8LogicalTypeHandler,
+    "uint16" -> Uint16LogicalTypeHandler,
+    "uint32" -> Uint32LogicalTypeHandler,
+    "uint64" -> Uint64LogicalTypeHandler,
+    "json" -> JsonLogicalTypeHandler
+  )
 
-    def fromEmbulkOptions(typeOpts: JMap[String, TypeOptionTask],
-                          columnOpts: JMap[String, ColumnOptionTask]): LogicalTypeHandlerStore =
-    {
-        val fromEmbulkType = typeOpts.asScala
-            .filter(_._2.getLogicalType.isPresent)
-            .map[Type, LogicalTypeHandler] { case (k, v) =>
-                val t = STRING_TO_EMBULK_TYPE.get(k)
-                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-                (t, h) match {
-                    case (Some(tt), Some(hh)) => (tt, hh)
-                    case _ => throw new ConfigException("invalid logical types in type_options")
-                }
-            }
-            .toMap
+  def empty: LogicalTypeHandlerStore = {
+    LogicalTypeHandlerStore(
+      Map.empty[Type, LogicalTypeHandler],
+      Map.empty[String, LogicalTypeHandler]
+    )
+  }
 
-        val fromColumnName = columnOpts.asScala
-            .filter(_._2.getLogicalType.isPresent)
-            .map[String, LogicalTypeHandler] { case (k, v) =>
-                val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
-                h match {
-                    case Some(hh) => (k, hh)
-                    case _ => throw new ConfigException("invalid logical types in column_options")
-                }
-            }
-            .toMap
+  def fromEmbulkOptions(
+      typeOpts: JMap[String, TypeOptionTask],
+      columnOpts: JMap[String, ColumnOptionTask]
+  ): LogicalTypeHandlerStore = {
+    val fromEmbulkType = typeOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[Type, LogicalTypeHandler] {
+        case (k, v) =>
+          val t = STRING_TO_EMBULK_TYPE.get(k)
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          (t, h) match {
+            case (Some(tt), Some(hh)) => (tt, hh)
+            case _ =>
+              throw new ConfigException("invalid logical types in type_options")
+          }
+      }
+      .toMap
 
-        LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
-    }
+    val fromColumnName = columnOpts.asScala
+      .filter(_._2.getLogicalType.isPresent)
+      .map[String, LogicalTypeHandler] {
+        case (k, v) =>
+          val h = STRING_TO_LOGICAL_TYPE.get(v.getLogicalType.get)
+          h match {
+            case Some(hh) => (k, hh)
+            case _ =>
+              throw new ConfigException(
+                "invalid logical types in column_options"
+              )
+          }
+      }
+      .toMap
+
+    LogicalTypeHandlerStore(fromEmbulkType, fromColumnName)
+  }
 }
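
Note: as the get overloads above show, a lookup consults the column-name map (built from column_options) before the Embulk-type map (built from type_options). A minimal sketch of that precedence rule, using plain String stand-ins for the plugin's handler and task types (HandlerLookupSketch and lookup are illustrative names, not from the plugin):

object HandlerLookupSketch {

  // Mirrors LogicalTypeHandlerStore.get(n, t): per-column mappings win,
  // per-type mappings are the fallback. orElse is equivalent to the
  // nested matches in the store's implementation.
  def lookup(
      fromColumnName: Map[String, String],
      fromEmbulkType: Map[String, String],
      name: String,
      embulkType: String
  ): Option[String] =
    fromColumnName.get(name).orElse(fromEmbulkType.get(embulkType))

  def main(args: Array[String]): Unit = {
    val byName = Map("created_at" -> "timestamp-micros") // like column_options
    val byType = Map("timestamp" -> "timestamp-millis") // like type_options
    // The per-column option wins for "created_at"...
    assert(
      lookup(byName, byType, "created_at", "timestamp")
        .contains("timestamp-micros")
    )
    // ...while other timestamp columns fall back to the per-type option.
    assert(
      lookup(byName, byType, "updated_at", "timestamp")
        .contains("timestamp-millis")
    )
    println("ok")
  }
}

The string keys here ("timestamp-micros", "timestamp-millis") match the STRING_TO_LOGICAL_TYPE keys accepted by fromEmbulkOptions; per the diff, an unknown key there raises a ConfigException rather than being silently ignored.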