embulk-output-s3_parquet 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet

-
 import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.hadoop.api.WriteSupport
 import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
@@ -11,32 +10,34 @@ import org.embulk.spi.time.TimestampFormatter

 import scala.jdk.CollectionConverters._

-    [old lines 14-41 removed; text not captured in this diff view]
+private[parquet] case class ParquetFileWriteSupport(
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) extends WriteSupport[PageReader] {
+
+  private var currentParquetFileWriter: ParquetFileWriter = _
+
+  override def init(configuration: Configuration): WriteContext = {
+    val messageType: MessageType = EmbulkMessageType
+      .builder()
+      .withSchema(schema)
+      .withLogicalTypeHandlers(logicalTypeHandlers)
+      .build()
+    val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
+    new WriteContext(messageType, metadata.asJava)
+  }
+
+  override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
+    currentParquetFileWriter = ParquetFileWriter(
+      recordConsumer,
+      schema,
+      timestampFormatters,
+      logicalTypeHandlers
+    )
+  }
+
+  override def write(record: PageReader): Unit = {
+    currentParquetFileWriter.write(record)
+  }
 }
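For orientation, here is a minimal sketch (not part of the gem) of what the init() step above produces: it derives a Parquet MessageType from an Embulk Schema through the plugin's EmbulkMessageType builder. The package declaration, the object name, and the two example columns are assumptions made only for this illustration.

package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.schema.MessageType
import org.embulk.spi.Schema
import org.embulk.spi.`type`.Types

object MessageTypeSketch {

  // Hypothetical two-column Embulk schema, built only for this example.
  val schema: Schema = Schema
    .builder()
    .add("id", Types.LONG)
    .add("name", Types.STRING)
    .build()

  // Mirrors what ParquetFileWriteSupport#init does above: the Parquet schema is
  // derived from the Embulk schema, with optional logical-type overrides.
  val messageType: MessageType = EmbulkMessageType
    .builder()
    .withSchema(schema)
    .withLogicalTypeHandlers(LogicalTypeHandlerStore.empty)
    .build()
}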
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet

-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.parquet.hadoop.ParquetWriter
@@ -9,168 +8,160 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema}
 import org.embulk.spi.time.TimestampFormatter

+object ParquetFileWriter {
+
+  case class Builder(
+      path: Path = null,
+      schema: Schema = null,
+      timestampFormatters: Seq[TimestampFormatter] = null,
+      logicalTypeHandlers: LogicalTypeHandlerStore =
+        LogicalTypeHandlerStore.empty
+  ) extends ParquetWriter.Builder[PageReader, Builder](path) {

-
-
-
-    case class Builder(path: Path = null,
-                       schema: Schema = null,
-                       timestampFormatters: Seq[TimestampFormatter] = null,
-                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-        extends ParquetWriter.Builder[PageReader, Builder](path)
-    {
-
-        def withPath(path: Path): Builder =
-        {
-            copy(path = path)
-        }
-
-        def withPath(pathString: String): Builder =
-        {
-            copy(path = new Path(pathString))
-        }
-
-        def withSchema(schema: Schema): Builder =
-        {
-            copy(schema = schema)
-        }
-
-        def withTimestampFormatters(timestampFormatters: Seq[TimestampFormatter]): Builder =
-        {
-            copy(timestampFormatters = timestampFormatters)
-        }
-
-        def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
-        {
-            copy(logicalTypeHandlers = logicalTypeHandlers)
-        }
-
-        override def self(): Builder =
-        {
-            this
-        }
-
-        override def getWriteSupport(conf: Configuration): WriteSupport[PageReader] =
-        {
-            ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
-        }
+    def withPath(path: Path): Builder = {
+      copy(path = path)
     }

-    def
-
-        Builder()
+    def withPath(pathString: String): Builder = {
+      copy(path = new Path(pathString))
     }

-
+    def withSchema(schema: Schema): Builder = {
+      copy(schema = schema)
+    }
+
+    def withTimestampFormatters(
+        timestampFormatters: Seq[TimestampFormatter]
+    ): Builder = {
+      copy(timestampFormatters = timestampFormatters)
+    }

+    def withLogicalTypeHandlers(
+        logicalTypeHandlers: LogicalTypeHandlerStore
+    ): Builder = {
+      copy(logicalTypeHandlers = logicalTypeHandlers)
+    }

-
-
-
-                                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-{
+    override def self(): Builder = {
+      this
+    }

-    def
-
-
-
-        recordConsumer.endMessage()
+    override def getWriteSupport(
+        conf: Configuration
+    ): WriteSupport[PageReader] = {
+      ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
     }
+  }
+
+  def builder(): Builder = {
+    Builder()
+  }
+
+}
+
+private[parquet] case class ParquetFileWriter(
+    recordConsumer: RecordConsumer,
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) {
+
+  def write(record: PageReader): Unit = {
+    recordConsumer.startMessage()
+    writeRecord(record)
+    recordConsumer.endMessage()
+  }

-
-    {
-
-        schema.visitColumns(new ColumnVisitor()
-        {
-
-            override def booleanColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addBoolean(record.getBoolean(column))
-                    })
-                })
-            }
-
-            override def longColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addLong(record.getLong(column))
-                    })
-                })
-            }
-
-            override def doubleColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addDouble(record.getDouble(column))
-                    })
-                })
-            }
-
-            override def stringColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val bin = Binary.fromString(record.getString(column))
-                        recordConsumer.addBinary(bin)
-                    })
-                })
-            }
-
-            override def timestampColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val t = record.getTimestamp(column)
-
-                        logicalTypeHandlers.get(column.getName, column.getType) match {
-                            case Some(h) =>
-                                h.consume(t, recordConsumer)
-                            case _ =>
-                                val ft = timestampFormatters(column.getIndex).format(t)
-                                val bin = Binary.fromString(ft)
-                                recordConsumer.addBinary(bin)
-                        }
-                    })
-                })
-            }
-
-            override def jsonColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val msgPack = record.getJson(column)
-
-                        logicalTypeHandlers.get(column.getName, column.getType) match {
-                            case Some(h) =>
-                                h.consume(msgPack, recordConsumer)
-                            case _ =>
-                                val bin = Binary.fromString(msgPack.toJson)
-                                recordConsumer.addBinary(bin)
-                        }
-                    })
-                })
-            }
-
-            private def nullOr(column: Column,
-                               f: => Unit): Unit =
-            {
-                if (!record.isNull(column)) f
-            }
-
-            private def withWriteFieldContext(column: Column,
-                                              f: => Unit): Unit =
-            {
-                recordConsumer.startField(column.getName, column.getIndex)
-                f
-                recordConsumer.endField(column.getName, column.getIndex)
-            }
+  private def writeRecord(record: PageReader): Unit = {

+    schema.visitColumns(new ColumnVisitor() {
+
+      override def booleanColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addBoolean(record.getBoolean(column))
+          })
         })
+      }

-
+      override def longColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addLong(record.getLong(column))
+          })
+        })
+      }

-
+      override def doubleColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addDouble(record.getDouble(column))
+          })
+        })
+      }
+
+      override def stringColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            val bin = Binary.fromString(record.getString(column))
+            recordConsumer.addBinary(bin)
+          })
+        })
+      }
+
+      override def timestampColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val t = record.getTimestamp(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(t, recordConsumer)
+                  case _ =>
+                    val ft = timestampFormatters(column.getIndex).format(t)
+                    val bin = Binary.fromString(ft)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      override def jsonColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val msgPack = record.getJson(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(msgPack, recordConsumer)
+                  case _ =>
+                    val bin = Binary.fromString(msgPack.toJson)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      private def nullOr(column: Column, f: => Unit): Unit = {
+        if (!record.isNull(column)) f
+      }
+
+      private def withWriteFieldContext(column: Column, f: => Unit): Unit = {
+        recordConsumer.startField(column.getName, column.getIndex)
+        f
+        recordConsumer.endField(column.getName, column.getIndex)
+      }
+
+    })
+
+  }
+
+}
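The rewrite above is largely a scalafmt reformat: the fluent Builder API is unchanged, now wrapped in an explicit companion object. Below is a minimal sketch (not taken from the gem) of how that Builder can be driven; the caller package, the object name, and the idea that pathString points at a local temp file later uploaded to S3 are assumptions for illustration.

package org.embulk.output.s3_parquet

import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetWriter
import org.embulk.output.s3_parquet.parquet.{LogicalTypeHandlerStore, ParquetFileWriter}
import org.embulk.spi.{PageReader, Schema}
import org.embulk.spi.time.TimestampFormatter

object WriterSketch {

  // Opens a ParquetWriter[PageReader] through the Builder shown in the diff.
  def open(
      pathString: String,
      schema: Schema,
      timestampFormatters: Seq[TimestampFormatter]
  ): ParquetWriter[PageReader] =
    ParquetFileWriter
      .builder()
      .withPath(new Path(pathString))
      .withSchema(schema)
      .withTimestampFormatters(timestampFormatters)
      .withLogicalTypeHandlers(LogicalTypeHandlerStore.empty)
      .build()
}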
data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala

@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet

-
 import java.io.File
 import java.nio.file.FileSystems

@@ -17,138 +16,166 @@ import org.embulk.spi.OutputPlugin
 import org.embulk.test.{EmbulkTests, TestingEmbulk}
 import org.junit.Rule
 import org.junit.runner.RunWith
-import org.scalatest.{
+import org.scalatest.{
+  BeforeAndAfter,
+  BeforeAndAfterAll,
+  DiagrammedAssertions,
+  FunSuite
+}
 import org.scalatestplus.junit.JUnitRunner

 import scala.annotation.meta.getter
 import scala.jdk.CollectionConverters._

-
 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
     extends FunSuite
-    [old lines 30-85 removed; text not captured in this diff view]
-        val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-            .stripLineEnd
-            .split("\n")
-            .map(record => record.split("\t").toSeq)
-            .toSeq
-
-        inRecords.zipWithIndex.foreach {
-            case (record, recordIndex) =>
-                0.to(5).foreach { columnIndex =>
-                    val columnName = s"c$columnIndex"
-                    val inData: String = inRecords(recordIndex)(columnIndex)
-                    val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
-
-                    assert(outData === inData, s"record: $recordIndex, column: $columnName")
-                }
-        }
-    }
-
-    def readParquetFile(bucket: String,
-                        key: String): Seq[Map[String, String]] =
-    {
-        val createdParquetFile = embulk.createTempFile("in")
-        withLocalStackS3Client {s3 =>
-            val xfer = TransferManagerBuilder.standard()
-                .withS3Client(s3)
-                .build()
-            try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-            finally xfer.shutdownNow()
+    with BeforeAndAfter
+    with BeforeAndAfterAll
+    with DiagrammedAssertions {
+
+  val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+  val TEST_S3_ENDPOINT: String = "http://localhost:4572"
+  val TEST_S3_REGION: String = "us-east-1"
+  val TEST_S3_ACCESS_KEY_ID: String = "test"
+  val TEST_S3_SECRET_ACCESS_KEY: String = "test"
+  val TEST_BUCKET_NAME: String = "my-bucket"
+
+  @(Rule @getter)
+  val embulk: TestingEmbulk = TestingEmbulk
+    .builder()
+    .registerPlugin(
+      classOf[OutputPlugin],
+      "s3_parquet",
+      classOf[S3ParquetOutputPlugin]
+    )
+    .build()
+
+  before {
+    withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
+  }
+
+  after {
+    withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
+  }
+
+  def defaultOutConfig(): ConfigSource = {
+    embulk
+      .newConfig()
+      .set("type", "s3_parquet")
+      .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+      .set("bucket", TEST_BUCKET_NAME)
+      .set("path_prefix", "path/to/p")
+      .set("auth_method", "basic")
+      .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
+      .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
+      .set("path_style_access_enabled", true)
+      .set("default_timezone", "Asia/Tokyo")
+  }
+
+  test("first test") {
+    val inPath = toPath("in1.csv")
+    val outConfig = defaultOutConfig()
+
+    val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+    val outRecords: Seq[Map[String, String]] =
+      result.getOutputTaskReports.asScala
+        .map { tr =>
+          val b = tr.get(classOf[String], "bucket")
+          val k = tr.get(classOf[String], "key")
+          readParquetFile(b, k)
         }

-
-
-            .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-            .build()
-
-        def read(reader: ParquetReader[SimpleRecord],
-                 records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
-        {
-            val simpleRecord: SimpleRecord = reader.read()
-            if (simpleRecord != null) {
-                val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-                return read(reader, records :+ r)
-            }
-            records
+        .foldLeft(Seq[Map[String, String]]()) { (merged, records) =>
+          merged ++ records
         }

-
-
-
-
+    val inRecords: Seq[Seq[String]] = EmbulkTests
+      .readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+      .stripLineEnd
+      .split("\n")
+      .map(record => record.split("\t").toSeq)
+      .toSeq
+
+    inRecords.zipWithIndex.foreach {
+      case (record, recordIndex) =>
+        0.to(5).foreach { columnIndex =>
+          val columnName = s"c$columnIndex"
+          val inData: String = inRecords(recordIndex)(columnIndex)
+          val outData: String =
+            outRecords(recordIndex).getOrElse(columnName, "")

+          assert(
+            outData === inData,
+            s"record: $recordIndex, column: $columnName"
+          )
         }
    }
+  }
+
+  def readParquetFile(bucket: String, key: String): Seq[Map[String, String]] = {
+    val createdParquetFile = embulk.createTempFile("in")
+    withLocalStackS3Client { s3 =>
+      val xfer = TransferManagerBuilder
+        .standard()
+        .withS3Client(s3)
+        .build()
+      try xfer
+        .download(bucket, key, createdParquetFile.toFile)
+        .waitForCompletion()
+      finally xfer.shutdownNow()
+    }

-
-
-
-
+    val reader: ParquetReader[SimpleRecord] = ParquetReader
+      .builder(
+        new SimpleReadSupport(),
+        new HadoopPath(createdParquetFile.toString)
+      )
+      .build()
+
+    def read(
+        reader: ParquetReader[SimpleRecord],
+        records: Seq[Map[String, String]] = Seq()
+    ): Seq[Map[String, String]] = {
+      val simpleRecord: SimpleRecord = reader.read()
+      if (simpleRecord != null) {
+        val r: Map[String, String] = simpleRecord.getValues.asScala
+          .map(v => v.getName -> v.getValue.toString)
+          .toMap
+        return read(reader, records :+ r)
+      }
+      records
    }

-
-
-
-            .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(TEST_S3_ACCESS_KEY_ID, TEST_S3_SECRET_ACCESS_KEY)))
-            .withPathStyleAccessEnabled(true)
-            .build()
+    try read(reader)
+    finally {
+      reader.close()

-        try f(client)
-        finally client.shutdown()
    }
+  }
+
+  private def toPath(fileName: String) = {
+    val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+    FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+  }
+
+  private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
+    val client: AmazonS3 = AmazonS3ClientBuilder.standard
+      .withEndpointConfiguration(
+        new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION)
+      )
+      .withCredentials(
+        new AWSStaticCredentialsProvider(
+          new BasicAWSCredentials(
+            TEST_S3_ACCESS_KEY_ID,
+            TEST_S3_SECRET_ACCESS_KEY
+          )
+        )
+      )
+      .withPathStyleAccessEnabled(true)
+      .build()
+
+    try f(client)
+    finally client.shutdown()
+  }
 }
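The test above talks to a LocalStack-style S3 endpoint (see run_s3_local.sh and example/prepare_s3_bucket.sh in the file list). As a small standalone sketch of reusing that client wiring outside the suite, the snippet below lists whatever a run left in the bucket; the endpoint, region, credentials, bucket, and prefix merely mirror the test constants and defaultOutConfig() and are not part of the gem.

import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}

import scala.jdk.CollectionConverters._

object ListUploadedObjects extends App {

  // Same LocalStack wiring as withLocalStackS3Client in the test above.
  val s3: AmazonS3 = AmazonS3ClientBuilder.standard
    .withEndpointConfiguration(
      new EndpointConfiguration("http://localhost:4572", "us-east-1")
    )
    .withCredentials(
      new AWSStaticCredentialsProvider(new BasicAWSCredentials("test", "test"))
    )
    .withPathStyleAccessEnabled(true)
    .build()

  try {
    // List objects a run left under the prefix used by defaultOutConfig().
    s3.listObjectsV2("my-bucket", "path/to/")
      .getObjectSummaries
      .asScala
      .foreach(o => println(s"${o.getKey} (${o.getSize} bytes)"))
  } finally s3.shutdown()
}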