embulk-output-s3_parquet 0.1.0 → 0.2.0
This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +3 -0
- data/.github/workflows/test.yml +2 -0
- data/.scalafmt.conf +5 -0
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/build.gradle +19 -9
- data/example/config.yml +3 -1
- data/example/prepare_s3_bucket.sh +6 -0
- data/example/with_catalog.yml +3 -1
- data/example/with_logicaltypes.yml +3 -1
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -1
- data/gradlew +31 -20
- data/gradlew.bat +17 -1
- data/run_s3_local.sh +7 -0
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
- data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
- data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
- metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.hadoop.api.WriteSupport
 import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
@@ -11,32 +10,34 @@ import org.embulk.spi.time.TimestampFormatter
 
 import scala.jdk.CollectionConverters._
 
- [28 removed lines whose content is not rendered in the source diff]
+private[parquet] case class ParquetFileWriteSupport(
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) extends WriteSupport[PageReader] {
+
+  private var currentParquetFileWriter: ParquetFileWriter = _
+
+  override def init(configuration: Configuration): WriteContext = {
+    val messageType: MessageType = EmbulkMessageType
+      .builder()
+      .withSchema(schema)
+      .withLogicalTypeHandlers(logicalTypeHandlers)
+      .build()
+    val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
+    new WriteContext(messageType, metadata.asJava)
+  }
+
+  override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
+    currentParquetFileWriter = ParquetFileWriter(
+      recordConsumer,
+      schema,
+      timestampFormatters,
+      logicalTypeHandlers
+    )
+  }
+
+  override def write(record: PageReader): Unit = {
+    currentParquetFileWriter.write(record)
+  }
 }
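As orientation for the reworked WriteSupport above, here is a minimal, hypothetical sketch (not part of this diff; the object name and schema are made up) of how ParquetFileWriteSupport turns an Embulk schema into a Parquet MessageType through init(). Schema.builder() and Types come from the Embulk SPI, and because the case class is private[parquet] the sketch has to live in the same package:

package org.embulk.output.s3_parquet.parquet

import org.apache.hadoop.conf.Configuration
import org.embulk.spi.Schema
import org.embulk.spi.type.Types

object WriteSupportSketch {

  def main(args: Array[String]): Unit = {
    // Assumed Embulk schema; no timestamp columns, so no formatters are needed.
    val schema: Schema = Schema
      .builder()
      .add("id", Types.LONG)
      .add("name", Types.STRING)
      .build()

    val support = ParquetFileWriteSupport(schema, timestampFormatters = Seq.empty)

    // init() delegates to EmbulkMessageType to translate the Embulk schema;
    // the resulting Parquet MessageType travels inside the returned WriteContext.
    val writeContext = support.init(new Configuration())
    println(writeContext.getSchema) // prints the generated Parquet message type
  }
}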
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.parquet.hadoop.ParquetWriter
@@ -9,168 +8,160 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema}
 import org.embulk.spi.time.TimestampFormatter
 
+object ParquetFileWriter {
+
+  case class Builder(
+      path: Path = null,
+      schema: Schema = null,
+      timestampFormatters: Seq[TimestampFormatter] = null,
+      logicalTypeHandlers: LogicalTypeHandlerStore =
+        LogicalTypeHandlerStore.empty
+  ) extends ParquetWriter.Builder[PageReader, Builder](path) {
 
-
-
-
-  case class Builder(path: Path = null,
-                     schema: Schema = null,
-                     timestampFormatters: Seq[TimestampFormatter] = null,
-                     logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-    extends ParquetWriter.Builder[PageReader, Builder](path)
-  {
-
-    def withPath(path: Path): Builder =
-    {
-      copy(path = path)
-    }
-
-    def withPath(pathString: String): Builder =
-    {
-      copy(path = new Path(pathString))
-    }
-
-    def withSchema(schema: Schema): Builder =
-    {
-      copy(schema = schema)
-    }
-
-    def withTimestampFormatters(timestampFormatters: Seq[TimestampFormatter]): Builder =
-    {
-      copy(timestampFormatters = timestampFormatters)
-    }
-
-    def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
-    {
-      copy(logicalTypeHandlers = logicalTypeHandlers)
-    }
-
-    override def self(): Builder =
-    {
-      this
-    }
-
-    override def getWriteSupport(conf: Configuration): WriteSupport[PageReader] =
-    {
-      ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
-    }
+    def withPath(path: Path): Builder = {
+      copy(path = path)
     }
 
-    def
-
-      Builder()
+    def withPath(pathString: String): Builder = {
+      copy(path = new Path(pathString))
     }
 
-
+    def withSchema(schema: Schema): Builder = {
+      copy(schema = schema)
+    }
+
+    def withTimestampFormatters(
+        timestampFormatters: Seq[TimestampFormatter]
+    ): Builder = {
+      copy(timestampFormatters = timestampFormatters)
+    }
 
+    def withLogicalTypeHandlers(
+        logicalTypeHandlers: LogicalTypeHandlerStore
+    ): Builder = {
+      copy(logicalTypeHandlers = logicalTypeHandlers)
+    }
 
-
-
-
-    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-  {
+    override def self(): Builder = {
+      this
+    }
 
-    def
-
-
-
-      recordConsumer.endMessage()
+    override def getWriteSupport(
+        conf: Configuration
+    ): WriteSupport[PageReader] = {
+      ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
     }
+  }
+
+  def builder(): Builder = {
+    Builder()
+  }
+
+}
+
+private[parquet] case class ParquetFileWriter(
+    recordConsumer: RecordConsumer,
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) {
+
+  def write(record: PageReader): Unit = {
+    recordConsumer.startMessage()
+    writeRecord(record)
+    recordConsumer.endMessage()
+  }
 
-
-  {
-
-    schema.visitColumns(new ColumnVisitor()
-    {
-
-      override def booleanColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            recordConsumer.addBoolean(record.getBoolean(column))
-          })
-        })
-      }
-
-      override def longColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            recordConsumer.addLong(record.getLong(column))
-          })
-        })
-      }
-
-      override def doubleColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            recordConsumer.addDouble(record.getDouble(column))
-          })
-        })
-      }
-
-      override def stringColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            val bin = Binary.fromString(record.getString(column))
-            recordConsumer.addBinary(bin)
-          })
-        })
-      }
-
-      override def timestampColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            val t = record.getTimestamp(column)
-
-            logicalTypeHandlers.get(column.getName, column.getType) match {
-              case Some(h) =>
-                h.consume(t, recordConsumer)
-              case _ =>
-                val ft = timestampFormatters(column.getIndex).format(t)
-                val bin = Binary.fromString(ft)
-                recordConsumer.addBinary(bin)
-            }
-          })
-        })
-      }
-
-      override def jsonColumn(column: Column): Unit =
-      {
-        nullOr(column, {
-          withWriteFieldContext(column, {
-            val msgPack = record.getJson(column)
-
-            logicalTypeHandlers.get(column.getName, column.getType) match {
-              case Some(h) =>
-                h.consume(msgPack, recordConsumer)
-              case _ =>
-                val bin = Binary.fromString(msgPack.toJson)
-                recordConsumer.addBinary(bin)
-            }
-          })
-        })
-      }
-
-    private def nullOr(column: Column,
-                       f: => Unit): Unit =
-    {
-      if (!record.isNull(column)) f
-    }
-
-    private def withWriteFieldContext(column: Column,
-                                      f: => Unit): Unit =
-    {
-      recordConsumer.startField(column.getName, column.getIndex)
-      f
-      recordConsumer.endField(column.getName, column.getIndex)
-    }
+  private def writeRecord(record: PageReader): Unit = {
 
+    schema.visitColumns(new ColumnVisitor() {
+
+      override def booleanColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addBoolean(record.getBoolean(column))
+          })
         })
+      }
 
-
+      override def longColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addLong(record.getLong(column))
+          })
+        })
+      }
 
-
+      override def doubleColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addDouble(record.getDouble(column))
+          })
+        })
+      }
+
+      override def stringColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            val bin = Binary.fromString(record.getString(column))
+            recordConsumer.addBinary(bin)
+          })
+        })
+      }
+
+      override def timestampColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val t = record.getTimestamp(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(t, recordConsumer)
+                  case _ =>
+                    val ft = timestampFormatters(column.getIndex).format(t)
+                    val bin = Binary.fromString(ft)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      override def jsonColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val msgPack = record.getJson(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(msgPack, recordConsumer)
+                  case _ =>
+                    val bin = Binary.fromString(msgPack.toJson)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      private def nullOr(column: Column, f: => Unit): Unit = {
+        if (!record.isNull(column)) f
+      }
+
+      private def withWriteFieldContext(column: Column, f: => Unit): Unit = {
+        recordConsumer.startField(column.getName, column.getIndex)
+        f
+        recordConsumer.endField(column.getName, column.getIndex)
+      }
+
+    })
+
+  }
+
+}
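For the builder rework above, a hedged usage sketch (again not taken from the diff; the object name is hypothetical and schema, formatters, and the local path are assumed inputs) showing how the new ParquetFileWriter.builder() chain can be driven, in the spirit of how S3ParquetOutputPlugin opens a per-task writer. withConf() and build() are inherited from parquet-hadoop's ParquetWriter.Builder, and build() wires in ParquetFileWriteSupport through getWriteSupport():

package org.embulk.output.s3_parquet.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.ParquetWriter
import org.embulk.spi.{PageReader, Schema}
import org.embulk.spi.time.TimestampFormatter

object WriterBuilderSketch {

  def openWriter(
      schema: Schema,
      formatters: Seq[TimestampFormatter],
      localPath: String
  ): ParquetWriter[PageReader] =
    ParquetFileWriter
      .builder()
      .withPath(localPath) // local staging file; the plugin uploads the finished file to S3
      .withSchema(schema) // Embulk schema, converted via EmbulkMessageType
      .withTimestampFormatters(formatters)
      .withLogicalTypeHandlers(LogicalTypeHandlerStore.empty)
      .withConf(new Configuration()) // call last: the copy-based withX methods return fresh Builder instances
      .build()
}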
data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet
 
-
 import java.io.File
 import java.nio.file.FileSystems
 
@@ -17,138 +16,166 @@ import org.embulk.spi.OutputPlugin
 import org.embulk.test.{EmbulkTests, TestingEmbulk}
 import org.junit.Rule
 import org.junit.runner.RunWith
-import org.scalatest.{
+import org.scalatest.{
+  BeforeAndAfter,
+  BeforeAndAfterAll,
+  DiagrammedAssertions,
+  FunSuite
+}
 import org.scalatestplus.junit.JUnitRunner
 
 import scala.annotation.meta.getter
 import scala.jdk.CollectionConverters._
 
-
 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
     extends FunSuite
- [56 removed lines whose content is not rendered in the source diff]
-    val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-      .stripLineEnd
-      .split("\n")
-      .map(record => record.split("\t").toSeq)
-      .toSeq
-
-    inRecords.zipWithIndex.foreach {
-      case (record, recordIndex) =>
-        0.to(5).foreach { columnIndex =>
-          val columnName = s"c$columnIndex"
-          val inData: String = inRecords(recordIndex)(columnIndex)
-          val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
-
-          assert(outData === inData, s"record: $recordIndex, column: $columnName")
-        }
-    }
-  }
-
-  def readParquetFile(bucket: String,
-                      key: String): Seq[Map[String, String]] =
-  {
-    val createdParquetFile = embulk.createTempFile("in")
-    withLocalStackS3Client {s3 =>
-      val xfer = TransferManagerBuilder.standard()
-        .withS3Client(s3)
-        .build()
-      try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-      finally xfer.shutdownNow()
+    with BeforeAndAfter
+    with BeforeAndAfterAll
+    with DiagrammedAssertions {
+
+  val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+  val TEST_S3_ENDPOINT: String = "http://localhost:4572"
+  val TEST_S3_REGION: String = "us-east-1"
+  val TEST_S3_ACCESS_KEY_ID: String = "test"
+  val TEST_S3_SECRET_ACCESS_KEY: String = "test"
+  val TEST_BUCKET_NAME: String = "my-bucket"
+
+  @(Rule @getter)
+  val embulk: TestingEmbulk = TestingEmbulk
+    .builder()
+    .registerPlugin(
+      classOf[OutputPlugin],
+      "s3_parquet",
+      classOf[S3ParquetOutputPlugin]
+    )
+    .build()
+
+  before {
+    withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
+  }
+
+  after {
+    withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
+  }
+
+  def defaultOutConfig(): ConfigSource = {
+    embulk
+      .newConfig()
+      .set("type", "s3_parquet")
+      .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+      .set("bucket", TEST_BUCKET_NAME)
+      .set("path_prefix", "path/to/p")
+      .set("auth_method", "basic")
+      .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
+      .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
+      .set("path_style_access_enabled", true)
+      .set("default_timezone", "Asia/Tokyo")
+  }
+
+  test("first test") {
+    val inPath = toPath("in1.csv")
+    val outConfig = defaultOutConfig()
+
+    val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+    val outRecords: Seq[Map[String, String]] =
+      result.getOutputTaskReports.asScala
+        .map { tr =>
+          val b = tr.get(classOf[String], "bucket")
+          val k = tr.get(classOf[String], "key")
+          readParquetFile(b, k)
         }
-
-
-      .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-      .build()
-
-    def read(reader: ParquetReader[SimpleRecord],
-             records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
-    {
-      val simpleRecord: SimpleRecord = reader.read()
-      if (simpleRecord != null) {
-        val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-        return read(reader, records :+ r)
-      }
-      records
+        .foldLeft(Seq[Map[String, String]]()) { (merged, records) =>
+          merged ++ records
         }
 
-
-
-
-
+    val inRecords: Seq[Seq[String]] = EmbulkTests
+      .readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+      .stripLineEnd
+      .split("\n")
+      .map(record => record.split("\t").toSeq)
+      .toSeq
+
+    inRecords.zipWithIndex.foreach {
+      case (record, recordIndex) =>
+        0.to(5).foreach { columnIndex =>
+          val columnName = s"c$columnIndex"
+          val inData: String = inRecords(recordIndex)(columnIndex)
+          val outData: String =
+            outRecords(recordIndex).getOrElse(columnName, "")
+
+          assert(
+            outData === inData,
+            s"record: $recordIndex, column: $columnName"
+          )
         }
     }
+  }
+
+  def readParquetFile(bucket: String, key: String): Seq[Map[String, String]] = {
+    val createdParquetFile = embulk.createTempFile("in")
+    withLocalStackS3Client { s3 =>
+      val xfer = TransferManagerBuilder
+        .standard()
+        .withS3Client(s3)
+        .build()
+      try xfer
+        .download(bucket, key, createdParquetFile.toFile)
+        .waitForCompletion()
+      finally xfer.shutdownNow()
+    }
 
-
-
-
-
+    val reader: ParquetReader[SimpleRecord] = ParquetReader
+      .builder(
+        new SimpleReadSupport(),
+        new HadoopPath(createdParquetFile.toString)
+      )
+      .build()
+
+    def read(
+        reader: ParquetReader[SimpleRecord],
+        records: Seq[Map[String, String]] = Seq()
+    ): Seq[Map[String, String]] = {
+      val simpleRecord: SimpleRecord = reader.read()
+      if (simpleRecord != null) {
+        val r: Map[String, String] = simpleRecord.getValues.asScala
+          .map(v => v.getName -> v.getValue.toString)
+          .toMap
+        return read(reader, records :+ r)
+      }
+      records
     }
 
-
-
-
-      .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(TEST_S3_ACCESS_KEY_ID, TEST_S3_SECRET_ACCESS_KEY)))
-      .withPathStyleAccessEnabled(true)
-      .build()
+    try read(reader)
+    finally {
+      reader.close()
 
-    try f(client)
-    finally client.shutdown()
     }
+  }
+
+  private def toPath(fileName: String) = {
+    val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+    FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+  }
+
+  private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
+    val client: AmazonS3 = AmazonS3ClientBuilder.standard
+      .withEndpointConfiguration(
+        new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION)
+      )
+      .withCredentials(
+        new AWSStaticCredentialsProvider(
+          new BasicAWSCredentials(
+            TEST_S3_ACCESS_KEY_ID,
+            TEST_S3_SECRET_ACCESS_KEY
+          )
+        )
+      )
+      .withPathStyleAccessEnabled(true)
+      .build()
+
+    try f(client)
+    finally client.shutdown()
+  }
 }