embulk-output-s3_parquet 0.1.0 → 0.2.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.hadoop.api.WriteSupport
 import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
@@ -11,32 +10,34 @@ import org.embulk.spi.time.TimestampFormatter
 
 import scala.jdk.CollectionConverters._
 
-
-private[parquet] case class ParquetFileWriteSupport(schema: Schema,
-                                                    timestampFormatters: Seq[TimestampFormatter],
-                                                    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-    extends WriteSupport[PageReader]
-{
-
-    private var currentParquetFileWriter: ParquetFileWriter = _
-
-    override def init(configuration: Configuration): WriteContext =
-    {
-        val messageType: MessageType = EmbulkMessageType.builder()
-            .withSchema(schema)
-            .withLogicalTypeHandlers(logicalTypeHandlers)
-            .build()
-        val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
-        new WriteContext(messageType, metadata.asJava)
-    }
-
-    override def prepareForWrite(recordConsumer: RecordConsumer): Unit =
-    {
-        currentParquetFileWriter = ParquetFileWriter(recordConsumer, schema, timestampFormatters, logicalTypeHandlers)
-    }
-
-    override def write(record: PageReader): Unit =
-    {
-        currentParquetFileWriter.write(record)
-    }
+private[parquet] case class ParquetFileWriteSupport(
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) extends WriteSupport[PageReader] {
+
+  private var currentParquetFileWriter: ParquetFileWriter = _
+
+  override def init(configuration: Configuration): WriteContext = {
+    val messageType: MessageType = EmbulkMessageType
+      .builder()
+      .withSchema(schema)
+      .withLogicalTypeHandlers(logicalTypeHandlers)
+      .build()
+    val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
+    new WriteContext(messageType, metadata.asJava)
+  }
+
+  override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
+    currentParquetFileWriter = ParquetFileWriter(
+      recordConsumer,
+      schema,
+      timestampFormatters,
+      logicalTypeHandlers
+    )
+  }
+
+  override def write(record: PageReader): Unit = {
+    currentParquetFileWriter.write(record)
+  }
 }
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet.parquet
 
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.parquet.hadoop.ParquetWriter
@@ -9,168 +8,160 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer}
 import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema}
 import org.embulk.spi.time.TimestampFormatter
 
+object ParquetFileWriter {
+
+  case class Builder(
+      path: Path = null,
+      schema: Schema = null,
+      timestampFormatters: Seq[TimestampFormatter] = null,
+      logicalTypeHandlers: LogicalTypeHandlerStore =
+        LogicalTypeHandlerStore.empty
+  ) extends ParquetWriter.Builder[PageReader, Builder](path) {
 
-object ParquetFileWriter
-{
-
-    case class Builder(path: Path = null,
-                       schema: Schema = null,
-                       timestampFormatters: Seq[TimestampFormatter] = null,
-                       logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-        extends ParquetWriter.Builder[PageReader, Builder](path)
-    {
-
-        def withPath(path: Path): Builder =
-        {
-            copy(path = path)
-        }
-
-        def withPath(pathString: String): Builder =
-        {
-            copy(path = new Path(pathString))
-        }
-
-        def withSchema(schema: Schema): Builder =
-        {
-            copy(schema = schema)
-        }
-
-        def withTimestampFormatters(timestampFormatters: Seq[TimestampFormatter]): Builder =
-        {
-            copy(timestampFormatters = timestampFormatters)
-        }
-
-        def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
-        {
-            copy(logicalTypeHandlers = logicalTypeHandlers)
-        }
-
-        override def self(): Builder =
-        {
-            this
-        }
-
-        override def getWriteSupport(conf: Configuration): WriteSupport[PageReader] =
-        {
-            ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
-        }
+    def withPath(path: Path): Builder = {
+      copy(path = path)
     }
 
-        def builder(): Builder =
-        {
-            Builder()
+    def withPath(pathString: String): Builder = {
+      copy(path = new Path(pathString))
     }
 
-}
+    def withSchema(schema: Schema): Builder = {
+      copy(schema = schema)
+    }
+
+    def withTimestampFormatters(
+        timestampFormatters: Seq[TimestampFormatter]
+    ): Builder = {
+      copy(timestampFormatters = timestampFormatters)
+    }
 
+    def withLogicalTypeHandlers(
+        logicalTypeHandlers: LogicalTypeHandlerStore
+    ): Builder = {
+      copy(logicalTypeHandlers = logicalTypeHandlers)
+    }
 
-private[parquet] case class ParquetFileWriter(recordConsumer: RecordConsumer,
-                                              schema: Schema,
-                                              timestampFormatters: Seq[TimestampFormatter],
-                                              logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-{
+    override def self(): Builder = {
+      this
+    }
 
-    def write(record: PageReader): Unit =
-    {
-        recordConsumer.startMessage()
-        writeRecord(record)
-        recordConsumer.endMessage()
+    override def getWriteSupport(
+        conf: Configuration
+    ): WriteSupport[PageReader] = {
+      ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
     }
+  }
+
+  def builder(): Builder = {
+    Builder()
+  }
+
+}
+
+private[parquet] case class ParquetFileWriter(
+    recordConsumer: RecordConsumer,
+    schema: Schema,
+    timestampFormatters: Seq[TimestampFormatter],
+    logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+) {
+
+  def write(record: PageReader): Unit = {
+    recordConsumer.startMessage()
+    writeRecord(record)
+    recordConsumer.endMessage()
+  }
 
-    private def writeRecord(record: PageReader): Unit =
-    {
-
-        schema.visitColumns(new ColumnVisitor()
-        {
-
-            override def booleanColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addBoolean(record.getBoolean(column))
-                    })
-                })
-            }
-
-            override def longColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addLong(record.getLong(column))
-                    })
-                })
-            }
-
-            override def doubleColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        recordConsumer.addDouble(record.getDouble(column))
-                    })
-                })
-            }
-
-            override def stringColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val bin = Binary.fromString(record.getString(column))
-                        recordConsumer.addBinary(bin)
-                    })
-                })
-            }
-
-            override def timestampColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val t = record.getTimestamp(column)
-
-                        logicalTypeHandlers.get(column.getName, column.getType) match {
-                            case Some(h) =>
-                                h.consume(t, recordConsumer)
-                            case _ =>
-                                val ft = timestampFormatters(column.getIndex).format(t)
-                                val bin = Binary.fromString(ft)
-                                recordConsumer.addBinary(bin)
-                        }
-                    })
-                })
-            }
-
-            override def jsonColumn(column: Column): Unit =
-            {
-                nullOr(column, {
-                    withWriteFieldContext(column, {
-                        val msgPack = record.getJson(column)
-
-                        logicalTypeHandlers.get(column.getName, column.getType) match {
-                            case Some(h) =>
-                                h.consume(msgPack, recordConsumer)
-                            case _ =>
-                                val bin = Binary.fromString(msgPack.toJson)
-                                recordConsumer.addBinary(bin)
-                        }
-                    })
-                })
-            }
-
-            private def nullOr(column: Column,
-                               f: => Unit): Unit =
-            {
-                if (!record.isNull(column)) f
-            }
-
-            private def withWriteFieldContext(column: Column,
-                                              f: => Unit): Unit =
-            {
-                recordConsumer.startField(column.getName, column.getIndex)
-                f
-                recordConsumer.endField(column.getName, column.getIndex)
-            }
+  private def writeRecord(record: PageReader): Unit = {
 
+    schema.visitColumns(new ColumnVisitor() {
+
+      override def booleanColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addBoolean(record.getBoolean(column))
+          })
         })
+      }
 
-    }
+      override def longColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addLong(record.getLong(column))
+          })
+        })
+      }
 
-}
+      override def doubleColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            recordConsumer.addDouble(record.getDouble(column))
+          })
+        })
+      }
+
+      override def stringColumn(column: Column): Unit = {
+        nullOr(column, {
+          withWriteFieldContext(column, {
+            val bin = Binary.fromString(record.getString(column))
+            recordConsumer.addBinary(bin)
+          })
+        })
+      }
+
+      override def timestampColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val t = record.getTimestamp(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(t, recordConsumer)
+                  case _ =>
+                    val ft = timestampFormatters(column.getIndex).format(t)
+                    val bin = Binary.fromString(ft)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      override def jsonColumn(column: Column): Unit = {
+        nullOr(
+          column, {
+            withWriteFieldContext(
+              column, {
+                val msgPack = record.getJson(column)
+
+                logicalTypeHandlers.get(column.getName, column.getType) match {
+                  case Some(h) =>
+                    h.consume(msgPack, recordConsumer)
+                  case _ =>
+                    val bin = Binary.fromString(msgPack.toJson)
+                    recordConsumer.addBinary(bin)
+                }
+              }
+            )
+          }
+        )
+      }
+
+      private def nullOr(column: Column, f: => Unit): Unit = {
+        if (!record.isNull(column)) f
+      }
+
+      private def withWriteFieldContext(column: Column, f: => Unit): Unit = {
+        recordConsumer.startField(column.getName, column.getIndex)
+        f
+        recordConsumer.endField(column.getName, column.getIndex)
+      }
+
+    })
+
+  }
+
+}
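
Note (illustrative, not part of the 0.2.0 sources): the reworked Builder above is the entry point for producing a Parquet file from Embulk pages. A minimal sketch of how it would be wired up follows; the helper name openWriter and the idea of passing a path string, Embulk Schema and timestamp formatters from the caller are assumptions for the example, while builder(), withPath, withSchema and withTimestampFormatters are the methods shown in the diff and build() is inherited from parquet's ParquetWriter.Builder.

    import org.apache.parquet.hadoop.ParquetWriter
    import org.embulk.output.s3_parquet.parquet.ParquetFileWriter
    import org.embulk.spi.{PageReader, Schema}
    import org.embulk.spi.time.TimestampFormatter

    // Hypothetical helper: open a Parquet writer for one output file.
    // In the plugin, the path, schema and formatters come from the Embulk task.
    def openWriter(
        pathString: String,
        schema: Schema,
        timestampFormatters: Seq[TimestampFormatter]
    ): ParquetWriter[PageReader] =
      ParquetFileWriter
        .builder()
        .withPath(pathString)
        .withSchema(schema)
        .withTimestampFormatters(timestampFormatters)
        .build() // build() comes from ParquetWriter.Builder

    // The returned writer writes the PageReader's current record on each
    // write(pageReader) call and is finalized with close(); uploading the
    // finished file to S3 is handled elsewhere in the plugin (see
    // S3ParquetPageOutput in the file list above).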
data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala
@@ -1,6 +1,5 @@
 package org.embulk.output.s3_parquet
 
-
 import java.io.File
 import java.nio.file.FileSystems
 
@@ -17,138 +16,166 @@ import org.embulk.spi.OutputPlugin
 import org.embulk.test.{EmbulkTests, TestingEmbulk}
 import org.junit.Rule
 import org.junit.runner.RunWith
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, FunSuite}
+import org.scalatest.{
+  BeforeAndAfter,
+  BeforeAndAfterAll,
+  DiagrammedAssertions,
+  FunSuite
+}
 import org.scalatestplus.junit.JUnitRunner
 
 import scala.annotation.meta.getter
 import scala.jdk.CollectionConverters._
 
-
 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
     extends FunSuite
-        with BeforeAndAfter
-        with BeforeAndAfterAll
-        with DiagrammedAssertions
-{
-
-    val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
-    val TEST_S3_ENDPOINT: String = "http://localhost:4572"
-    val TEST_S3_REGION: String = "us-east-1"
-    val TEST_S3_ACCESS_KEY_ID: String = "test"
-    val TEST_S3_SECRET_ACCESS_KEY: String = "test"
-    val TEST_BUCKET_NAME: String = "my-bucket"
-
-    @(Rule@getter)
-    val embulk: TestingEmbulk = TestingEmbulk.builder()
-        .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
-        .build()
-
-    before {
-        withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
-    }
-
-    after {
-        withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
-    }
-
-    def defaultOutConfig(): ConfigSource =
-    {
-        embulk.newConfig()
-            .set("type", "s3_parquet")
-            .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
-            .set("bucket", TEST_BUCKET_NAME)
-            .set("path_prefix", "path/to/p")
-            .set("auth_method", "basic")
-            .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
-            .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
-            .set("path_style_access_enabled", true)
-            .set("default_timezone", "Asia/Tokyo")
-    }
-
-
-    test("first test") {
-        val inPath = toPath("in1.csv")
-        val outConfig = defaultOutConfig()
-
-        val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
-
-
-        val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
-            val b = tr.get(classOf[String], "bucket")
-            val k = tr.get(classOf[String], "key")
-            readParquetFile(b, k)
-        }.foldLeft(Seq[Map[String, String]]()) { (merged,
-                                                  records) =>
-            merged ++ records
-        }
-
-        val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-            .stripLineEnd
-            .split("\n")
-            .map(record => record.split("\t").toSeq)
-            .toSeq
-
-        inRecords.zipWithIndex.foreach {
-            case (record, recordIndex) =>
-                0.to(5).foreach { columnIndex =>
-                    val columnName = s"c$columnIndex"
-                    val inData: String = inRecords(recordIndex)(columnIndex)
-                    val outData: String = outRecords(recordIndex).getOrElse(columnName, "")

-                    assert(outData === inData, s"record: $recordIndex, column: $columnName")
-                }
-        }
-    }
-
-    def readParquetFile(bucket: String,
-                        key: String): Seq[Map[String, String]] =
-    {
-        val createdParquetFile = embulk.createTempFile("in")
-        withLocalStackS3Client {s3 =>
-            val xfer = TransferManagerBuilder.standard()
-                .withS3Client(s3)
-                .build()
-            try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-            finally xfer.shutdownNow()
+    with BeforeAndAfter
+    with BeforeAndAfterAll
+    with DiagrammedAssertions {
+
+  val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+  val TEST_S3_ENDPOINT: String = "http://localhost:4572"
+  val TEST_S3_REGION: String = "us-east-1"
+  val TEST_S3_ACCESS_KEY_ID: String = "test"
+  val TEST_S3_SECRET_ACCESS_KEY: String = "test"
+  val TEST_BUCKET_NAME: String = "my-bucket"
+
+  @(Rule @getter)
+  val embulk: TestingEmbulk = TestingEmbulk
+    .builder()
+    .registerPlugin(
+      classOf[OutputPlugin],
+      "s3_parquet",
+      classOf[S3ParquetOutputPlugin]
+    )
+    .build()
+
+  before {
+    withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
+  }
+
+  after {
+    withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
+  }
+
+  def defaultOutConfig(): ConfigSource = {
+    embulk
+      .newConfig()
+      .set("type", "s3_parquet")
+      .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+      .set("bucket", TEST_BUCKET_NAME)
+      .set("path_prefix", "path/to/p")
+      .set("auth_method", "basic")
+      .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
+      .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
+      .set("path_style_access_enabled", true)
+      .set("default_timezone", "Asia/Tokyo")
+  }
+
+  test("first test") {
+    val inPath = toPath("in1.csv")
+    val outConfig = defaultOutConfig()
+
+    val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+    val outRecords: Seq[Map[String, String]] =
+      result.getOutputTaskReports.asScala
+        .map { tr =>
+          val b = tr.get(classOf[String], "bucket")
+          val k = tr.get(classOf[String], "key")
+          readParquetFile(b, k)
         }
-
-        val reader: ParquetReader[SimpleRecord] = ParquetReader
-            .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-            .build()
-
-        def read(reader: ParquetReader[SimpleRecord],
-                 records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
-        {
-            val simpleRecord: SimpleRecord = reader.read()
-            if (simpleRecord != null) {
-                val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-                return read(reader, records :+ r)
-            }
-            records
+        .foldLeft(Seq[Map[String, String]]()) { (merged, records) =>
+          merged ++ records
         }
 
-        try read(reader)
-        finally {
-            reader.close()
-
+    val inRecords: Seq[Seq[String]] = EmbulkTests
+      .readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+      .stripLineEnd
+      .split("\n")
+      .map(record => record.split("\t").toSeq)
+      .toSeq
+
+    inRecords.zipWithIndex.foreach {
+      case (record, recordIndex) =>
+        0.to(5).foreach { columnIndex =>
+          val columnName = s"c$columnIndex"
+          val inData: String = inRecords(recordIndex)(columnIndex)
+          val outData: String =
+            outRecords(recordIndex).getOrElse(columnName, "")

+          assert(
+            outData === inData,
+            s"record: $recordIndex, column: $columnName"
+          )
         }
     }
+  }
+
+  def readParquetFile(bucket: String, key: String): Seq[Map[String, String]] = {
+    val createdParquetFile = embulk.createTempFile("in")
+    withLocalStackS3Client { s3 =>
+      val xfer = TransferManagerBuilder
+        .standard()
+        .withS3Client(s3)
+        .build()
+      try xfer
+        .download(bucket, key, createdParquetFile.toFile)
+        .waitForCompletion()
+      finally xfer.shutdownNow()
+    }
 
-    private def toPath(fileName: String) =
-    {
-        val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
-        FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+    val reader: ParquetReader[SimpleRecord] = ParquetReader
+      .builder(
+        new SimpleReadSupport(),
+        new HadoopPath(createdParquetFile.toString)
+      )
+      .build()
+
+    def read(
+        reader: ParquetReader[SimpleRecord],
+        records: Seq[Map[String, String]] = Seq()
+    ): Seq[Map[String, String]] = {
+      val simpleRecord: SimpleRecord = reader.read()
+      if (simpleRecord != null) {
+        val r: Map[String, String] = simpleRecord.getValues.asScala
+          .map(v => v.getName -> v.getValue.toString)
+          .toMap
+        return read(reader, records :+ r)
+      }
+      records
     }
 
-    private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
-        val client: AmazonS3 = AmazonS3ClientBuilder.standard
-            .withEndpointConfiguration(new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION))
-            .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(TEST_S3_ACCESS_KEY_ID, TEST_S3_SECRET_ACCESS_KEY)))
-            .withPathStyleAccessEnabled(true)
-            .build()
+    try read(reader)
+    finally {
+      reader.close()
 
-        try f(client)
-        finally client.shutdown()
     }
+  }
+
+  private def toPath(fileName: String) = {
+    val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+    FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+  }
+
+  private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
+    val client: AmazonS3 = AmazonS3ClientBuilder.standard
+      .withEndpointConfiguration(
+        new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION)
+      )
+      .withCredentials(
+        new AWSStaticCredentialsProvider(
+          new BasicAWSCredentials(
            TEST_S3_ACCESS_KEY_ID,
+            TEST_S3_SECRET_ACCESS_KEY
+          )
+        )
+      )
+      .withPathStyleAccessEnabled(true)
+      .build()
+
+    try f(client)
+    finally client.shutdown()
+  }
 }
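
Note (illustrative, not part of the test suite): the LocalStack endpoint, region and dummy credentials above can also be reused to inspect what the plugin actually uploaded when an assertion fails. A hypothetical listUploadedKeys helper, built from the same AWS S3 client classes the test already uses:

    import scala.jdk.CollectionConverters._
    import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
    import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
    import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}

    // Hypothetical debugging helper: list the keys the plugin uploaded under a
    // prefix in the LocalStack bucket (endpoint/credentials mirror the test constants).
    def listUploadedKeys(
        bucket: String,
        prefix: String,
        endpoint: String = "http://localhost:4572",
        region: String = "us-east-1"
    ): Seq[String] = {
      val client: AmazonS3 = AmazonS3ClientBuilder.standard
        .withEndpointConfiguration(new EndpointConfiguration(endpoint, region))
        .withCredentials(
          new AWSStaticCredentialsProvider(new BasicAWSCredentials("test", "test"))
        )
        .withPathStyleAccessEnabled(true)
        .build()
      try client.listObjectsV2(bucket, prefix).getObjectSummaries.asScala.map(_.getKey).toSeq
      finally client.shutdown()
    }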