embulk-output-s3_parquet 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +3 -0
  3. data/.github/workflows/test.yml +2 -0
  4. data/.scalafmt.conf +5 -0
  5. data/CHANGELOG.md +15 -0
  6. data/README.md +3 -2
  7. data/build.gradle +19 -9
  8. data/example/config.yml +3 -1
  9. data/example/prepare_s3_bucket.sh +6 -0
  10. data/example/with_catalog.yml +3 -1
  11. data/example/with_logicaltypes.yml +3 -1
  12. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  13. data/gradle/wrapper/gradle-wrapper.properties +1 -1
  14. data/gradlew +31 -20
  15. data/gradlew.bat +17 -1
  16. data/run_s3_local.sh +7 -0
  17. data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +226 -178
  18. data/src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala +18 -0
  19. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +293 -204
  20. data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +46 -49
  21. data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +46 -50
  22. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +18 -23
  23. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +146 -119
  24. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +32 -35
  25. data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +45 -41
  26. data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -43
  27. data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +138 -92
  28. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandler.scala +117 -102
  29. data/src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeHandlerStore.scala +91 -84
  30. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +30 -29
  31. data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +143 -152
  32. data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +144 -117
  33. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandler.scala +72 -66
  34. data/src/test/scala/org/embulk/output/s3_parquet/parquet/TestLogicalTypeHandlerStore.scala +149 -132
  35. metadata +22 -15
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala

@@ -1,6 +1,5 @@
  package org.embulk.output.s3_parquet.parquet

-
  import org.apache.hadoop.conf.Configuration
  import org.apache.parquet.hadoop.api.WriteSupport
  import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
@@ -11,32 +10,34 @@ import org.embulk.spi.time.TimestampFormatter

  import scala.jdk.CollectionConverters._

-
- private[parquet] case class ParquetFileWriteSupport(schema: Schema,
-                                                     timestampFormatters: Seq[TimestampFormatter],
-                                                     logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-     extends WriteSupport[PageReader]
- {
-
-     private var currentParquetFileWriter: ParquetFileWriter = _
-
-     override def init(configuration: Configuration): WriteContext =
-     {
-         val messageType: MessageType = EmbulkMessageType.builder()
-             .withSchema(schema)
-             .withLogicalTypeHandlers(logicalTypeHandlers)
-             .build()
-         val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
-         new WriteContext(messageType, metadata.asJava)
-     }
-
-     override def prepareForWrite(recordConsumer: RecordConsumer): Unit =
-     {
-         currentParquetFileWriter = ParquetFileWriter(recordConsumer, schema, timestampFormatters, logicalTypeHandlers)
-     }
-
-     override def write(record: PageReader): Unit =
-     {
-         currentParquetFileWriter.write(record)
-     }
+ private[parquet] case class ParquetFileWriteSupport(
+     schema: Schema,
+     timestampFormatters: Seq[TimestampFormatter],
+     logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+ ) extends WriteSupport[PageReader] {
+
+   private var currentParquetFileWriter: ParquetFileWriter = _
+
+   override def init(configuration: Configuration): WriteContext = {
+     val messageType: MessageType = EmbulkMessageType
+       .builder()
+       .withSchema(schema)
+       .withLogicalTypeHandlers(logicalTypeHandlers)
+       .build()
+     val metadata: Map[String, String] = Map.empty // NOTE: When is this used?
+     new WriteContext(messageType, metadata.asJava)
+   }
+
+   override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
+     currentParquetFileWriter = ParquetFileWriter(
+       recordConsumer,
+       schema,
+       timestampFormatters,
+       logicalTypeHandlers
+     )
+   }
+
+   override def write(record: PageReader): Unit = {
+     currentParquetFileWriter.write(record)
+   }
  }
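
For orientation, ParquetFileWriteSupport is the parquet-mr WriteSupport hook: init derives the Parquet MessageType from the Embulk Schema, prepareForWrite receives the low-level RecordConsumer, and write hands each record to ParquetFileWriter. A minimal sketch of that lifecycle, assuming code living in the same org.embulk.output.s3_parquet.parquet package (the parameter names here are illustrative and not part of this diff); in normal use this sequence is driven by ParquetWriter via the Builder in the next file, not called by hand:

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.embulk.spi.{PageReader, Schema}
import org.embulk.spi.time.TimestampFormatter

def writeSupportLifecycle(
    schema: Schema,
    timestampFormatters: Seq[TimestampFormatter],
    recordConsumer: RecordConsumer,
    record: PageReader
): Unit = {
  val support = ParquetFileWriteSupport(schema, timestampFormatters)
  // init(): builds the Parquet MessageType from the Embulk Schema
  val ctx: WriteContext = support.init(new Configuration())
  // prepareForWrite(): binds the RecordConsumer that the column visitors emit into
  support.prepareForWrite(recordConsumer)
  // write(): one call per record; delegates to ParquetFileWriter.write
  support.write(record)
}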
data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala

@@ -1,6 +1,5 @@
  package org.embulk.output.s3_parquet.parquet

-
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.parquet.hadoop.ParquetWriter
@@ -9,168 +8,160 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer}
  import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema}
  import org.embulk.spi.time.TimestampFormatter

+ object ParquetFileWriter {
+
+   case class Builder(
+       path: Path = null,
+       schema: Schema = null,
+       timestampFormatters: Seq[TimestampFormatter] = null,
+       logicalTypeHandlers: LogicalTypeHandlerStore =
+         LogicalTypeHandlerStore.empty
+   ) extends ParquetWriter.Builder[PageReader, Builder](path) {

- object ParquetFileWriter
- {
-
-     case class Builder(path: Path = null,
-                        schema: Schema = null,
-                        timestampFormatters: Seq[TimestampFormatter] = null,
-                        logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
-         extends ParquetWriter.Builder[PageReader, Builder](path)
-     {
-
-         def withPath(path: Path): Builder =
-         {
-             copy(path = path)
-         }
-
-         def withPath(pathString: String): Builder =
-         {
-             copy(path = new Path(pathString))
-         }
-
-         def withSchema(schema: Schema): Builder =
-         {
-             copy(schema = schema)
-         }
-
-         def withTimestampFormatters(timestampFormatters: Seq[TimestampFormatter]): Builder =
-         {
-             copy(timestampFormatters = timestampFormatters)
-         }
-
-         def withLogicalTypeHandlers(logicalTypeHandlers: LogicalTypeHandlerStore): Builder =
-         {
-             copy(logicalTypeHandlers = logicalTypeHandlers)
-         }
-
-         override def self(): Builder =
-         {
-             this
-         }
-
-         override def getWriteSupport(conf: Configuration): WriteSupport[PageReader] =
-         {
-             ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
-         }
+     def withPath(path: Path): Builder = {
+       copy(path = path)
  }

-     def builder(): Builder =
-     {
-         Builder()
+     def withPath(pathString: String): Builder = {
+       copy(path = new Path(pathString))
  }

- }
+     def withSchema(schema: Schema): Builder = {
+       copy(schema = schema)
+     }
+
+     def withTimestampFormatters(
+         timestampFormatters: Seq[TimestampFormatter]
+     ): Builder = {
+       copy(timestampFormatters = timestampFormatters)
+     }

+     def withLogicalTypeHandlers(
+         logicalTypeHandlers: LogicalTypeHandlerStore
+     ): Builder = {
+       copy(logicalTypeHandlers = logicalTypeHandlers)
+     }

- private[parquet] case class ParquetFileWriter(recordConsumer: RecordConsumer,
-                                               schema: Schema,
-                                               timestampFormatters: Seq[TimestampFormatter],
-                                               logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty)
- {
+     override def self(): Builder = {
+       this
+     }

-     def write(record: PageReader): Unit =
-     {
-         recordConsumer.startMessage()
-         writeRecord(record)
-         recordConsumer.endMessage()
+     override def getWriteSupport(
+         conf: Configuration
+     ): WriteSupport[PageReader] = {
+       ParquetFileWriteSupport(schema, timestampFormatters, logicalTypeHandlers)
  }
+   }
+
+   def builder(): Builder = {
+     Builder()
+   }
+
+ }
+
+ private[parquet] case class ParquetFileWriter(
+     recordConsumer: RecordConsumer,
+     schema: Schema,
+     timestampFormatters: Seq[TimestampFormatter],
+     logicalTypeHandlers: LogicalTypeHandlerStore = LogicalTypeHandlerStore.empty
+ ) {
+
+   def write(record: PageReader): Unit = {
+     recordConsumer.startMessage()
+     writeRecord(record)
+     recordConsumer.endMessage()
+   }

-     private def writeRecord(record: PageReader): Unit =
-     {
-
-         schema.visitColumns(new ColumnVisitor()
-         {
-
-             override def booleanColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         recordConsumer.addBoolean(record.getBoolean(column))
-                     })
-                 })
-             }
-
-             override def longColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         recordConsumer.addLong(record.getLong(column))
-                     })
-                 })
-             }
-
-             override def doubleColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         recordConsumer.addDouble(record.getDouble(column))
-                     })
-                 })
-             }
-
-             override def stringColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         val bin = Binary.fromString(record.getString(column))
-                         recordConsumer.addBinary(bin)
-                     })
-                 })
-             }
-
-             override def timestampColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         val t = record.getTimestamp(column)
-
-                         logicalTypeHandlers.get(column.getName, column.getType) match {
-                             case Some(h) =>
-                                 h.consume(t, recordConsumer)
-                             case _ =>
-                                 val ft = timestampFormatters(column.getIndex).format(t)
-                                 val bin = Binary.fromString(ft)
-                                 recordConsumer.addBinary(bin)
-                         }
-                     })
-                 })
-             }
-
-             override def jsonColumn(column: Column): Unit =
-             {
-                 nullOr(column, {
-                     withWriteFieldContext(column, {
-                         val msgPack = record.getJson(column)
-
-                         logicalTypeHandlers.get(column.getName, column.getType) match {
-                             case Some(h) =>
-                                 h.consume(msgPack, recordConsumer)
-                             case _ =>
-                                 val bin = Binary.fromString(msgPack.toJson)
-                                 recordConsumer.addBinary(bin)
-                         }
-                     })
-                 })
-             }
-
-             private def nullOr(column: Column,
-                                f: => Unit): Unit =
-             {
-                 if (!record.isNull(column)) f
-             }
-
-             private def withWriteFieldContext(column: Column,
-                                               f: => Unit): Unit =
-             {
-                 recordConsumer.startField(column.getName, column.getIndex)
-                 f
-                 recordConsumer.endField(column.getName, column.getIndex)
-             }
+   private def writeRecord(record: PageReader): Unit = {

+     schema.visitColumns(new ColumnVisitor() {
+
+       override def booleanColumn(column: Column): Unit = {
+         nullOr(column, {
+           withWriteFieldContext(column, {
+             recordConsumer.addBoolean(record.getBoolean(column))
+           })
  })
+       }

- }
+       override def longColumn(column: Column): Unit = {
+         nullOr(column, {
+           withWriteFieldContext(column, {
+             recordConsumer.addLong(record.getLong(column))
+           })
+         })
+       }

- }
+       override def doubleColumn(column: Column): Unit = {
+         nullOr(column, {
+           withWriteFieldContext(column, {
+             recordConsumer.addDouble(record.getDouble(column))
+           })
+         })
+       }
+
+       override def stringColumn(column: Column): Unit = {
+         nullOr(column, {
+           withWriteFieldContext(column, {
+             val bin = Binary.fromString(record.getString(column))
+             recordConsumer.addBinary(bin)
+           })
+         })
+       }
+
+       override def timestampColumn(column: Column): Unit = {
+         nullOr(
+           column, {
+             withWriteFieldContext(
+               column, {
+                 val t = record.getTimestamp(column)
+
+                 logicalTypeHandlers.get(column.getName, column.getType) match {
+                   case Some(h) =>
+                     h.consume(t, recordConsumer)
+                   case _ =>
+                     val ft = timestampFormatters(column.getIndex).format(t)
+                     val bin = Binary.fromString(ft)
+                     recordConsumer.addBinary(bin)
+                 }
+               }
+             )
+           }
+         )
+       }
+
+       override def jsonColumn(column: Column): Unit = {
+         nullOr(
+           column, {
+             withWriteFieldContext(
+               column, {
+                 val msgPack = record.getJson(column)
+
+                 logicalTypeHandlers.get(column.getName, column.getType) match {
+                   case Some(h) =>
+                     h.consume(msgPack, recordConsumer)
+                   case _ =>
+                     val bin = Binary.fromString(msgPack.toJson)
+                     recordConsumer.addBinary(bin)
+                 }
+               }
+             )
+           }
+         )
+       }
+
+       private def nullOr(column: Column, f: => Unit): Unit = {
+         if (!record.isNull(column)) f
+       }
+
+       private def withWriteFieldContext(column: Column, f: => Unit): Unit = {
+         recordConsumer.startField(column.getName, column.getIndex)
+         f
+         recordConsumer.endField(column.getName, column.getIndex)
+       }
+
+     })
+
+   }
+
+ }
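
The companion object above exposes the builder the plugin uses to obtain a ParquetWriter[PageReader]. A minimal usage sketch, assuming a Schema and its TimestampFormatter list are already in hand; the path string is purely illustrative and build() is inherited from ParquetWriter.Builder:

import org.apache.parquet.hadoop.ParquetWriter
import org.embulk.spi.{PageReader, Schema}
import org.embulk.spi.time.TimestampFormatter

def newWriter(
    schema: Schema,
    timestampFormatters: Seq[TimestampFormatter]
): ParquetWriter[PageReader] =
  ParquetFileWriter
    .builder()
    .withPath("/tmp/embulk-output.parquet") // illustrative local path
    .withSchema(schema)                     // Embulk schema of the incoming pages
    .withTimestampFormatters(timestampFormatters)
    .withLogicalTypeHandlers(LogicalTypeHandlerStore.empty)
    .build()

Each record is then written by passing the positioned PageReader to writer.write, which routes through ParquetFileWriteSupport into the ColumnVisitor shown above.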
data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala

@@ -1,6 +1,5 @@
  package org.embulk.output.s3_parquet

-
  import java.io.File
  import java.nio.file.FileSystems

@@ -17,138 +16,166 @@ import org.embulk.spi.OutputPlugin
  import org.embulk.test.{EmbulkTests, TestingEmbulk}
  import org.junit.Rule
  import org.junit.runner.RunWith
- import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, FunSuite}
+ import org.scalatest.{
+   BeforeAndAfter,
+   BeforeAndAfterAll,
+   DiagrammedAssertions,
+   FunSuite
+ }
  import org.scalatestplus.junit.JUnitRunner

  import scala.annotation.meta.getter
  import scala.jdk.CollectionConverters._

-
  @RunWith(classOf[JUnitRunner])
  class TestS3ParquetOutputPlugin
      extends FunSuite
-         with BeforeAndAfter
-         with BeforeAndAfterAll
-         with DiagrammedAssertions
- {
-
-     val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
-     val TEST_S3_ENDPOINT: String = "http://localhost:4572"
-     val TEST_S3_REGION: String = "us-east-1"
-     val TEST_S3_ACCESS_KEY_ID: String = "test"
-     val TEST_S3_SECRET_ACCESS_KEY: String = "test"
-     val TEST_BUCKET_NAME: String = "my-bucket"
-
-     @(Rule@getter)
-     val embulk: TestingEmbulk = TestingEmbulk.builder()
-         .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
-         .build()
-
-     before {
-         withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
-     }
-
-     after {
-         withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
-     }
-
-     def defaultOutConfig(): ConfigSource =
-     {
-         embulk.newConfig()
-             .set("type", "s3_parquet")
-             .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
-             .set("bucket", TEST_BUCKET_NAME)
-             .set("path_prefix", "path/to/p")
-             .set("auth_method", "basic")
-             .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
-             .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
-             .set("path_style_access_enabled", true)
-             .set("default_timezone", "Asia/Tokyo")
-     }
-
-
-     test("first test") {
-         val inPath = toPath("in1.csv")
-         val outConfig = defaultOutConfig()
-
-         val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
-
-
-         val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
-             val b = tr.get(classOf[String], "bucket")
-             val k = tr.get(classOf[String], "key")
-             readParquetFile(b, k)
-         }.foldLeft(Seq[Map[String, String]]()) { (merged,
-                                                   records) =>
-             merged ++ records
-         }
-
-         val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-             .stripLineEnd
-             .split("\n")
-             .map(record => record.split("\t").toSeq)
-             .toSeq
-
-         inRecords.zipWithIndex.foreach {
-             case (record, recordIndex) =>
-                 0.to(5).foreach { columnIndex =>
-                     val columnName = s"c$columnIndex"
-                     val inData: String = inRecords(recordIndex)(columnIndex)
-                     val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
-
-                     assert(outData === inData, s"record: $recordIndex, column: $columnName")
-                 }
-         }
-     }
-
-     def readParquetFile(bucket: String,
-                         key: String): Seq[Map[String, String]] =
-     {
-         val createdParquetFile = embulk.createTempFile("in")
-         withLocalStackS3Client {s3 =>
-             val xfer = TransferManagerBuilder.standard()
-                 .withS3Client(s3)
-                 .build()
-             try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-             finally xfer.shutdownNow()
+     with BeforeAndAfter
+     with BeforeAndAfterAll
+     with DiagrammedAssertions {
+
+   val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+   val TEST_S3_ENDPOINT: String = "http://localhost:4572"
+   val TEST_S3_REGION: String = "us-east-1"
+   val TEST_S3_ACCESS_KEY_ID: String = "test"
+   val TEST_S3_SECRET_ACCESS_KEY: String = "test"
+   val TEST_BUCKET_NAME: String = "my-bucket"
+
+   @(Rule @getter)
+   val embulk: TestingEmbulk = TestingEmbulk
+     .builder()
+     .registerPlugin(
+       classOf[OutputPlugin],
+       "s3_parquet",
+       classOf[S3ParquetOutputPlugin]
+     )
+     .build()
+
+   before {
+     withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME))
+   }
+
+   after {
+     withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME))
+   }
+
+   def defaultOutConfig(): ConfigSource = {
+     embulk
+       .newConfig()
+       .set("type", "s3_parquet")
+       .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+       .set("bucket", TEST_BUCKET_NAME)
+       .set("path_prefix", "path/to/p")
+       .set("auth_method", "basic")
+       .set("access_key_id", TEST_S3_ACCESS_KEY_ID)
+       .set("secret_access_key", TEST_S3_SECRET_ACCESS_KEY)
+       .set("path_style_access_enabled", true)
+       .set("default_timezone", "Asia/Tokyo")
+   }
+
+   test("first test") {
+     val inPath = toPath("in1.csv")
+     val outConfig = defaultOutConfig()
+
+     val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+     val outRecords: Seq[Map[String, String]] =
+       result.getOutputTaskReports.asScala
+         .map { tr =>
+           val b = tr.get(classOf[String], "bucket")
+           val k = tr.get(classOf[String], "key")
+           readParquetFile(b, k)
  }
-
-         val reader: ParquetReader[SimpleRecord] = ParquetReader
-             .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-             .build()
-
-         def read(reader: ParquetReader[SimpleRecord],
-                  records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
-         {
-             val simpleRecord: SimpleRecord = reader.read()
-             if (simpleRecord != null) {
-                 val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-                 return read(reader, records :+ r)
-             }
-             records
+         .foldLeft(Seq[Map[String, String]]()) { (merged, records) =>
+           merged ++ records
  }

-         try read(reader)
-         finally {
-             reader.close()
-
+     val inRecords: Seq[Seq[String]] = EmbulkTests
+       .readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+       .stripLineEnd
+       .split("\n")
+       .map(record => record.split("\t").toSeq)
+       .toSeq
+
+     inRecords.zipWithIndex.foreach {
+       case (record, recordIndex) =>
+         0.to(5).foreach { columnIndex =>
+           val columnName = s"c$columnIndex"
+           val inData: String = inRecords(recordIndex)(columnIndex)
+           val outData: String =
+             outRecords(recordIndex).getOrElse(columnName, "")
+
+           assert(
+             outData === inData,
+             s"record: $recordIndex, column: $columnName"
+           )
  }
  }
+   }
+
+   def readParquetFile(bucket: String, key: String): Seq[Map[String, String]] = {
+     val createdParquetFile = embulk.createTempFile("in")
+     withLocalStackS3Client { s3 =>
+       val xfer = TransferManagerBuilder
+         .standard()
+         .withS3Client(s3)
+         .build()
+       try xfer
+         .download(bucket, key, createdParquetFile.toFile)
+         .waitForCompletion()
+       finally xfer.shutdownNow()
+     }

-     private def toPath(fileName: String) =
-     {
-         val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
-         FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+     val reader: ParquetReader[SimpleRecord] = ParquetReader
+       .builder(
+         new SimpleReadSupport(),
+         new HadoopPath(createdParquetFile.toString)
+       )
+       .build()
+
+     def read(
+         reader: ParquetReader[SimpleRecord],
+         records: Seq[Map[String, String]] = Seq()
+     ): Seq[Map[String, String]] = {
+       val simpleRecord: SimpleRecord = reader.read()
+       if (simpleRecord != null) {
+         val r: Map[String, String] = simpleRecord.getValues.asScala
+           .map(v => v.getName -> v.getValue.toString)
+           .toMap
+         return read(reader, records :+ r)
+       }
+       records
  }

-     private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
-         val client: AmazonS3 = AmazonS3ClientBuilder.standard
-             .withEndpointConfiguration(new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION))
-             .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(TEST_S3_ACCESS_KEY_ID, TEST_S3_SECRET_ACCESS_KEY)))
-             .withPathStyleAccessEnabled(true)
-             .build()
+     try read(reader)
+     finally {
+       reader.close()

-         try f(client)
-         finally client.shutdown()
  }
+   }
+
+   private def toPath(fileName: String) = {
+     val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+     FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+   }
+
+   private def withLocalStackS3Client[A](f: AmazonS3 => A): A = {
+     val client: AmazonS3 = AmazonS3ClientBuilder.standard
+       .withEndpointConfiguration(
+         new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION)
+       )
+       .withCredentials(
+         new AWSStaticCredentialsProvider(
+           new BasicAWSCredentials(
+             TEST_S3_ACCESS_KEY_ID,
+             TEST_S3_SECRET_ACCESS_KEY
+           )
+         )
+       )
+       .withPathStyleAccessEnabled(true)
+       .build()
+
+     try f(client)
+     finally client.shutdown()
+   }
  }