embulk-output-s3_parquet 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
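
Two themes run through this release: the build moves to Scala 2.13 (note `classpath/scala-library-2.13.0.jar` in the metadata diff below), and AWS Glue support arrives via the new `CatalogRegistrator.scala` and the bundled `aws-java-sdk-glue` jar. The Scala bump is visible in the test diff, where a removed `import scala.` line (truncated in this view) becomes `import scala.jdk.CollectionConverters._`. A minimal sketch of that migration, assuming the removed import was the pre-2.13 `scala.collection.JavaConverters` (the truncation hides it):

```scala
// Scala 2.12 and earlier (deprecated since 2.13) -- assumed "before" state,
// since the removed import line is truncated in this diff view:
//   import scala.collection.JavaConverters._

// Scala 2.13 replacement, as added in this release:
import scala.jdk.CollectionConverters._

object ConvertersDemo {
  def main(args: Array[String]): Unit = {
    val javaList: java.util.List[String] = java.util.Arrays.asList("bucket", "key")
    // asScala decorates Java collections with Scala views under either import
    val names: Seq[String] = javaList.asScala.toSeq
    println(names)
  }
}
```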
data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala CHANGED

@@ -1,8 +1,8 @@
 package org.embulk.output.s3_parquet


-import java.io.
-import java.nio.file.
+import java.io.File
+import java.nio.file.FileSystems

 import cloud.localstack.{DockerTestUtils, Localstack, TestUtils}
 import cloud.localstack.docker.LocalstackDocker
@@ -21,120 +21,129 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, FunSuite}
 import org.scalatest.junit.JUnitRunner

 import scala.annotation.meta.getter
-import scala.
+import scala.jdk.CollectionConverters._
+

 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    override protected def afterAll(): Unit = {
-        LOCALSTACK_DOCKER.stop()
-        super.afterAll()
-    }
-
-    @(Rule@getter)
-    val embulk: TestingEmbulk = TestingEmbulk.builder()
-        .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
-        .build()
-
-    before {
-        DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
-    }
-
-    def defaultOutConfig(): ConfigSource = {
-        embulk.newConfig()
-            .set("type", "s3_parquet")
-            .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
-            .set("bucket", BUCKET_NAME)
-            .set("path_prefix", "path/to/p")
-            .set("auth_method", "basic")
-            .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
-            .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
-            .set("path_style_access_enabled", true)
-            .set("default_timezone", "Asia/Tokyo")
-    }
-
-
-    test("first test") {
-        val inPath = toPath("in1.csv")
-        val outConfig = defaultOutConfig()
-
-        val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
-
-
-        val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
-            val b = tr.get(classOf[String], "bucket")
-            val k = tr.get(classOf[String], "key")
-            readParquetFile(b, k)
-        }.foldLeft(Seq[Map[String, String]]()) { (merged,
-                                                  records) =>
-            merged ++ records
+    extends FunSuite
+        with BeforeAndAfter
+        with BeforeAndAfterAll
+        with DiagrammedAssertions
+{
+
+    val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+    val BUCKET_NAME: String = "my-bucket"
+
+    val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
+
+    override protected def beforeAll(): Unit =
+    {
+        Localstack.teardownInfrastructure()
+        LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
+        super.beforeAll()
     }

-
-
-
-
+    override protected def afterAll(): Unit =
+    {
+        LOCALSTACK_DOCKER.stop()
+        super.afterAll()
+    }

-
-
-
-
-        val inData: String = inRecords(recordIndex)(columnIndex)
-        val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+    @(Rule@getter)
+    val embulk: TestingEmbulk = TestingEmbulk.builder()
+        .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
+        .build()

-
-
+    before {
+        DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
     }
-
-
-
-
-
-
-
-
-
-
-
-
-
-        .build()
-
-    def read(reader: ParquetReader[SimpleRecord],
-             records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] = {
-        val simpleRecord: SimpleRecord = reader.read()
-        if (simpleRecord != null) {
-            val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-            return read(reader, records :+ r)
-        }
-        records
+
+    def defaultOutConfig(): ConfigSource =
+    {
+        embulk.newConfig()
+            .set("type", "s3_parquet")
+            .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+            .set("bucket", BUCKET_NAME)
+            .set("path_prefix", "path/to/p")
+            .set("auth_method", "basic")
+            .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
+            .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
+            .set("path_style_access_enabled", true)
+            .set("default_timezone", "Asia/Tokyo")
     }

-        try read(reader)
-        finally {
-            reader.close()

+    test("first test") {
+        val inPath = toPath("in1.csv")
+        val outConfig = defaultOutConfig()
+
+        val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+
+        val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
+            val b = tr.get(classOf[String], "bucket")
+            val k = tr.get(classOf[String], "key")
+            readParquetFile(b, k)
+        }.foldLeft(Seq[Map[String, String]]()) { (merged,
+                                                  records) =>
+            merged ++ records
+        }
+
+        val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+            .stripLineEnd
+            .split("\n")
+            .map(record => record.split("\t").toSeq)
+            .toSeq
+
+        inRecords.zipWithIndex.foreach {
+            case (record, recordIndex) =>
+                0.to(5).foreach { columnIndex =>
+                    val columnName = s"c$columnIndex"
+                    val inData: String = inRecords(recordIndex)(columnIndex)
+                    val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+
+                    assert(outData === inData, s"record: $recordIndex, column: $columnName")
+                }
+        }
+    }
+
+    def readParquetFile(bucket: String,
+                        key: String): Seq[Map[String, String]] =
+    {
+        val xfer = TransferManagerBuilder.standard()
+            .withS3Client(DockerTestUtils.getClientS3)
+            .build()
+        val createdParquetFile = embulk.createTempFile("in")
+        try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
+        finally xfer.shutdownNow()
+
+        val reader: ParquetReader[SimpleRecord] = ParquetReader
+            .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
+            .build()
+
+        def read(reader: ParquetReader[SimpleRecord],
+                 records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
+        {
+            val simpleRecord: SimpleRecord = reader.read()
+            if (simpleRecord != null) {
+                val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
+                return read(reader, records :+ r)
+            }
+            records
+        }
+
+        try read(reader)
+        finally {
+            reader.close()
+
+        }
     }
-    }

-
-
-
-
+    private def toPath(fileName: String) =
+    {
+        val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+        FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+    }

 }
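
The new `readParquetFile` helper downloads the object via the S3 TransferManager, then drains it with a recursive inner `read` function. An iterator-based equivalent (a sketch only, not code from this diff; it assumes the same parquet-tools `SimpleRecord` class the test uses) makes the end-of-stream contract explicit: `ParquetReader.read()` returns null once the file is exhausted.

```scala
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.tools.read.SimpleRecord

import scala.jdk.CollectionConverters._

object ParquetDrain {
  // Sketch: an iterator-based equivalent of the diff's recursive `read` helper.
  // We read until the first null (EOF) and map each record to name -> value,
  // closing the reader whether or not reading succeeds.
  def readAll(reader: ParquetReader[SimpleRecord]): Seq[Map[String, String]] =
    try Iterator
      .continually(reader.read())
      .takeWhile(_ != null)
      .map(_.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap)
      .toSeq
    finally reader.close()
}
```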
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-s3_parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-
+date: 2019-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -46,7 +46,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".scalafmt.conf"
 - CHANGELOG.md
 - LICENSE.txt
 - README.md
@@ -60,11 +59,13 @@ files:
 - classpath/asm-3.1.jar
 - classpath/asm-5.0.4.jar
 - classpath/avro-1.7.7.jar
-- classpath/aws-java-sdk-core-1.11.
-- classpath/aws-java-sdk-
-- classpath/aws-java-sdk-
-- classpath/aws-java-sdk-
+- classpath/aws-java-sdk-core-1.11.592.jar
+- classpath/aws-java-sdk-glue-1.11.592.jar
+- classpath/aws-java-sdk-kms-1.11.592.jar
+- classpath/aws-java-sdk-s3-1.11.592.jar
+- classpath/aws-java-sdk-sts-1.11.592.jar
 - classpath/commons-beanutils-1.7.0.jar
+- classpath/commons-cli-1.2.jar
 - classpath/commons-codec-1.10.jar
 - classpath/commons-collections-3.2.2.jar
 - classpath/commons-compress-1.4.1.jar
@@ -79,7 +80,7 @@ files:
 - classpath/curator-client-2.7.1.jar
 - classpath/curator-framework-2.7.1.jar
 - classpath/curator-recipes-2.7.1.jar
-- classpath/embulk-output-s3_parquet-0.0.2.jar
+- classpath/embulk-output-s3_parquet-0.0.3.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.9.2.jar
 - classpath/hadoop-auth-2.9.2.jar
@@ -107,7 +108,7 @@ files:
 - classpath/jetty-sslengine-6.1.26.jar
 - classpath/jetty-util-6.1.26.jar
 - classpath/jline-0.9.94.jar
-- classpath/jmespath-java-1.11.
+- classpath/jmespath-java-1.11.592.jar
 - classpath/jsch-0.1.54.jar
 - classpath/json-smart-2.3.jar
 - classpath/jsp-api-2.1.jar
@@ -116,19 +117,19 @@ files:
 - classpath/netty-3.7.0.Final.jar
 - classpath/nimbus-jose-jwt-4.41.1.jar
 - classpath/paranamer-2.3.jar
-- classpath/parquet-column-1.10.
-- classpath/parquet-common-1.10.
-- classpath/parquet-encoding-1.10.
+- classpath/parquet-column-1.10.1.jar
+- classpath/parquet-common-1.10.1.jar
+- classpath/parquet-encoding-1.10.1.jar
 - classpath/parquet-format-2.4.0.jar
-- classpath/parquet-hadoop-1.10.
-- classpath/parquet-jackson-1.10.
+- classpath/parquet-hadoop-1.10.1.jar
+- classpath/parquet-jackson-1.10.1.jar
 - classpath/protobuf-java-2.5.0.jar
-- classpath/scala-library-2.
+- classpath/scala-library-2.13.0.jar
 - classpath/servlet-api-2.5-20081211.jar
 - classpath/servlet-api-2.5.jar
 - classpath/slf4j-api-1.7.25.jar
 - classpath/slf4j-log4j12-1.7.25.jar
-- classpath/snappy-java-1.1.7.
+- classpath/snappy-java-1.1.7.3.jar
 - classpath/stax-api-1.0-2.jar
 - classpath/stax2-api-3.1.4.jar
 - classpath/woodstox-core-5.0.3.jar
@@ -143,6 +144,7 @@ files:
 - gradlew.bat
 - lib/embulk/output/s3_parquet.rb
 - settings.gradle
+- src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala
 - src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala
data/.scalafmt.conf DELETED
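
Beyond the version bumps (aws-java-sdk 1.11.592, parquet 1.10.1, snappy-java 1.1.7.3, scala-library 2.13.0), the notable additions are `CatalogRegistrator.scala` and the `aws-java-sdk-glue` jar, which together suggest this release can register output files in the AWS Glue Data Catalog. The following is a sketch of the kind of Glue call that jar enables; it is not the plugin's actual `CatalogRegistrator` implementation, and the database, table, and location names are hypothetical:

```scala
import com.amazonaws.services.glue.{AWSGlue, AWSGlueClientBuilder}
import com.amazonaws.services.glue.model.{CreateTableRequest, StorageDescriptor, TableInput}

object GlueRegistrationSketch {
  def main(args: Array[String]): Unit = {
    val glue: AWSGlue = AWSGlueClientBuilder.standard().build()

    // Hypothetical table pointing at an S3 output prefix like the plugin writes
    val storage = new StorageDescriptor()
      .withLocation("s3://my-bucket/path/to/")

    glue.createTable(
      new CreateTableRequest()
        .withDatabaseName("my_database") // hypothetical database
        .withTableInput(new TableInput()
          .withName("my_table")          // hypothetical table
          .withStorageDescriptor(storage)))
  }
}
```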