embulk-output-s3_parquet 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/build.gradle +12 -13
- data/src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala +178 -0
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala +166 -144
- data/src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala +43 -35
- data/src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala +47 -29
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala +22 -14
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala +104 -95
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala +34 -26
- data/src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala +39 -31
- data/src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala +40 -32
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/EmbulkMessageType.scala +57 -37
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala +26 -19
- data/src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriter.scala +128 -94
- data/src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala +113 -104
- metadata +18 -16
- data/.scalafmt.conf +0 -9
@@ -1,8 +1,8 @@
|
|
1
1
|
package org.embulk.output.s3_parquet
|
2
2
|
|
3
3
|
|
4
|
-
import java.io.
|
5
|
-
import java.nio.file.
|
4
|
+
import java.io.File
|
5
|
+
import java.nio.file.FileSystems
|
6
6
|
|
7
7
|
import cloud.localstack.{DockerTestUtils, Localstack, TestUtils}
|
8
8
|
import cloud.localstack.docker.LocalstackDocker
|
@@ -21,120 +21,129 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, F
|
|
21
21
|
import org.scalatest.junit.JUnitRunner
|
22
22
|
|
23
23
|
import scala.annotation.meta.getter
|
24
|
-
import scala.
|
24
|
+
import scala.jdk.CollectionConverters._
|
25
|
+
|
25
26
|
|
26
27
|
@RunWith(classOf[JUnitRunner])
|
27
28
|
class TestS3ParquetOutputPlugin
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
override protected def afterAll(): Unit = {
|
45
|
-
LOCALSTACK_DOCKER.stop()
|
46
|
-
super.afterAll()
|
47
|
-
}
|
48
|
-
|
49
|
-
@(Rule@getter)
|
50
|
-
val embulk: TestingEmbulk = TestingEmbulk.builder()
|
51
|
-
.registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
|
52
|
-
.build()
|
53
|
-
|
54
|
-
before {
|
55
|
-
DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
|
56
|
-
}
|
57
|
-
|
58
|
-
def defaultOutConfig(): ConfigSource = {
|
59
|
-
embulk.newConfig()
|
60
|
-
.set("type", "s3_parquet")
|
61
|
-
.set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
|
62
|
-
.set("bucket", BUCKET_NAME)
|
63
|
-
.set("path_prefix", "path/to/p")
|
64
|
-
.set("auth_method", "basic")
|
65
|
-
.set("access_key_id", TestUtils.TEST_ACCESS_KEY)
|
66
|
-
.set("secret_access_key", TestUtils.TEST_SECRET_KEY)
|
67
|
-
.set("path_style_access_enabled", true)
|
68
|
-
.set("default_timezone", "Asia/Tokyo")
|
69
|
-
}
|
70
|
-
|
71
|
-
|
72
|
-
test("first test") {
|
73
|
-
val inPath = toPath("in1.csv")
|
74
|
-
val outConfig = defaultOutConfig()
|
75
|
-
|
76
|
-
val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
|
77
|
-
|
78
|
-
|
79
|
-
val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
|
80
|
-
val b = tr.get(classOf[String], "bucket")
|
81
|
-
val k = tr.get(classOf[String], "key")
|
82
|
-
readParquetFile(b, k)
|
83
|
-
}.foldLeft(Seq[Map[String, String]]()) { (merged,
|
84
|
-
records) =>
|
85
|
-
merged ++ records
|
29
|
+
extends FunSuite
|
30
|
+
with BeforeAndAfter
|
31
|
+
with BeforeAndAfterAll
|
32
|
+
with DiagrammedAssertions
|
33
|
+
{
|
34
|
+
|
35
|
+
val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
|
36
|
+
val BUCKET_NAME: String = "my-bucket"
|
37
|
+
|
38
|
+
val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
|
39
|
+
|
40
|
+
override protected def beforeAll(): Unit =
|
41
|
+
{
|
42
|
+
Localstack.teardownInfrastructure()
|
43
|
+
LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
|
44
|
+
super.beforeAll()
|
86
45
|
}
|
87
46
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
47
|
+
override protected def afterAll(): Unit =
|
48
|
+
{
|
49
|
+
LOCALSTACK_DOCKER.stop()
|
50
|
+
super.afterAll()
|
51
|
+
}
|
92
52
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
val inData: String = inRecords(recordIndex)(columnIndex)
|
98
|
-
val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
|
53
|
+
@(Rule@getter)
|
54
|
+
val embulk: TestingEmbulk = TestingEmbulk.builder()
|
55
|
+
.registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
|
56
|
+
.build()
|
99
57
|
|
100
|
-
|
101
|
-
|
58
|
+
before {
|
59
|
+
DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
|
102
60
|
}
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
.build()
|
117
|
-
|
118
|
-
def read(reader: ParquetReader[SimpleRecord],
|
119
|
-
records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] = {
|
120
|
-
val simpleRecord: SimpleRecord = reader.read()
|
121
|
-
if (simpleRecord != null) {
|
122
|
-
val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
|
123
|
-
return read(reader, records :+ r)
|
124
|
-
}
|
125
|
-
records
|
61
|
+
|
62
|
+
def defaultOutConfig(): ConfigSource =
|
63
|
+
{
|
64
|
+
embulk.newConfig()
|
65
|
+
.set("type", "s3_parquet")
|
66
|
+
.set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
|
67
|
+
.set("bucket", BUCKET_NAME)
|
68
|
+
.set("path_prefix", "path/to/p")
|
69
|
+
.set("auth_method", "basic")
|
70
|
+
.set("access_key_id", TestUtils.TEST_ACCESS_KEY)
|
71
|
+
.set("secret_access_key", TestUtils.TEST_SECRET_KEY)
|
72
|
+
.set("path_style_access_enabled", true)
|
73
|
+
.set("default_timezone", "Asia/Tokyo")
|
126
74
|
}
|
127
75
|
|
128
|
-
try read(reader)
|
129
|
-
finally {
|
130
|
-
reader.close()
|
131
76
|
|
77
|
+
test("first test") {
|
78
|
+
val inPath = toPath("in1.csv")
|
79
|
+
val outConfig = defaultOutConfig()
|
80
|
+
|
81
|
+
val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
|
82
|
+
|
83
|
+
|
84
|
+
val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
|
85
|
+
val b = tr.get(classOf[String], "bucket")
|
86
|
+
val k = tr.get(classOf[String], "key")
|
87
|
+
readParquetFile(b, k)
|
88
|
+
}.foldLeft(Seq[Map[String, String]]()) { (merged,
|
89
|
+
records) =>
|
90
|
+
merged ++ records
|
91
|
+
}
|
92
|
+
|
93
|
+
val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
|
94
|
+
.stripLineEnd
|
95
|
+
.split("\n")
|
96
|
+
.map(record => record.split("\t").toSeq)
|
97
|
+
.toSeq
|
98
|
+
|
99
|
+
inRecords.zipWithIndex.foreach {
|
100
|
+
case (record, recordIndex) =>
|
101
|
+
0.to(5).foreach { columnIndex =>
|
102
|
+
val columnName = s"c$columnIndex"
|
103
|
+
val inData: String = inRecords(recordIndex)(columnIndex)
|
104
|
+
val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
|
105
|
+
|
106
|
+
assert(outData === inData, s"record: $recordIndex, column: $columnName")
|
107
|
+
}
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
def readParquetFile(bucket: String,
|
112
|
+
key: String): Seq[Map[String, String]] =
|
113
|
+
{
|
114
|
+
val xfer = TransferManagerBuilder.standard()
|
115
|
+
.withS3Client(DockerTestUtils.getClientS3)
|
116
|
+
.build()
|
117
|
+
val createdParquetFile = embulk.createTempFile("in")
|
118
|
+
try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
|
119
|
+
finally xfer.shutdownNow()
|
120
|
+
|
121
|
+
val reader: ParquetReader[SimpleRecord] = ParquetReader
|
122
|
+
.builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
|
123
|
+
.build()
|
124
|
+
|
125
|
+
def read(reader: ParquetReader[SimpleRecord],
|
126
|
+
records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
|
127
|
+
{
|
128
|
+
val simpleRecord: SimpleRecord = reader.read()
|
129
|
+
if (simpleRecord != null) {
|
130
|
+
val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
|
131
|
+
return read(reader, records :+ r)
|
132
|
+
}
|
133
|
+
records
|
134
|
+
}
|
135
|
+
|
136
|
+
try read(reader)
|
137
|
+
finally {
|
138
|
+
reader.close()
|
139
|
+
|
140
|
+
}
|
132
141
|
}
|
133
|
-
}
|
134
142
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
143
|
+
private def toPath(fileName: String) =
|
144
|
+
{
|
145
|
+
val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
|
146
|
+
FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
|
147
|
+
}
|
139
148
|
|
140
149
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-s3_parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Civitaspo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -46,7 +46,6 @@ extensions: []
|
|
46
46
|
extra_rdoc_files: []
|
47
47
|
files:
|
48
48
|
- ".gitignore"
|
49
|
-
- ".scalafmt.conf"
|
50
49
|
- CHANGELOG.md
|
51
50
|
- LICENSE.txt
|
52
51
|
- README.md
|
@@ -60,11 +59,13 @@ files:
|
|
60
59
|
- classpath/asm-3.1.jar
|
61
60
|
- classpath/asm-5.0.4.jar
|
62
61
|
- classpath/avro-1.7.7.jar
|
63
|
-
- classpath/aws-java-sdk-core-1.11.
|
64
|
-
- classpath/aws-java-sdk-
|
65
|
-
- classpath/aws-java-sdk-
|
66
|
-
- classpath/aws-java-sdk-
|
62
|
+
- classpath/aws-java-sdk-core-1.11.592.jar
|
63
|
+
- classpath/aws-java-sdk-glue-1.11.592.jar
|
64
|
+
- classpath/aws-java-sdk-kms-1.11.592.jar
|
65
|
+
- classpath/aws-java-sdk-s3-1.11.592.jar
|
66
|
+
- classpath/aws-java-sdk-sts-1.11.592.jar
|
67
67
|
- classpath/commons-beanutils-1.7.0.jar
|
68
|
+
- classpath/commons-cli-1.2.jar
|
68
69
|
- classpath/commons-codec-1.10.jar
|
69
70
|
- classpath/commons-collections-3.2.2.jar
|
70
71
|
- classpath/commons-compress-1.4.1.jar
|
@@ -79,7 +80,7 @@ files:
|
|
79
80
|
- classpath/curator-client-2.7.1.jar
|
80
81
|
- classpath/curator-framework-2.7.1.jar
|
81
82
|
- classpath/curator-recipes-2.7.1.jar
|
82
|
-
- classpath/embulk-output-s3_parquet-0.0.2.jar
|
83
|
+
- classpath/embulk-output-s3_parquet-0.0.3.jar
|
83
84
|
- classpath/gson-2.2.4.jar
|
84
85
|
- classpath/hadoop-annotations-2.9.2.jar
|
85
86
|
- classpath/hadoop-auth-2.9.2.jar
|
@@ -107,7 +108,7 @@ files:
|
|
107
108
|
- classpath/jetty-sslengine-6.1.26.jar
|
108
109
|
- classpath/jetty-util-6.1.26.jar
|
109
110
|
- classpath/jline-0.9.94.jar
|
110
|
-
- classpath/jmespath-java-1.11.
|
111
|
+
- classpath/jmespath-java-1.11.592.jar
|
111
112
|
- classpath/jsch-0.1.54.jar
|
112
113
|
- classpath/json-smart-2.3.jar
|
113
114
|
- classpath/jsp-api-2.1.jar
|
@@ -116,19 +117,19 @@ files:
|
|
116
117
|
- classpath/netty-3.7.0.Final.jar
|
117
118
|
- classpath/nimbus-jose-jwt-4.41.1.jar
|
118
119
|
- classpath/paranamer-2.3.jar
|
119
|
-
- classpath/parquet-column-1.10.
|
120
|
-
- classpath/parquet-common-1.10.
|
121
|
-
- classpath/parquet-encoding-1.10.
|
120
|
+
- classpath/parquet-column-1.10.1.jar
|
121
|
+
- classpath/parquet-common-1.10.1.jar
|
122
|
+
- classpath/parquet-encoding-1.10.1.jar
|
122
123
|
- classpath/parquet-format-2.4.0.jar
|
123
|
-
- classpath/parquet-hadoop-1.10.
|
124
|
-
- classpath/parquet-jackson-1.10.
|
124
|
+
- classpath/parquet-hadoop-1.10.1.jar
|
125
|
+
- classpath/parquet-jackson-1.10.1.jar
|
125
126
|
- classpath/protobuf-java-2.5.0.jar
|
126
|
-
- classpath/scala-library-2.
|
127
|
+
- classpath/scala-library-2.13.0.jar
|
127
128
|
- classpath/servlet-api-2.5-20081211.jar
|
128
129
|
- classpath/servlet-api-2.5.jar
|
129
130
|
- classpath/slf4j-api-1.7.25.jar
|
130
131
|
- classpath/slf4j-log4j12-1.7.25.jar
|
131
|
-
- classpath/snappy-java-1.1.7.
|
132
|
+
- classpath/snappy-java-1.1.7.3.jar
|
132
133
|
- classpath/stax-api-1.0-2.jar
|
133
134
|
- classpath/stax2-api-3.1.4.jar
|
134
135
|
- classpath/woodstox-core-5.0.3.jar
|
@@ -143,6 +144,7 @@ files:
|
|
143
144
|
- gradlew.bat
|
144
145
|
- lib/embulk/output/s3_parquet.rb
|
145
146
|
- settings.gradle
|
147
|
+
- src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
|
146
148
|
- src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
|
147
149
|
- src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala
|
148
150
|
- src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala
|
data/.scalafmt.conf
DELETED