embulk-output-s3_parquet 0.0.2 → 0.0.3

src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala CHANGED
@@ -1,8 +1,8 @@
 package org.embulk.output.s3_parquet
 
 
-import java.io.{File, PrintWriter}
-import java.nio.file.{FileSystems, Path}
+import java.io.File
+import java.nio.file.FileSystems
 
 import cloud.localstack.{DockerTestUtils, Localstack, TestUtils}
 import cloud.localstack.docker.LocalstackDocker
@@ -21,120 +21,129 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, F
 import org.scalatest.junit.JUnitRunner
 
 import scala.annotation.meta.getter
-import scala.collection.JavaConverters._
+import scala.jdk.CollectionConverters._
+
 
 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
-  extends FunSuite
-    with BeforeAndAfter
-    with BeforeAndAfterAll
-    with DiagrammedAssertions {
-
-  val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
-  val BUCKET_NAME: String = "my-bucket"
-
-  val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
-
-  override protected def beforeAll(): Unit = {
-    Localstack.teardownInfrastructure()
-    LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
-    super.beforeAll()
-  }
-
-  override protected def afterAll(): Unit = {
-    LOCALSTACK_DOCKER.stop()
-    super.afterAll()
-  }
-
-  @(Rule@getter)
-  val embulk: TestingEmbulk = TestingEmbulk.builder()
-    .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
-    .build()
-
-  before {
-    DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
-  }
-
-  def defaultOutConfig(): ConfigSource = {
-    embulk.newConfig()
-      .set("type", "s3_parquet")
-      .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
-      .set("bucket", BUCKET_NAME)
-      .set("path_prefix", "path/to/p")
-      .set("auth_method", "basic")
-      .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
-      .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
-      .set("path_style_access_enabled", true)
-      .set("default_timezone", "Asia/Tokyo")
-  }
-
-
-  test("first test") {
-    val inPath = toPath("in1.csv")
-    val outConfig = defaultOutConfig()
-
-    val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
-
-
-    val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
-      val b = tr.get(classOf[String], "bucket")
-      val k = tr.get(classOf[String], "key")
-      readParquetFile(b, k)
-    }.foldLeft(Seq[Map[String, String]]()) { (merged,
-                                              records) =>
-      merged ++ records
+    extends FunSuite
+        with BeforeAndAfter
+        with BeforeAndAfterAll
+        with DiagrammedAssertions
+{
+
+    val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+    val BUCKET_NAME: String = "my-bucket"
+
+    val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
+
+    override protected def beforeAll(): Unit =
+    {
+        Localstack.teardownInfrastructure()
+        LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
+        super.beforeAll()
     }
 
-    val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-      .stripLineEnd
-      .split("\n")
-      .map(record => record.split("\t").toSeq)
+    override protected def afterAll(): Unit =
+    {
+        LOCALSTACK_DOCKER.stop()
+        super.afterAll()
+    }
 
-    inRecords.zipWithIndex.foreach {
-      case (record, recordIndex) =>
-        0.to(5).foreach { columnIndex =>
-          val columnName = s"c$columnIndex"
-          val inData: String = inRecords(recordIndex)(columnIndex)
-          val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+    @(Rule@getter)
+    val embulk: TestingEmbulk = TestingEmbulk.builder()
+        .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
+        .build()
 
-          assert(outData === inData, s"record: $recordIndex, column: $columnName")
-        }
+    before {
+        DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
     }
-  }
-
-  def readParquetFile(bucket: String,
-                      key: String): Seq[Map[String, String]] = {
-    val xfer = TransferManagerBuilder.standard()
-      .withS3Client(DockerTestUtils.getClientS3)
-      .build()
-    val createdParquetFile = embulk.createTempFile("in")
-    try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-    finally xfer.shutdownNow()
-
-    val reader: ParquetReader[SimpleRecord] = ParquetReader
-      .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-      .build()
-
-    def read(reader: ParquetReader[SimpleRecord],
-             records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] = {
-      val simpleRecord: SimpleRecord = reader.read()
-      if (simpleRecord != null) {
-        val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-        return read(reader, records :+ r)
-      }
-      records
+
+    def defaultOutConfig(): ConfigSource =
+    {
+        embulk.newConfig()
+            .set("type", "s3_parquet")
+            .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+            .set("bucket", BUCKET_NAME)
+            .set("path_prefix", "path/to/p")
+            .set("auth_method", "basic")
+            .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
+            .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
+            .set("path_style_access_enabled", true)
+            .set("default_timezone", "Asia/Tokyo")
    }
 
-    try read(reader)
-    finally {
-      reader.close()
 
+    test("first test") {
+        val inPath = toPath("in1.csv")
+        val outConfig = defaultOutConfig()
+
+        val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+
+        val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
+            val b = tr.get(classOf[String], "bucket")
+            val k = tr.get(classOf[String], "key")
+            readParquetFile(b, k)
+        }.foldLeft(Seq[Map[String, String]]()) { (merged,
+                                                  records) =>
+            merged ++ records
+        }
+
+        val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+            .stripLineEnd
+            .split("\n")
+            .map(record => record.split("\t").toSeq)
+            .toSeq
+
+        inRecords.zipWithIndex.foreach {
+            case (record, recordIndex) =>
+                0.to(5).foreach { columnIndex =>
+                    val columnName = s"c$columnIndex"
+                    val inData: String = inRecords(recordIndex)(columnIndex)
+                    val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+
+                    assert(outData === inData, s"record: $recordIndex, column: $columnName")
+                }
+        }
+    }
+
+    def readParquetFile(bucket: String,
+                        key: String): Seq[Map[String, String]] =
+    {
+        val xfer = TransferManagerBuilder.standard()
+            .withS3Client(DockerTestUtils.getClientS3)
+            .build()
+        val createdParquetFile = embulk.createTempFile("in")
+        try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
+        finally xfer.shutdownNow()
+
+        val reader: ParquetReader[SimpleRecord] = ParquetReader
+            .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
+            .build()
+
+        def read(reader: ParquetReader[SimpleRecord],
+                 records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
+        {
+            val simpleRecord: SimpleRecord = reader.read()
+            if (simpleRecord != null) {
+                val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
+                return read(reader, records :+ r)
+            }
+            records
+        }
+
+        try read(reader)
+        finally {
+            reader.close()
+
+        }
     }
-  }
 
-  private def toPath(fileName: String) = {
-    val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
-    FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
-  }
+    private def toPath(fileName: String) =
+    {
+        val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+        FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+    }
 
 }
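
Note on the Scala changes above: the import swap from scala.collection.JavaConverters._ to scala.jdk.CollectionConverters._ and the added .toSeq track the Scala 2.12 → 2.13 upgrade recorded in the metadata below (scala-library-2.13.0.jar). In 2.13, JavaConverters is deprecated in favour of scala.jdk.CollectionConverters, which provides the same .asScala/.asJava extension methods, and the default Seq alias now points at scala.collection.immutable.Seq, so the Array[Seq[String]] produced by split no longer conforms to Seq[Seq[String]] without an explicit conversion. A minimal standalone sketch of both changes (illustrative only, not taken from the plugin):

import scala.jdk.CollectionConverters._ // Scala 2.13 home of .asScala / .asJava

object ConvertersSketch extends App {
  val javaList: java.util.List[String] = java.util.Arrays.asList("a\tb", "c\td")

  // .asScala behaves exactly as under scala.collection.JavaConverters;
  // only the import location changed. .toSeq is needed because Array does
  // not conform to the (now immutable) scala.Seq in 2.13.
  val rows: Seq[Seq[String]] = javaList.asScala
    .map(line => line.split("\t").toSeq)
    .toSeq

  assert(rows == Seq(Seq("a", "b"), Seq("c", "d")))
}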
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-s3_parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-01-21 00:00:00.000000000 Z
+date: 2019-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -46,7 +46,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".scalafmt.conf"
 - CHANGELOG.md
 - LICENSE.txt
 - README.md
@@ -60,11 +59,13 @@ files:
 - classpath/asm-3.1.jar
 - classpath/asm-5.0.4.jar
 - classpath/avro-1.7.7.jar
-- classpath/aws-java-sdk-core-1.11.479.jar
-- classpath/aws-java-sdk-kms-1.11.479.jar
-- classpath/aws-java-sdk-s3-1.11.479.jar
-- classpath/aws-java-sdk-sts-1.11.479.jar
+- classpath/aws-java-sdk-core-1.11.592.jar
+- classpath/aws-java-sdk-glue-1.11.592.jar
+- classpath/aws-java-sdk-kms-1.11.592.jar
+- classpath/aws-java-sdk-s3-1.11.592.jar
+- classpath/aws-java-sdk-sts-1.11.592.jar
 - classpath/commons-beanutils-1.7.0.jar
+- classpath/commons-cli-1.2.jar
 - classpath/commons-codec-1.10.jar
 - classpath/commons-collections-3.2.2.jar
 - classpath/commons-compress-1.4.1.jar
@@ -79,7 +80,7 @@ files:
 - classpath/curator-client-2.7.1.jar
 - classpath/curator-framework-2.7.1.jar
 - classpath/curator-recipes-2.7.1.jar
-- classpath/embulk-output-s3_parquet-0.0.2.jar
+- classpath/embulk-output-s3_parquet-0.0.3.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.9.2.jar
 - classpath/hadoop-auth-2.9.2.jar
@@ -107,7 +108,7 @@ files:
 - classpath/jetty-sslengine-6.1.26.jar
 - classpath/jetty-util-6.1.26.jar
 - classpath/jline-0.9.94.jar
-- classpath/jmespath-java-1.11.479.jar
+- classpath/jmespath-java-1.11.592.jar
 - classpath/jsch-0.1.54.jar
 - classpath/json-smart-2.3.jar
 - classpath/jsp-api-2.1.jar
@@ -116,19 +117,19 @@ files:
 - classpath/netty-3.7.0.Final.jar
 - classpath/nimbus-jose-jwt-4.41.1.jar
 - classpath/paranamer-2.3.jar
-- classpath/parquet-column-1.10.0.jar
-- classpath/parquet-common-1.10.0.jar
-- classpath/parquet-encoding-1.10.0.jar
+- classpath/parquet-column-1.10.1.jar
+- classpath/parquet-common-1.10.1.jar
+- classpath/parquet-encoding-1.10.1.jar
 - classpath/parquet-format-2.4.0.jar
-- classpath/parquet-hadoop-1.10.0.jar
-- classpath/parquet-jackson-1.10.0.jar
+- classpath/parquet-hadoop-1.10.1.jar
+- classpath/parquet-jackson-1.10.1.jar
 - classpath/protobuf-java-2.5.0.jar
-- classpath/scala-library-2.12.8.jar
+- classpath/scala-library-2.13.0.jar
 - classpath/servlet-api-2.5-20081211.jar
 - classpath/servlet-api-2.5.jar
 - classpath/slf4j-api-1.7.25.jar
 - classpath/slf4j-log4j12-1.7.25.jar
-- classpath/snappy-java-1.1.7.2.jar
+- classpath/snappy-java-1.1.7.3.jar
 - classpath/stax-api-1.0-2.jar
 - classpath/stax2-api-3.1.4.jar
 - classpath/woodstox-core-5.0.3.jar
@@ -143,6 +144,7 @@ files:
 - gradlew.bat
 - lib/embulk/output/s3_parquet.rb
 - settings.gradle
+- src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala
 - src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala
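
Beyond the lockstep version bumps visible above (aws-java-sdk 1.11.479 → 1.11.592, parquet-* 1.10.0 → 1.10.1, scala-library 2.12.8 → 2.13.0, snappy-java 1.1.7.2 → 1.1.7.3), the file list gains aws-java-sdk-glue-1.11.592.jar and a new source file, CatalogRegistrator.scala, suggesting 0.0.3 can register the written Parquet files as a table in the AWS Glue Data Catalog. The plugin's actual registration code is not part of this diff; purely to illustrate what the new Glue dependency provides, a hypothetical sketch (database, table, column, and location names are made up):

import com.amazonaws.services.glue.AWSGlueClientBuilder
import com.amazonaws.services.glue.model.{Column, CreateTableRequest, SerDeInfo, StorageDescriptor, TableInput}

object GlueRegistrationSketch extends App {
  // Uses the default credential/region provider chain.
  val glue = AWSGlueClientBuilder.standard().build()

  // Register an external table pointing at already-written Parquet files.
  glue.createTable(new CreateTableRequest()
    .withDatabaseName("my_database") // hypothetical
    .withTableInput(new TableInput()
      .withName("my_table") // hypothetical
      .withTableType("EXTERNAL_TABLE")
      .withStorageDescriptor(new StorageDescriptor()
        .withLocation("s3://my-bucket/path/to/") // hypothetical
        .withColumns(new Column().withName("c0").withType("string"))
        .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
        .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
        .withSerdeInfo(new SerDeInfo()
          .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")))))
}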
.scalafmt.conf DELETED
@@ -1,9 +0,0 @@
-# https://scalameta.org/scalafmt/#Configuration
-
-style = IntelliJ
-maxColumn = 160
-align = none
-newlines.penalizeSingleSelectMultiArgList = false
-newlines.alwaysBeforeElseAfterCurlyIf = true
-newlines.alwaysBeforeTopLevelStatements = true
-