embulk-output-s3_parquet 0.0.2 → 0.0.3

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala CHANGED
@@ -1,8 +1,8 @@
 package org.embulk.output.s3_parquet
 
 
-import java.io.{File, PrintWriter}
-import java.nio.file.{FileSystems, Path}
+import java.io.File
+import java.nio.file.FileSystems
 
 import cloud.localstack.{DockerTestUtils, Localstack, TestUtils}
 import cloud.localstack.docker.LocalstackDocker
@@ -21,120 +21,129 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, DiagrammedAssertions, FunSuite}
 import org.scalatest.junit.JUnitRunner
 
 import scala.annotation.meta.getter
-import scala.collection.JavaConverters._
+import scala.jdk.CollectionConverters._
+
 
 @RunWith(classOf[JUnitRunner])
 class TestS3ParquetOutputPlugin
-  extends FunSuite
-    with BeforeAndAfter
-    with BeforeAndAfterAll
-    with DiagrammedAssertions {
-
-  val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
-  val BUCKET_NAME: String = "my-bucket"
-
-  val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
-
-  override protected def beforeAll(): Unit = {
-    Localstack.teardownInfrastructure()
-    LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
-    super.beforeAll()
-  }
-
-  override protected def afterAll(): Unit = {
-    LOCALSTACK_DOCKER.stop()
-    super.afterAll()
-  }
-
-  @(Rule@getter)
-  val embulk: TestingEmbulk = TestingEmbulk.builder()
-    .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
-    .build()
-
-  before {
-    DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
-  }
-
-  def defaultOutConfig(): ConfigSource = {
-    embulk.newConfig()
-      .set("type", "s3_parquet")
-      .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
-      .set("bucket", BUCKET_NAME)
-      .set("path_prefix", "path/to/p")
-      .set("auth_method", "basic")
-      .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
-      .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
-      .set("path_style_access_enabled", true)
-      .set("default_timezone", "Asia/Tokyo")
-  }
-
-
-  test("first test") {
-    val inPath = toPath("in1.csv")
-    val outConfig = defaultOutConfig()
-
-    val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
-
-
-    val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
-      val b = tr.get(classOf[String], "bucket")
-      val k = tr.get(classOf[String], "key")
-      readParquetFile(b, k)
-    }.foldLeft(Seq[Map[String, String]]()) { (merged,
-                                              records) =>
-      merged ++ records
+    extends FunSuite
+        with BeforeAndAfter
+        with BeforeAndAfterAll
+        with DiagrammedAssertions
+{
+
+    val RESOURCE_NAME_PREFIX: String = "org/embulk/output/s3_parquet/"
+    val BUCKET_NAME: String = "my-bucket"
+
+    val LOCALSTACK_DOCKER: LocalstackDocker = LocalstackDocker.INSTANCE
+
+    override protected def beforeAll(): Unit =
+    {
+        Localstack.teardownInfrastructure()
+        LOCALSTACK_DOCKER.startup(LocalstackDockerConfiguration.DEFAULT)
+        super.beforeAll()
     }
 
-    val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
-      .stripLineEnd
-      .split("\n")
-      .map(record => record.split("\t").toSeq)
+    override protected def afterAll(): Unit =
+    {
+        LOCALSTACK_DOCKER.stop()
+        super.afterAll()
+    }
 
-    inRecords.zipWithIndex.foreach {
-      case (record, recordIndex) =>
-        0.to(5).foreach { columnIndex =>
-          val columnName = s"c$columnIndex"
-          val inData: String = inRecords(recordIndex)(columnIndex)
-          val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+    @(Rule@getter)
+    val embulk: TestingEmbulk = TestingEmbulk.builder()
+        .registerPlugin(classOf[OutputPlugin], "s3_parquet", classOf[S3ParquetOutputPlugin])
+        .build()
 
-          assert(outData === inData, s"record: $recordIndex, column: $columnName")
-        }
+    before {
+        DockerTestUtils.getClientS3.createBucket(BUCKET_NAME)
     }
-  }
-
-  def readParquetFile(bucket: String,
-                      key: String): Seq[Map[String, String]] = {
-    val xfer = TransferManagerBuilder.standard()
-      .withS3Client(DockerTestUtils.getClientS3)
-      .build()
-    val createdParquetFile = embulk.createTempFile("in")
-    try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
-    finally xfer.shutdownNow()
-
-    val reader: ParquetReader[SimpleRecord] = ParquetReader
-      .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
-      .build()
-
-    def read(reader: ParquetReader[SimpleRecord],
-             records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] = {
-      val simpleRecord: SimpleRecord = reader.read()
-      if (simpleRecord != null) {
-        val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
-        return read(reader, records :+ r)
-      }
-      records
+
+    def defaultOutConfig(): ConfigSource =
+    {
+        embulk.newConfig()
+            .set("type", "s3_parquet")
+            .set("endpoint", "http://localhost:4572") // See https://github.com/localstack/localstack#overview
+            .set("bucket", BUCKET_NAME)
+            .set("path_prefix", "path/to/p")
+            .set("auth_method", "basic")
+            .set("access_key_id", TestUtils.TEST_ACCESS_KEY)
+            .set("secret_access_key", TestUtils.TEST_SECRET_KEY)
+            .set("path_style_access_enabled", true)
+            .set("default_timezone", "Asia/Tokyo")
     }
 
-    try read(reader)
-    finally {
-      reader.close()
 
+    test("first test") {
+        val inPath = toPath("in1.csv")
+        val outConfig = defaultOutConfig()
+
+        val result: TestingEmbulk.RunResult = embulk.runOutput(outConfig, inPath)
+
+
+        val outRecords: Seq[Map[String, String]] = result.getOutputTaskReports.asScala.map { tr =>
+            val b = tr.get(classOf[String], "bucket")
+            val k = tr.get(classOf[String], "key")
+            readParquetFile(b, k)
+        }.foldLeft(Seq[Map[String, String]]()) { (merged,
+                                                  records) =>
+            merged ++ records
+        }
+
+        val inRecords: Seq[Seq[String]] = EmbulkTests.readResource(RESOURCE_NAME_PREFIX + "out1.tsv")
+            .stripLineEnd
+            .split("\n")
+            .map(record => record.split("\t").toSeq)
+            .toSeq
+
+        inRecords.zipWithIndex.foreach {
+            case (record, recordIndex) =>
+                0.to(5).foreach { columnIndex =>
+                    val columnName = s"c$columnIndex"
+                    val inData: String = inRecords(recordIndex)(columnIndex)
+                    val outData: String = outRecords(recordIndex).getOrElse(columnName, "")
+
+                    assert(outData === inData, s"record: $recordIndex, column: $columnName")
+                }
+        }
+    }
+
+    def readParquetFile(bucket: String,
+                        key: String): Seq[Map[String, String]] =
+    {
+        val xfer = TransferManagerBuilder.standard()
+            .withS3Client(DockerTestUtils.getClientS3)
+            .build()
+        val createdParquetFile = embulk.createTempFile("in")
+        try xfer.download(bucket, key, createdParquetFile.toFile).waitForCompletion()
+        finally xfer.shutdownNow()
+
+        val reader: ParquetReader[SimpleRecord] = ParquetReader
+            .builder(new SimpleReadSupport(), new HadoopPath(createdParquetFile.toString))
+            .build()
+
+        def read(reader: ParquetReader[SimpleRecord],
+                 records: Seq[Map[String, String]] = Seq()): Seq[Map[String, String]] =
+        {
+            val simpleRecord: SimpleRecord = reader.read()
+            if (simpleRecord != null) {
+                val r: Map[String, String] = simpleRecord.getValues.asScala.map(v => v.getName -> v.getValue.toString).toMap
+                return read(reader, records :+ r)
+            }
+            records
+        }
+
+        try read(reader)
+        finally {
+            reader.close()
+
+        }
     }
-  }
 
-  private def toPath(fileName: String) = {
-    val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
-    FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
-  }
+    private def toPath(fileName: String) =
+    {
+        val url = Resources.getResource(RESOURCE_NAME_PREFIX + fileName)
+        FileSystems.getDefault.getPath(new File(url.toURI).getAbsolutePath)
+    }
 
 }
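
The source changes above are the Scala 2.13 migration implied by the scala-library bump in the metadata below (2.12.8 → 2.13.0): `scala.collection.JavaConverters._` is deprecated in 2.13 in favor of `scala.jdk.CollectionConverters._`, and the new `.toSeq` after the `.map(...)` is needed because 2.13 makes the default `Seq` alias `scala.collection.immutable.Seq`, which an `Array` result no longer satisfies implicitly. A minimal standalone sketch of both points (not the plugin's code; the sample data is invented):

```scala
// Scala 2.13: replaces the deprecated scala.collection.JavaConverters._
import scala.jdk.CollectionConverters._

object CollectionConverters213 extends App {
  // asScala keeps the same call shape across the rename
  val javaList: java.util.List[String] = java.util.Arrays.asList("a", "b")
  val buffer: scala.collection.mutable.Buffer[String] = javaList.asScala

  // In 2.13, Seq means immutable.Seq and String#split returns an Array,
  // so an explicit .toSeq is required where 2.12 widened implicitly.
  val records: Seq[Seq[String]] = "a\tb\nc\td"
    .split("\n")
    .map(_.split("\t").toSeq)
    .toSeq

  println((buffer, records))
}
```
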
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-s3_parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-01-21 00:00:00.000000000 Z
+date: 2019-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -46,7 +46,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".scalafmt.conf"
 - CHANGELOG.md
 - LICENSE.txt
 - README.md
@@ -60,11 +59,13 @@ files:
 - classpath/asm-3.1.jar
 - classpath/asm-5.0.4.jar
 - classpath/avro-1.7.7.jar
-- classpath/aws-java-sdk-core-1.11.479.jar
-- classpath/aws-java-sdk-kms-1.11.479.jar
-- classpath/aws-java-sdk-s3-1.11.479.jar
-- classpath/aws-java-sdk-sts-1.11.479.jar
+- classpath/aws-java-sdk-core-1.11.592.jar
+- classpath/aws-java-sdk-glue-1.11.592.jar
+- classpath/aws-java-sdk-kms-1.11.592.jar
+- classpath/aws-java-sdk-s3-1.11.592.jar
+- classpath/aws-java-sdk-sts-1.11.592.jar
 - classpath/commons-beanutils-1.7.0.jar
+- classpath/commons-cli-1.2.jar
 - classpath/commons-codec-1.10.jar
 - classpath/commons-collections-3.2.2.jar
 - classpath/commons-compress-1.4.1.jar
@@ -79,7 +80,7 @@ files:
 - classpath/curator-client-2.7.1.jar
 - classpath/curator-framework-2.7.1.jar
 - classpath/curator-recipes-2.7.1.jar
-- classpath/embulk-output-s3_parquet-0.0.2.jar
+- classpath/embulk-output-s3_parquet-0.0.3.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.9.2.jar
 - classpath/hadoop-auth-2.9.2.jar
@@ -107,7 +108,7 @@ files:
 - classpath/jetty-sslengine-6.1.26.jar
 - classpath/jetty-util-6.1.26.jar
 - classpath/jline-0.9.94.jar
-- classpath/jmespath-java-1.11.479.jar
+- classpath/jmespath-java-1.11.592.jar
 - classpath/jsch-0.1.54.jar
 - classpath/json-smart-2.3.jar
 - classpath/jsp-api-2.1.jar
@@ -116,19 +117,19 @@ files:
 - classpath/netty-3.7.0.Final.jar
 - classpath/nimbus-jose-jwt-4.41.1.jar
 - classpath/paranamer-2.3.jar
-- classpath/parquet-column-1.10.0.jar
-- classpath/parquet-common-1.10.0.jar
-- classpath/parquet-encoding-1.10.0.jar
+- classpath/parquet-column-1.10.1.jar
+- classpath/parquet-common-1.10.1.jar
+- classpath/parquet-encoding-1.10.1.jar
 - classpath/parquet-format-2.4.0.jar
-- classpath/parquet-hadoop-1.10.0.jar
-- classpath/parquet-jackson-1.10.0.jar
+- classpath/parquet-hadoop-1.10.1.jar
+- classpath/parquet-jackson-1.10.1.jar
 - classpath/protobuf-java-2.5.0.jar
-- classpath/scala-library-2.12.8.jar
+- classpath/scala-library-2.13.0.jar
 - classpath/servlet-api-2.5-20081211.jar
 - classpath/servlet-api-2.5.jar
 - classpath/slf4j-api-1.7.25.jar
 - classpath/slf4j-log4j12-1.7.25.jar
-- classpath/snappy-java-1.1.7.2.jar
+- classpath/snappy-java-1.1.7.3.jar
 - classpath/stax-api-1.0-2.jar
 - classpath/stax2-api-3.1.4.jar
 - classpath/woodstox-core-5.0.3.jar
@@ -143,6 +144,7 @@ files:
 - gradlew.bat
 - lib/embulk/output/s3_parquet.rb
 - settings.gradle
+- src/main/scala/org/embulk/output/s3_parquet/CatalogRegistrator.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala
 - src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala
 - src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala
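
Besides the dependency bumps (AWS SDK 1.11.479 → 1.11.592, Parquet 1.10.0 → 1.10.1, snappy-java 1.1.7.2 → 1.1.7.3), the notable addition in 0.0.3 is `CatalogRegistrator.scala` together with the bundled `aws-java-sdk-glue`, which suggests the new version can register the written Parquet files as a table in the AWS Glue Data Catalog. The plugin's actual interface is not part of this diff; the following is a hypothetical sketch of such a registration using AWS SDK for Java v1 (database, table, column, and S3 location are invented placeholders):

```scala
// Hypothetical sketch of Glue Data Catalog registration with AWS SDK for Java v1;
// not the plugin's actual CatalogRegistrator implementation.
import com.amazonaws.services.glue.AWSGlueClientBuilder
import com.amazonaws.services.glue.model.{Column, CreateTableRequest, SerDeInfo, StorageDescriptor, TableInput}

object GlueRegistrationSketch extends App {
  val glue = AWSGlueClientBuilder.standard().build()

  // Describe where the Parquet files live and how to read them (Hive Parquet SerDe).
  val storage = new StorageDescriptor()
    .withColumns(new Column().withName("c0").withType("string")) // placeholder column
    .withLocation("s3://my-bucket/path/to/")
    .withInputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
    .withOutputFormat("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")
    .withSerdeInfo(new SerDeInfo()
      .withSerializationLibrary("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))

  // Register the table in a Glue database (placeholder names).
  glue.createTable(new CreateTableRequest()
    .withDatabaseName("my_database")
    .withTableInput(new TableInput()
      .withName("my_table")
      .withStorageDescriptor(storage)))
}
```
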
.scalafmt.conf DELETED
@@ -1,9 +0,0 @@
-# https://scalameta.org/scalafmt/#Configuration
-
-style = IntelliJ
-maxColumn = 160
-align = none
-newlines.penalizeSingleSelectMultiArgList = false
-newlines.alwaysBeforeElseAfterCurlyIf = true
-newlines.alwaysBeforeTopLevelStatements = true
-