embulk-output-orc 0.3.0 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,57 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.io.IOException
4
+ import java.nio.file.{Files, Paths}
5
+
6
+ import com.amazonaws.auth.profile.ProfileCredentialsProvider
7
+ import com.amazonaws.services.s3.AmazonS3Client
8
+ import com.amazonaws.services.s3.model.DeleteObjectRequest
9
+
10
+ import scala.beans.BeanProperty
11
+
12
+ object OrcOutputPluginHelper {
13
+ def removeOldFile(fpath: String, task: PluginTask): Unit = {
14
+ // NOTE: Delete the old file only when it is on the local filesystem or S3 (s3/s3n/s3a); other schemes such as HDFS are left untouched.
15
+ val schema = getSchema(fpath)
16
+ if (isDeleteTarget(schema)) schema match {
17
+ case "file" =>
18
+ try Files.deleteIfExists(Paths.get(fpath))
19
+ catch {
20
+ case e: IOException => throw e
21
+ }
22
+ case "s3" | "s3n" | "s3a" =>
23
+ val s3Url = parseS3Url(fpath)
24
+ val s3client = new AmazonS3Client(new ProfileCredentialsProvider)
25
+ if (task.getEndpoint.isPresent) s3client.setEndpoint(task.getEndpoint.get)
26
+ s3client.deleteObject(new DeleteObjectRequest(s3Url.bucket, s3Url.key))
27
+ case _ =>
28
+ }
29
+ }
30
+
31
+ def isDeleteTarget(schema: String): Boolean = schema match {
32
+ case "file" => true
33
+ case "s3" | "s3n" | "s3a" => true
34
+ case _ => false
35
+ }
36
+
37
+ def getSchema(fpath: String): String = {
38
+ val schema = fpath.split("://").toList.head
39
+ schema match {
40
+ case "s3" | "s3a" | "s3n" => schema
41
+ case _ => {
42
+ val path = Paths.get(fpath)
43
+ path.getFileSystem.provider.getScheme
44
+ }
45
+ }
46
+ }
47
+
48
+ def parseS3Url(s3url: String): AmazonS3URILikeObject = {
49
+ val parts = s3url.split("(://|/)").toList
50
+ val bucket = parts.apply(1)
51
+ val key = parts.slice(2, parts.size).mkString("/")
52
+ OrcOutputPluginHelper.AmazonS3URILikeObject(bucket, key)
53
+ }
54
+
55
+ case class AmazonS3URILikeObject(@BeanProperty bucket: String, @BeanProperty key: String)
56
+
57
+ }
@@ -0,0 +1,52 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.io.IOException
4
+
5
+ import org.apache.orc.Writer
6
+ import org.embulk.config.TaskReport
7
+ import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
8
+
9
+ class OrcTransactionalPageOutput(val reader: PageReader, val writer: Writer, val task: PluginTask) extends TransactionalPageOutput {
10
+ override def add(page: Page): Unit = synchronized {
11
+ try {
12
+ // int size = page.getStringReferences().size();
13
+ val schema = OrcOutputPlugin.getSchema(reader.getSchema)
14
+ val batch = schema.createRowBatch
15
+ // batch.size = size;
16
+ reader.setPage(page)
17
+ while ( {
18
+ reader.nextRecord
19
+ }) {
20
+ val row = {
21
+ batch.size += 1;
22
+ batch.size - 1
23
+ }
24
+ reader.getSchema.visitColumns(new OrcColumnVisitor(reader, batch, row))
25
+ if (batch.size >= batch.getMaxSize) {
26
+ writer.addRowBatch(batch)
27
+ batch.reset()
28
+ }
29
+ }
30
+ if (batch.size != 0) {
31
+ writer.addRowBatch(batch)
32
+ batch.reset()
33
+ }
34
+ } catch {
35
+ case e: IOException =>
36
+ e.printStackTrace()
37
+ }
38
+ }
39
+
40
+ override def finish(): Unit = {
41
+ try writer.close()
42
+ catch {
43
+ case e: IOException => throw e
44
+ }
45
+ }
46
+
47
+ override def close(): Unit = {}
48
+
49
+ override def abort(): Unit = {}
50
+
51
+ override def commit: TaskReport = Exec.newTaskReport
52
+ }
@@ -0,0 +1,56 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.util
4
+
5
+ import com.google.common.base.Optional
6
+ import org.embulk.config.{Config, ConfigDefault, Task}
7
+ import org.embulk.spi.time.TimestampFormatter
8
+ import org.embulk.util.aws.credentials.AwsCredentialsTask
9
+ import org.joda.time.DateTimeZone
10
+
11
+ trait PluginTask extends Task with TimestampFormatter.Task with AwsCredentialsTask {
12
+ @Config("path_prefix")
13
+ def getPathPrefix: String
14
+
15
+ @Config("file_ext")
16
+ @ConfigDefault("\".orc\"")
17
+ def getFileNameExtension: String
18
+
19
+ @Config("column_options")
20
+ @ConfigDefault("{}")
21
+ def getColumnOptions: util.Map[String, TimestampColumnOption]
22
+
23
+ @Config("sequence_format")
24
+ @ConfigDefault("\".%03d\"")
25
+ def getSequenceFormat: String
26
+
27
+ // see: https://orc.apache.org/docs/hive-config.html
28
+ // ORC File options
29
+ @Config("strip_size")
30
+ @ConfigDefault("67108864") // 64MB
31
+ def getStripSize: Integer
32
+
33
+ @Config("buffer_size")
34
+ @ConfigDefault("262144") // 256KB
35
+ def getBufferSize: Integer
36
+
37
+ @Config("block_size")
38
+ @ConfigDefault("268435456") // 256MB
39
+ def getBlockSize: Integer
40
+
41
+ @Config("compression_kind")
42
+ @ConfigDefault("ZLIB")
43
+ def getCompressionKind: String
44
+
45
+ @Config("overwrite")
46
+ @ConfigDefault("false")
47
+ def getOverwrite: Boolean
48
+
49
+ @Config("default_from_timezone")
50
+ @ConfigDefault("\"UTC\"")
51
+ def getDefaultFromTimeZone: DateTimeZone
52
+
53
+ @Config("endpoint")
54
+ @ConfigDefault("null")
55
+ def getEndpoint: Optional[String]
56
+ }
@@ -0,0 +1,32 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.util
4
+
5
+ import com.google.common.base.Optional
6
+ import org.embulk.config.{Config, ConfigDefault, Task}
7
+ import org.embulk.spi.time.TimestampFormatter
8
+ import org.joda.time.DateTimeZone
9
+
10
+ /*
11
+ public interface TimestampColumnOption
12
+ extends Task, TimestampFormatter.TimestampColumnOption
13
+ {
14
+ @Config("from_timezone")
15
+ @ConfigDefault("null")
16
+ Optional<DateTimeZone> getFromTimeZone();
17
+
18
+ @Config("from_format")
19
+ @ConfigDefault("null")
20
+ Optional<List<String>> getFromFormat();
21
+ }
22
+ */
23
+
24
+ trait TimestampColumnOption extends Task with TimestampFormatter.TimestampColumnOption {
25
+ @Config("from_timezone")
26
+ @ConfigDefault("null")
27
+ def getFromTimeZone: Optional[DateTimeZone]
28
+
29
+ @Config("from_format")
30
+ @ConfigDefault("null")
31
+ def getFromFormat: Optional[util.List[String]]
32
+ }
@@ -0,0 +1,71 @@
1
+ package org.embulk.output.orc;
2
+
3
+ import org.testng.annotations.DataProvider;
4
+ import org.testng.annotations.Test;
5
+
6
+ import static org.hamcrest.MatcherAssert.assertThat;
7
+ import static org.hamcrest.core.Is.is;
8
+
9
+ public class OrcOutputPluginHelperTest
10
+ {
11
+ @DataProvider(name = "url-provider")
12
+ public Object[][] dataProvider()
13
+ {
14
+ return new Object[][] {
15
+ {"file://tmp/output.orc", "file"},
16
+ {"/tmp/output.000.orc", "file"},
17
+ {"s3n://embulk-test/output.0001.orc", "s3n"},
18
+ {"s3a://embulk-test/output.0001.orc", "s3a"},
19
+ {"s3://embulk-test/output.0001.orc", "s3"},
20
+ };
21
+ }
22
+
23
+ @Test(dataProvider = "url-provider")
24
+ public void getFPathTest(String file, String expect)
25
+ {
26
+ String schema = OrcOutputPluginHelper.getSchema(file);
27
+ assertThat(schema, is(expect));
28
+ }
29
+
30
+ @DataProvider(name = "schema-provider")
31
+ public Object[][] schemaProvider()
32
+ {
33
+ return new Object[][] {
34
+ {"file", true},
35
+ {"s3", true},
36
+ {"s3n", true},
37
+ {"s3a", true},
38
+ {"hdfs", false},
39
+ };
40
+ }
41
+
42
+ @Test(dataProvider = "schema-provider")
43
+ public void isDeleteTargetTest(String schema, boolean expect)
44
+ {
45
+ boolean result = OrcOutputPluginHelper.isDeleteTarget(schema);
46
+ assertThat(result, is(expect));
47
+ }
48
+
49
+ @DataProvider(name = "parserTest-provider")
50
+ public Object[][] parserTestProvider()
51
+ {
52
+ String baseurl = "demo-bucket/test/output.000.orc";
53
+ String bucket = "demo-bucket";
54
+ String keyname = "test/output.000.orc";
55
+
56
+ return new Object[][] {
57
+ {"s3://" + baseurl, bucket, keyname},
58
+ {"s3a://" + baseurl, bucket, keyname},
59
+ {"s3n://" + baseurl, bucket, keyname},
60
+ };
61
+ }
62
+
63
+ @Test(dataProvider = "parserTest-provider")
64
+ public void parseS3UrlTest(String url, String bucket, String key)
65
+ {
66
+ OrcOutputPluginHelper.AmazonS3URILikeObject parts =
67
+ OrcOutputPluginHelper.parseS3Url(url);
68
+ assertThat(parts.getBucket(), is(bucket));
69
+ assertThat(parts.getKey(), is(key));
70
+ }
71
+ }
@@ -0,0 +1,25 @@
1
+ ---
2
+ in:
3
+ type: randomj
4
+ rows: 8
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
9
+ - {name: myid, type: long}
10
+ - {name: named, type: string, null_rate: 10000}
11
+ - {name: x_flag, type: boolean, null_rate: 10000}
12
+ - {name: pit_rate, type: double, null_rate: 10000}
13
+ - {name: score, type: long, null_rate: 10000}
14
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S', null_rate: 10000}
15
+ - {name: purchase, type: timestamp, format: '%Y/%m/%d', null_rate: 10000}
16
+
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
+
21
+ out:
22
+ type: orc
23
+ overwrite: true
24
+ path_prefix: "/tmp/output"
25
+ compression_kind: ZLIB
@@ -0,0 +1,25 @@
1
+ ---
2
+ in:
3
+ type: randomj
4
+ rows: 1024
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
9
+ - {name: myid, type: long}
10
+ - {name: named, type: string, null_rate: 1000}
11
+ - {name: x_flag, type: boolean, null_rate: 1000}
12
+ - {name: pit_rate, type: double, null_rate: 1000}
13
+ - {name: score, type: long, null_rate: 1000}
14
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
15
+ - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
16
+
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
+
21
+ out:
22
+ type: orc
23
+ overwrite: true
24
+ path_prefix: "/tmp/output"
25
+ compression_kind: ZLIB
metadata CHANGED
@@ -1,19 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-28 00:00:00.000000000 Z
11
+ date: 2020-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
15
15
  requirements:
16
- - - ~>
16
+ - - "~>"
17
17
  - !ruby/object:Gem::Version
18
18
  version: '1.0'
19
19
  name: bundler
@@ -21,13 +21,13 @@ dependencies:
21
21
  type: :development
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: '10.0'
33
33
  name: rake
@@ -35,7 +35,7 @@ dependencies:
35
35
  type: :development
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  description: Dumps records to Orc format file.
@@ -45,27 +45,12 @@ executables: []
45
45
  extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
- - .gitignore
49
- - .travis.yml
48
+ - ".github/workflows/gradle.yml"
49
+ - ".gitignore"
50
50
  - LICENSE.txt
51
51
  - README.md
52
52
  - build.gradle
53
- - config/checkstyle/checkstyle.xml
54
- - config/checkstyle/default.xml
55
- - example/example.yml
56
- - gradle/wrapper/gradle-wrapper.jar
57
- - gradle/wrapper/gradle-wrapper.properties
58
- - gradlew
59
- - gradlew.bat
60
- - lib/embulk/output/orc.rb
61
- - src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
62
- - src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
63
- - src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
64
- - src/main/java/org/embulk/output/orc/PluginTask.java
65
- - src/main/java/org/embulk/output/orc/TimestampColumnOption.java
66
- - src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java
67
- - classpath/activation-1.1.jar
68
- - classpath/aircompressor-0.3.jar
53
+ - classpath/aircompressor-0.10.jar
69
54
  - classpath/apacheds-i18n-2.0.0-M15.jar
70
55
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
71
56
  - classpath/api-asn1-api-1.0.0-M20.jar
@@ -84,7 +69,6 @@ files:
84
69
  - classpath/commons-configuration-1.6.jar
85
70
  - classpath/commons-daemon-1.0.13.jar
86
71
  - classpath/commons-digester-1.8.jar
87
- - classpath/commons-el-1.0.jar
88
72
  - classpath/commons-httpclient-3.1.jar
89
73
  - classpath/commons-io-2.4.jar
90
74
  - classpath/commons-lang-2.6.jar
@@ -94,16 +78,15 @@ files:
94
78
  - classpath/curator-client-2.7.1.jar
95
79
  - classpath/curator-framework-2.7.1.jar
96
80
  - classpath/curator-recipes-2.7.1.jar
97
- - classpath/embulk-output-orc-0.3.0.jar
81
+ - classpath/embulk-output-orc-0.3.5.jar
98
82
  - classpath/embulk-util-aws-credentials-0.2.8.jar
99
83
  - classpath/gson-2.2.4.jar
100
- - classpath/hadoop-annotations-2.7.3.jar
101
- - classpath/hadoop-auth-2.7.3.jar
102
- - classpath/hadoop-aws-2.7.3.jar
103
- - classpath/hadoop-common-2.7.3.jar
104
- - classpath/hadoop-hdfs-2.6.4.jar
105
- - classpath/hive-storage-api-2.2.1.jar
106
- - classpath/htrace-core-3.0.4.jar
84
+ - classpath/hadoop-annotations-2.7.5.jar
85
+ - classpath/hadoop-auth-2.7.5.jar
86
+ - classpath/hadoop-aws-2.7.5.jar
87
+ - classpath/hadoop-common-2.7.5.jar
88
+ - classpath/hadoop-hdfs-2.7.5.jar
89
+ - classpath/hive-storage-api-2.6.0.jar
107
90
  - classpath/htrace-core-3.1.0-incubating.jar
108
91
  - classpath/httpclient-4.3.6.jar
109
92
  - classpath/httpcore-4.3.3.jar
@@ -111,9 +94,8 @@ files:
111
94
  - classpath/jackson-jaxrs-1.8.3.jar
112
95
  - classpath/jackson-mapper-asl-1.9.13.jar
113
96
  - classpath/jackson-xc-1.8.3.jar
114
- - classpath/jasper-runtime-5.5.23.jar
115
97
  - classpath/java-xmlbuilder-0.4.jar
116
- - classpath/jaxb-api-2.2.2.jar
98
+ - classpath/jaxb-api-2.2.11.jar
117
99
  - classpath/jaxb-impl-2.2.3-1.jar
118
100
  - classpath/jcl-over-slf4j-1.7.12.jar
119
101
  - classpath/jersey-core-1.9.jar
@@ -122,25 +104,46 @@ files:
122
104
  - classpath/jets3t-0.9.0.jar
123
105
  - classpath/jettison-1.1.jar
124
106
  - classpath/jetty-6.1.26.jar
107
+ - classpath/jetty-sslengine-6.1.26.jar
125
108
  - classpath/jetty-util-6.1.26.jar
126
109
  - classpath/jline-0.9.94.jar
127
- - classpath/joda-time-2.9.9.jar
128
- - classpath/jsch-0.1.42.jar
110
+ - classpath/jsch-0.1.54.jar
129
111
  - classpath/jsp-api-2.1.jar
130
112
  - classpath/jsr305-3.0.0.jar
113
+ - classpath/leveldbjni-all-1.8.jar
131
114
  - classpath/log4j-1.2.17.jar
132
115
  - classpath/netty-3.7.0.Final.jar
133
- - classpath/orc-core-1.4.0.jar
116
+ - classpath/netty-all-4.0.23.Final.jar
117
+ - classpath/orc-core-1.5.4.jar
118
+ - classpath/orc-shims-1.5.4.jar
134
119
  - classpath/paranamer-2.3.jar
135
120
  - classpath/protobuf-java-2.5.0.jar
121
+ - classpath/scala-library-2.12.12.jar
122
+ - classpath/servlet-api-2.5-20081211.jar
136
123
  - classpath/servlet-api-2.5.jar
137
124
  - classpath/snappy-java-1.0.4.1.jar
138
- - classpath/stax-api-1.0-2.jar
139
125
  - classpath/xercesImpl-2.9.1.jar
140
126
  - classpath/xml-apis-1.3.04.jar
141
127
  - classpath/xmlenc-0.52.jar
142
128
  - classpath/xz-1.0.jar
143
129
  - classpath/zookeeper-3.4.6.jar
130
+ - config/checkstyle/checkstyle.xml
131
+ - config/checkstyle/default.xml
132
+ - example/example.yml
133
+ - gradle/wrapper/gradle-wrapper.jar
134
+ - gradle/wrapper/gradle-wrapper.properties
135
+ - gradlew
136
+ - gradlew.bat
137
+ - lib/embulk/output/orc.rb
138
+ - src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
139
+ - src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
140
+ - src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala
141
+ - src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala
142
+ - src/main/scala/org/embulk/output/orc/PluginTask.scala
143
+ - src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala
144
+ - src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java
145
+ - src/test/resources/example-null.yml
146
+ - src/test/resources/example.yml
144
147
  homepage: https://github.com/yuokada/embulk-output-orc
145
148
  licenses:
146
149
  - MIT
@@ -151,17 +154,17 @@ require_paths:
151
154
  - lib
152
155
  required_ruby_version: !ruby/object:Gem::Requirement
153
156
  requirements:
154
- - - '>='
157
+ - - ">="
155
158
  - !ruby/object:Gem::Version
156
159
  version: '0'
157
160
  required_rubygems_version: !ruby/object:Gem::Requirement
158
161
  requirements:
159
- - - '>='
162
+ - - ">="
160
163
  - !ruby/object:Gem::Version
161
164
  version: '0'
162
165
  requirements: []
163
166
  rubyforge_project:
164
- rubygems_version: 2.1.9
167
+ rubygems_version: 2.6.8
165
168
  signing_key:
166
169
  specification_version: 4
167
170
  summary: Orc output plugin for Embulk