embulk-output-orc 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.io.IOException
4
+ import java.nio.file.{Files, Paths}
5
+
6
+ import com.amazonaws.auth.profile.ProfileCredentialsProvider
7
+ import com.amazonaws.services.s3.AmazonS3Client
8
+ import com.amazonaws.services.s3.model.DeleteObjectRequest
9
+
10
+ import scala.beans.BeanProperty
11
+
12
/**
 * Helpers for classifying an output location by its scheme and for deleting an
 * already existing output file.
 */
object OrcOutputPluginHelper {
  /**
   * Deletes the file or object that `fpath` points to, if it is a delete target.
   *
   * Local files and S3 objects (`s3`, `s3n`, `s3a`) are deleted; locations with any
   * other scheme (for example HDFS) are left untouched.
   *
   * @param fpath output location: a plain local path or a `scheme://...` URL
   * @param task  plugin configuration; only `getEndpoint` is read, to override the
   *              S3 endpoint when it is present
   */
  def removeOldFile(fpath: String, task: PluginTask): Unit = getSchema(fpath) match {
    case "file" =>
      // NOTE(review): the raw string is handed to Paths.get, so a "file://" prefix is
      // not stripped here -- confirm callers only pass plain local paths.
      Files.deleteIfExists(Paths.get(fpath))
    case "s3" | "s3n" | "s3a" =>
      val s3Url = parseS3Url(fpath)
      // NOTE(review): credentials are taken from the AWS profile file rather than from
      // the task's own credential settings -- confirm this is intended.
      val s3client = new AmazonS3Client(new ProfileCredentialsProvider)
      try {
        if (task.getEndpoint.isPresent) s3client.setEndpoint(task.getEndpoint.get)
        s3client.deleteObject(new DeleteObjectRequest(s3Url.bucket, s3Url.key))
      } finally {
        // The client is created for this single call; release its resources.
        s3client.shutdown()
      }
    case _ => // Not a delete target (see isDeleteTarget): nothing to do.
  }

  /**
   * Tells whether files of the given scheme are deleted by [[removeOldFile]].
   *
   * @param schema scheme name as returned by [[getSchema]]
   * @return `true` for `file`, `s3`, `s3n` and `s3a`; `false` otherwise
   */
  def isDeleteTarget(schema: String): Boolean = schema match {
    case "file" | "s3" | "s3n" | "s3a" => true
    case _ => false
  }

  /**
   * Returns the scheme of an output location.
   *
   * A `scheme://` prefix is returned as is (lower-cased), so that non-local
   * locations such as `hdfs://...` are no longer reported as `file`. A location
   * without such a prefix is resolved through the default file system provider.
   *
   * @param fpath output location: a plain local path or a `scheme://...` URL
   * @return the scheme, e.g. `file`, `s3`, `s3a`, `s3n` or `hdfs`
   */
  def getSchema(fpath: String): String = {
    val schemeEnd = fpath.indexOf("://")
    if (schemeEnd > 0) fpath.substring(0, schemeEnd).toLowerCase(java.util.Locale.ROOT)
    else Paths.get(fpath).getFileSystem.provider.getScheme
  }

  /**
   * Splits an S3 URL of the form `scheme://bucket/key` into bucket and key.
   *
   * Only the first `://` and the first `/` after the bucket act as separators, so
   * the key is kept verbatim (including a trailing `/` or an embedded `://`).
   *
   * @param s3url URL such as `s3://demo-bucket/test/output.000.orc`
   * @return the bucket and the key; the key is empty when the URL names only a bucket
   * @throws IllegalArgumentException if the URL has no scheme or no bucket
   */
  def parseS3Url(s3url: String): AmazonS3URILikeObject = {
    val schemeEnd = s3url.indexOf("://")
    require(schemeEnd > 0, s"Not an S3 URL (missing scheme): $s3url")
    val location = s3url.substring(schemeEnd + "://".length)
    val (bucket, key) = location.indexOf('/') match {
      case -1 => (location, "")
      case slash => (location.substring(0, slash), location.substring(slash + 1))
    }
    require(bucket.nonEmpty, s"Not an S3 URL (missing bucket): $s3url")
    OrcOutputPluginHelper.AmazonS3URILikeObject(bucket, key)
  }

  /**
   * Bucket/key pair of an S3 location. `@BeanProperty` generates the
   * `getBucket`/`getKey` accessors used from Java code.
   */
  case class AmazonS3URILikeObject(@BeanProperty bucket: String, @BeanProperty key: String)

}
@@ -0,0 +1,52 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.io.IOException
4
+
5
+ import org.apache.orc.Writer
6
+ import org.embulk.config.TaskReport
7
+ import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}
8
+
9
/**
 * Page output that copies the records of each incoming page into ORC row batches
 * and hands them to the ORC writer.
 *
 * @param reader page reader used to iterate over the records of a page
 * @param writer ORC writer receiving the row batches; closed by [[finish]]
 * @param task   plugin configuration
 */
class OrcTransactionalPageOutput(val reader: PageReader, val writer: Writer, val task: PluginTask) extends TransactionalPageOutput {
  /**
   * Writes all records of `page` to the ORC writer.
   *
   * Records are collected in a row batch that is flushed whenever it is full and
   * once more after the last record of the page.
   *
   * @param page page whose records are written
   * @throws java.io.UncheckedIOException if the writer fails; the failure is
   *                                      propagated instead of being printed and
   *                                      ignored, which would silently drop rows
   */
  override def add(page: Page): Unit = synchronized {
    try {
      val schema = OrcOutputPlugin.getSchema(reader.getSchema)
      val batch = schema.createRowBatch
      reader.setPage(page)
      while (reader.nextRecord) {
        // The next free slot of the batch is the index of the row being filled.
        val row = batch.size
        batch.size += 1
        reader.getSchema.visitColumns(new OrcColumnVisitor(reader, batch, row))
        if (batch.size >= batch.getMaxSize) {
          writer.addRowBatch(batch)
          batch.reset()
        }
      }
      // Flush the rows left over from a partially filled batch.
      if (batch.size != 0) {
        writer.addRowBatch(batch)
        batch.reset()
      }
    } catch {
      case e: IOException =>
        throw new java.io.UncheckedIOException("Failed to write a row batch to the ORC writer", e)
    }
  }

  /** Closes the ORC writer; an `IOException` raised while closing is propagated. */
  override def finish(): Unit = writer.close()

  /** No-op. */
  override def close(): Unit = {}

  /** No-op. NOTE(review): the writer is not closed here -- confirm whether an aborted task should release it. */
  override def abort(): Unit = {}

  /** Returns a new, empty task report. */
  override def commit: TaskReport = Exec.newTaskReport
}
@@ -0,0 +1,56 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.util
4
+
5
+ import com.google.common.base.Optional
6
+ import org.embulk.config.{Config, ConfigDefault, Task}
7
+ import org.embulk.spi.time.TimestampFormatter
8
+ import org.embulk.util.aws.credentials.AwsCredentialsTask
9
+ import org.joda.time.DateTimeZone
10
+
11
/**
 * Configuration of the ORC output plugin. Each accessor is bound to the
 * configuration key named in its `@Config` annotation; `@ConfigDefault` holds the
 * default value written as JSON.
 */
trait PluginTask extends Task with TimestampFormatter.Task with AwsCredentialsTask {
  /** `path_prefix` (required): prefix of the output file path. */
  @Config("path_prefix")
  def getPathPrefix: String

  /** `file_ext`: extension of the output file name; default `.orc`. */
  @Config("file_ext")
  @ConfigDefault("\".orc\"")
  def getFileNameExtension: String

  /** `column_options`: per-column timestamp options keyed by column name; default empty. */
  @Config("column_options")
  @ConfigDefault("{}")
  def getColumnOptions: util.Map[String, TimestampColumnOption]

  /** `sequence_format`: format string for the sequence part of the file name; default `.%03d`. */
  @Config("sequence_format")
  @ConfigDefault("\".%03d\"")
  def getSequenceFormat: String

  // see: https://orc.apache.org/docs/hive-config.html
  // ORC File options
  // NOTE(review): the key is spelled "strip_size"; it is kept as is because renaming
  // it would break existing configurations.
  @Config("strip_size")
  @ConfigDefault("67108864") // 64MB
  def getStripSize: Integer

  @Config("buffer_size")
  @ConfigDefault("262144") // 256KB
  def getBufferSize: Integer

  @Config("block_size")
  @ConfigDefault("268435456") // 256MB
  def getBlockSize: Integer

  /** `compression_kind`: name of the ORC compression; default `ZLIB`. */
  @Config("compression_kind")
  // The default is a JSON string literal, like the other string defaults of this trait.
  @ConfigDefault("\"ZLIB\"")
  def getCompressionKind: String

  /** `overwrite`: default `false`. */
  @Config("overwrite")
  @ConfigDefault("false")
  def getOverwrite: Boolean

  /** `default_from_timezone`: default `UTC`. */
  @Config("default_from_timezone")
  @ConfigDefault("\"UTC\"")
  def getDefaultFromTimeZone: DateTimeZone

  /** `endpoint`: optional endpoint override; default JSON `null`. */
  @Config("endpoint")
  @ConfigDefault("null")
  def getEndpoint: Optional[String]
}
@@ -0,0 +1,32 @@
1
+ package org.embulk.output.orc
2
+
3
+ import java.util
4
+
5
+ import com.google.common.base.Optional
6
+ import org.embulk.config.{Config, ConfigDefault, Task}
7
+ import org.embulk.spi.time.TimestampFormatter
8
+ import org.joda.time.DateTimeZone
9
+
10
+ /*
11
+ public interface TimestampColumnOption
12
+ extends Task, TimestampFormatter.TimestampColumnOption
13
+ {
14
+ @Config("from_timezone")
15
+ @ConfigDefault("null")
16
+ Optional<DateTimeZone> getFromTimeZone();
17
+
18
+ @Config("from_format")
19
+ @ConfigDefault("null")
20
+ Optional<List<String>> getFromFormat();
21
+ }
22
+ */
23
+
24
/**
 * Per-column timestamp options, layered on top of
 * `TimestampFormatter.TimestampColumnOption`. Both settings are optional and
 * default to JSON `null`.
 */
trait TimestampColumnOption extends Task with TimestampFormatter.TimestampColumnOption {
  /** `from_timezone`: optional time zone for this column. */
  @Config("from_timezone")
  @ConfigDefault("null")
  def getFromTimeZone: Optional[DateTimeZone]

  /** `from_format`: optional list of format strings for this column. */
  @Config("from_format")
  @ConfigDefault("null")
  def getFromFormat: Optional[util.List[String]]
}
@@ -0,0 +1,71 @@
1
+ package org.embulk.output.orc;
2
+
3
+ import org.testng.annotations.DataProvider;
4
+ import org.testng.annotations.Test;
5
+
6
+ import static org.hamcrest.MatcherAssert.assertThat;
7
+ import static org.hamcrest.core.Is.is;
8
+
9
+ public class OrcOutputPluginHelperTest
10
+ {
11
+ @DataProvider(name = "url-provider")
12
+ public Object[][] dataProvider()
13
+ {
14
+ return new Object[][] {
15
+ {"file://tmp/output.orc", "file"},
16
+ {"/tmp/output.000.orc", "file"},
17
+ {"s3n://embulk-test/output.0001.orc", "s3n"},
18
+ {"s3a://embulk-test/output.0001.orc", "s3a"},
19
+ {"s3://embulk-test/output.0001.orc", "s3"},
20
+ };
21
+ }
22
+
23
+ @Test(dataProvider = "url-provider")
24
+ public void getFPathTest(String file, String expect)
25
+ {
26
+ String schema = OrcOutputPluginHelper.getSchema(file);
27
+ assertThat(schema, is(expect));
28
+ }
29
+
30
+ @DataProvider(name = "schema-provider")
31
+ public Object[][] schemaProvider()
32
+ {
33
+ return new Object[][] {
34
+ {"file", true},
35
+ {"s3", true},
36
+ {"s3n", true},
37
+ {"s3a", true},
38
+ {"hdfs", false},
39
+ };
40
+ }
41
+
42
+ @Test(dataProvider = "schema-provider")
43
+ public void isDeleteTargetTest(String schema, boolean expect)
44
+ {
45
+ boolean result = OrcOutputPluginHelper.isDeleteTarget(schema);
46
+ assertThat(result, is(expect));
47
+ }
48
+
49
+ @DataProvider(name = "parserTest-provider")
50
+ public Object[][] parserTestProvider()
51
+ {
52
+ String baseurl = "demo-bucket/test/output.000.orc";
53
+ String bucket = "demo-bucket";
54
+ String keyname = "test/output.000.orc";
55
+
56
+ return new Object[][] {
57
+ {"s3://" + baseurl, bucket, keyname},
58
+ {"s3a://" + baseurl, bucket, keyname},
59
+ {"s3n://" + baseurl, bucket, keyname},
60
+ };
61
+ }
62
+
63
+ @Test(dataProvider = "parserTest-provider")
64
+ public void parseS3UrlTest(String url, String bucket, String key)
65
+ {
66
+ OrcOutputPluginHelper.AmazonS3URILikeObject parts =
67
+ OrcOutputPluginHelper.parseS3Url(url);
68
+ assertThat(parts.getBucket(), is(bucket));
69
+ assertThat(parts.getKey(), is(key));
70
+ }
71
+ }
@@ -0,0 +1,25 @@
1
+ ---
2
+ in:
3
+ type: randomj
4
+ rows: 8
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
9
+ - {name: myid, type: long}
10
+ - {name: named, type: string, null_rate: 10000}
11
+ - {name: x_flag, type: boolean, null_rate: 10000}
12
+ - {name: pit_rate, type: double, null_rate: 10000}
13
+ - {name: score, type: long, null_rate: 10000}
14
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S', null_rate: 10000}
15
+ - {name: purchase, type: timestamp, format: '%Y/%m/%d', null_rate: 10000}
16
+
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
+
21
+ out:
22
+ type: orc
23
+ overwrite: true
24
+ path_prefix: "/tmp/output"
25
+ compression_kind: ZLIB
@@ -0,0 +1,25 @@
1
+ ---
2
+ in:
3
+ type: randomj
4
+ rows: 1024
5
+ threads: 1
6
+ # default_timezone: Asia/Tokyo
7
+ primary_key: myid
8
+ schema:
9
+ - {name: myid, type: long}
10
+ - {name: named, type: string, null_rate: 1000}
11
+ - {name: x_flag, type: boolean, null_rate: 1000}
12
+ - {name: pit_rate, type: double, null_rate: 1000}
13
+ - {name: score, type: long, null_rate: 1000}
14
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
15
+ - {name: purchase, type: timestamp, format: '%Y/%m/%d'}
16
+
17
+ exec:
18
+ max_threads: 2 # run at most 2 tasks concurrently
19
+ min_output_tasks: 1 # disable page scattering
20
+
21
+ out:
22
+ type: orc
23
+ overwrite: true
24
+ path_prefix: "/tmp/output"
25
+ compression_kind: ZLIB
metadata CHANGED
@@ -1,19 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-orc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - yuokada
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-28 00:00:00.000000000 Z
11
+ date: 2020-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
15
15
  requirements:
16
- - - ~>
16
+ - - "~>"
17
17
  - !ruby/object:Gem::Version
18
18
  version: '1.0'
19
19
  name: bundler
@@ -21,13 +21,13 @@ dependencies:
21
21
  type: :development
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: '10.0'
33
33
  name: rake
@@ -35,7 +35,7 @@ dependencies:
35
35
  type: :development
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  description: Dumps records to Orc format file.
@@ -45,27 +45,12 @@ executables: []
45
45
  extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
- - .gitignore
49
- - .travis.yml
48
+ - ".github/workflows/gradle.yml"
49
+ - ".gitignore"
50
50
  - LICENSE.txt
51
51
  - README.md
52
52
  - build.gradle
53
- - config/checkstyle/checkstyle.xml
54
- - config/checkstyle/default.xml
55
- - example/example.yml
56
- - gradle/wrapper/gradle-wrapper.jar
57
- - gradle/wrapper/gradle-wrapper.properties
58
- - gradlew
59
- - gradlew.bat
60
- - lib/embulk/output/orc.rb
61
- - src/main/java/org/embulk/output/orc/OrcColumnVisitor.java
62
- - src/main/java/org/embulk/output/orc/OrcOutputPlugin.java
63
- - src/main/java/org/embulk/output/orc/OrcOutputPluginHelper.java
64
- - src/main/java/org/embulk/output/orc/PluginTask.java
65
- - src/main/java/org/embulk/output/orc/TimestampColumnOption.java
66
- - src/test/java/org/embulk/output/orc/TestOrcOutputPlugin.java
67
- - classpath/activation-1.1.jar
68
- - classpath/aircompressor-0.3.jar
53
+ - classpath/aircompressor-0.10.jar
69
54
  - classpath/apacheds-i18n-2.0.0-M15.jar
70
55
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
71
56
  - classpath/api-asn1-api-1.0.0-M20.jar
@@ -84,7 +69,6 @@ files:
84
69
  - classpath/commons-configuration-1.6.jar
85
70
  - classpath/commons-daemon-1.0.13.jar
86
71
  - classpath/commons-digester-1.8.jar
87
- - classpath/commons-el-1.0.jar
88
72
  - classpath/commons-httpclient-3.1.jar
89
73
  - classpath/commons-io-2.4.jar
90
74
  - classpath/commons-lang-2.6.jar
@@ -94,16 +78,15 @@ files:
94
78
  - classpath/curator-client-2.7.1.jar
95
79
  - classpath/curator-framework-2.7.1.jar
96
80
  - classpath/curator-recipes-2.7.1.jar
97
- - classpath/embulk-output-orc-0.3.0.jar
81
+ - classpath/embulk-output-orc-0.3.5.jar
98
82
  - classpath/embulk-util-aws-credentials-0.2.8.jar
99
83
  - classpath/gson-2.2.4.jar
100
- - classpath/hadoop-annotations-2.7.3.jar
101
- - classpath/hadoop-auth-2.7.3.jar
102
- - classpath/hadoop-aws-2.7.3.jar
103
- - classpath/hadoop-common-2.7.3.jar
104
- - classpath/hadoop-hdfs-2.6.4.jar
105
- - classpath/hive-storage-api-2.2.1.jar
106
- - classpath/htrace-core-3.0.4.jar
84
+ - classpath/hadoop-annotations-2.7.5.jar
85
+ - classpath/hadoop-auth-2.7.5.jar
86
+ - classpath/hadoop-aws-2.7.5.jar
87
+ - classpath/hadoop-common-2.7.5.jar
88
+ - classpath/hadoop-hdfs-2.7.5.jar
89
+ - classpath/hive-storage-api-2.6.0.jar
107
90
  - classpath/htrace-core-3.1.0-incubating.jar
108
91
  - classpath/httpclient-4.3.6.jar
109
92
  - classpath/httpcore-4.3.3.jar
@@ -111,9 +94,8 @@ files:
111
94
  - classpath/jackson-jaxrs-1.8.3.jar
112
95
  - classpath/jackson-mapper-asl-1.9.13.jar
113
96
  - classpath/jackson-xc-1.8.3.jar
114
- - classpath/jasper-runtime-5.5.23.jar
115
97
  - classpath/java-xmlbuilder-0.4.jar
116
- - classpath/jaxb-api-2.2.2.jar
98
+ - classpath/jaxb-api-2.2.11.jar
117
99
  - classpath/jaxb-impl-2.2.3-1.jar
118
100
  - classpath/jcl-over-slf4j-1.7.12.jar
119
101
  - classpath/jersey-core-1.9.jar
@@ -122,25 +104,46 @@ files:
122
104
  - classpath/jets3t-0.9.0.jar
123
105
  - classpath/jettison-1.1.jar
124
106
  - classpath/jetty-6.1.26.jar
107
+ - classpath/jetty-sslengine-6.1.26.jar
125
108
  - classpath/jetty-util-6.1.26.jar
126
109
  - classpath/jline-0.9.94.jar
127
- - classpath/joda-time-2.9.9.jar
128
- - classpath/jsch-0.1.42.jar
110
+ - classpath/jsch-0.1.54.jar
129
111
  - classpath/jsp-api-2.1.jar
130
112
  - classpath/jsr305-3.0.0.jar
113
+ - classpath/leveldbjni-all-1.8.jar
131
114
  - classpath/log4j-1.2.17.jar
132
115
  - classpath/netty-3.7.0.Final.jar
133
- - classpath/orc-core-1.4.0.jar
116
+ - classpath/netty-all-4.0.23.Final.jar
117
+ - classpath/orc-core-1.5.4.jar
118
+ - classpath/orc-shims-1.5.4.jar
134
119
  - classpath/paranamer-2.3.jar
135
120
  - classpath/protobuf-java-2.5.0.jar
121
+ - classpath/scala-library-2.12.12.jar
122
+ - classpath/servlet-api-2.5-20081211.jar
136
123
  - classpath/servlet-api-2.5.jar
137
124
  - classpath/snappy-java-1.0.4.1.jar
138
- - classpath/stax-api-1.0-2.jar
139
125
  - classpath/xercesImpl-2.9.1.jar
140
126
  - classpath/xml-apis-1.3.04.jar
141
127
  - classpath/xmlenc-0.52.jar
142
128
  - classpath/xz-1.0.jar
143
129
  - classpath/zookeeper-3.4.6.jar
130
+ - config/checkstyle/checkstyle.xml
131
+ - config/checkstyle/default.xml
132
+ - example/example.yml
133
+ - gradle/wrapper/gradle-wrapper.jar
134
+ - gradle/wrapper/gradle-wrapper.properties
135
+ - gradlew
136
+ - gradlew.bat
137
+ - lib/embulk/output/orc.rb
138
+ - src/main/scala/org/embulk/output/orc/OrcColumnVisitor.scala
139
+ - src/main/scala/org/embulk/output/orc/OrcOutputPlugin.scala
140
+ - src/main/scala/org/embulk/output/orc/OrcOutputPluginHelper.scala
141
+ - src/main/scala/org/embulk/output/orc/OrcTransactionalPageOutput.scala
142
+ - src/main/scala/org/embulk/output/orc/PluginTask.scala
143
+ - src/main/scala/org/embulk/output/orc/TimestampColumnOption.scala
144
+ - src/test/java/org/embulk/output/orc/OrcOutputPluginHelperTest.java
145
+ - src/test/resources/example-null.yml
146
+ - src/test/resources/example.yml
144
147
  homepage: https://github.com/yuokada/embulk-output-orc
145
148
  licenses:
146
149
  - MIT
@@ -151,17 +154,17 @@ require_paths:
151
154
  - lib
152
155
  required_ruby_version: !ruby/object:Gem::Requirement
153
156
  requirements:
154
- - - '>='
157
+ - - ">="
155
158
  - !ruby/object:Gem::Version
156
159
  version: '0'
157
160
  required_rubygems_version: !ruby/object:Gem::Requirement
158
161
  requirements:
159
- - - '>='
162
+ - - ">="
160
163
  - !ruby/object:Gem::Version
161
164
  version: '0'
162
165
  requirements: []
163
166
  rubyforge_project:
164
- rubygems_version: 2.1.9
167
+ rubygems_version: 2.6.8
165
168
  signing_key:
166
169
  specification_version: 4
167
170
  summary: Orc output plugin for Embulk