embulk-input-parquet_hadoop 0.1.0
- checksums.yaml +7 -0
- data/build.gradle +53 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/asm-3.1.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.2.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.7.1.jar +0 -0
- data/classpath/curator-framework-2.7.1.jar +0 -0
- data/classpath/curator-recipes-2.7.1.jar +0 -0
- data/classpath/embulk-input-parquet_hadoop-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.7.3.jar +0 -0
- data/classpath/hadoop-auth-2.7.3.jar +0 -0
- data/classpath/hadoop-client-2.7.3.jar +0 -0
- data/classpath/hadoop-common-2.7.3.jar +0 -0
- data/classpath/hadoop-hdfs-2.7.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.7.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.7.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.7.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.7.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.7.3.jar +0 -0
- data/classpath/hadoop-yarn-api-2.7.3.jar +0 -0
- data/classpath/hadoop-yarn-client-2.7.3.jar +0 -0
- data/classpath/hadoop-yarn-common-2.7.3.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.7.3.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.7.3.jar +0 -0
- data/classpath/htrace-core-3.1.0-incubating.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsp-api-2.1.jar +0 -0
- data/classpath/jsr305-3.0.0.jar +0 -0
- data/classpath/jul-to-slf4j-1.7.24.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/log4j-over-slf4j-1.7.24.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/netty-all-4.0.23.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/parquet-column-1.8.1.jar +0 -0
- data/classpath/parquet-common-1.8.1.jar +0 -0
- data/classpath/parquet-encoding-1.8.1.jar +0 -0
- data/classpath/parquet-format-2.3.0-incubating.jar +0 -0
- data/classpath/parquet-hadoop-1.8.1.jar +0 -0
- data/classpath/parquet-jackson-1.8.1.jar +0 -0
- data/classpath/parquet-msgpack-0.1.0.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/slf4j-api-1.7.24.jar +0 -0
- data/classpath/snappy-java-1.1.1.6.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xercesImpl-2.9.1.jar +0 -0
- data/classpath/xml-apis-1.3.04.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/input/parquet_hadoop.rb +18 -0
- data/src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java +84 -0
- data/src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java +257 -0
- data/src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java +182 -0
- data/src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java +44 -0
- data/src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java +74 -0
- data/src/test/resources/test-data/incompatible-schema/data/1.parquet +0 -0
- data/src/test/resources/test-data/incompatible-schema/data/2.parquet +0 -0
- data/src/test/resources/test-data/incompatible-schema/expected.csv +4 -0
- data/src/test/resources/test-data/incompatible-schema/in.yml +2 -0
- data/src/test/resources/test-data/simple/data.parquet +0 -0
- data/src/test/resources/test-data/simple/expected.csv +3 -0
- data/src/test/resources/test-data/simple/in.yml +2 -0
- metadata +168 -0
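
For orientation before the file-by-file diff: a minimal Embulk config that exercises this plugin might look like the sketch below. The option names (path, parquet_log_level, config_files, config) come from PluginTask and ConfigurationFactory.Task in the sources that follow; the HDFS URL and the stdout output are illustrative only. Each Parquet record is emitted into a single JSON column named "record".

in:
  type: parquet_hadoop
  path: hdfs://namenode:8020/events/*.parquet   # hypothetical location; the glob is expanded in transaction()
  parquet_log_level: INFO                       # default; parsed with java.util.logging.Level.parse
  config_files: []                              # Hadoop XML files added as Configuration resources
  config: {}                                    # inline Hadoop key/value overrides
out:
  type: stdout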
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 11d3bfc5cf66805e9ce41966e90759d5acfbed8f
+  data.tar.gz: 234ecd00864d9c122f01a95ab224c18bdff3ccea
+SHA512:
+  metadata.gz: 05e661e93e1e5c99edec29e2c83cd68d79f45e8c828afb0aeba822e44003057cf5deb1c69e14cf8eebd32755c19a06766c095e9dd0812bc3feee3f3ae4574c0a
+  data.tar.gz: 9b1119067ba7eaeb18ee4ddaac2322881b6177fb3eab92995784745b72d90e6e0c9e60d0bc552afd652f6556392b008628e9065cbd8762ab48a2275cb2a62944
data/build.gradle
ADDED
@@ -0,0 +1,53 @@
+import com.github.jrubygradle.JRubyExec
+
+dependencies {
+    compile "org.embulk:embulk-core:0.8.16"
+    provided "org.embulk:embulk-core:0.8.16"
+
+    compile project(':parquet-msgpack')
+    // for hadoop
+    compile 'org.slf4j:log4j-over-slf4j:1.7.24'
+    // for parquet
+    compile 'org.slf4j:jul-to-slf4j:1.7.24'
+
+    testCompile "junit:junit:4.+"
+    testCompile 'org.embulk:embulk-standards:0.8.16'
+    testCompile "org.embulk:embulk-test:0.8.16"
+    testCompile 'org.assertj:assertj-core:2.6.+'
+}
+
+task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
+    jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
+    scriptArgs "${project.projectDir.absolutePath}/build/gemspec"
+    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "${project.projectDir}/pkg") }
+}
+
+task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
+    jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
+    scriptArgs "pkg/${project.name}-${project.version}.gem"
+}
+
+task gemspec {
+    doLast {
+        file('build').mkdirs()
+        file('build/gemspec').write($/
+Gem::Specification.new do |spec|
+  spec.name          = "${project.name}"
+  spec.version       = "${project.version}"
+  spec.authors       = ["Koji AGAWA"]
+  spec.summary       = %[Parquet input plugin for Embulk]
+  spec.description   = %[Loads records from Parquet files via Hadoop FileSystem.]
+  spec.email         = ["agawa_koji@cyberagent.co.jp"]
+  spec.licenses      = ["Apache 2.0"]
+  spec.homepage      = "https://github.com/CyberAgent/embulk-input-parquet_hadoop"
+
+  spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
+  spec.test_files    = spec.files.grep(%r"^(test|spec)/")
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency 'bundler', ['~> 1.0']
+  spec.add_development_dependency 'rake', ['>= 10.0']
+end
+/$)
+    }
+}
data/classpath/*.jar
82 binary dependency jars (enumerated in the file list above); binary files not shown.
data/lib/embulk/input/parquet_hadoop.rb
ADDED
@@ -0,0 +1,18 @@
+#
+# Copyright 2017 CyberAgent, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Embulk::JavaPlugin.register_input(
+  "parquet_hadoop", "org.embulk.input.parquet_hadoop.ParquetHadoopInputPlugin",
+  File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java
ADDED
@@ -0,0 +1,84 @@
+/*
+ * This class includes code from embulk-input-hdfs.
+ * (https://github.com/civitaspo/embulk-input-hdfs)
+ *
+ * The MIT License
+ * Copyright (c) 2015 Civitaspo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package org.embulk.input.parquet_hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigException;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.File;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Map;
+
+public class ConfigurationFactory
+{
+    private static final Logger logger = Exec.getLogger(ConfigurationFactory.class);
+
+    interface Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        Map<String, String> getConfig();
+    }
+
+    private ConfigurationFactory()
+    {
+    }
+
+    public static Configuration create(Task task)
+    {
+        Configuration c = new Configuration();
+        for (String f : task.getConfigFiles()) {
+            try {
+                logger.trace("embulk-input-parquet_hadoop: load a config file: {}", f);
+                c.addResource(new File(f).toURI().toURL());
+            } catch (MalformedURLException e) {
+                throw new ConfigException(e);
+            }
+        }
+
+        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
+            logger.trace("embulk-input-parquet_hadoop: load a config: {}:{}", entry.getKey(), entry.getValue());
+            c.set(entry.getKey(), entry.getValue());
+        }
+
+        // For logging
+        for (Map.Entry<String, String> entry : c) {
+            logger.trace("embulk-input-parquet_hadoop: loaded: {}: {}", entry.getKey(), entry.getValue());
+        }
+        logger.trace("embulk-input-parquet_hadoop: loaded files: {}", c);
+
+        return c;
+    }
+}
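
ConfigurationFactory first adds each entry of config_files as a Hadoop Configuration resource, then applies the config map via Configuration.set(), so inline keys win on conflict. An illustrative sketch of the corresponding plugin options; the file paths and the fs.defaultFS value are hypothetical:

in:
  type: parquet_hadoop
  path: /data/events/*.parquet
  config_files:
    - /etc/hadoop/conf/core-site.xml
    - /etc/hadoop/conf/hdfs-site.xml
  config:
    fs.defaultFS: hdfs://namenode:8020   # set() runs after addResource(), so this overrides the XML files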
data/src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java
ADDED
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.embulk.input.parquet_hadoop;
+
+import com.google.common.base.Function;
+import com.google.common.base.Throwables;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathNotFoundException;
+import org.apache.parquet.ParquetRuntimeException;
+import org.apache.parquet.hadoop.util.HiddenFileFilter;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskReport;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.Column;
+import org.embulk.spi.DataException;
+import org.embulk.spi.Exec;
+import org.embulk.spi.InputPlugin;
+import org.embulk.spi.PageBuilder;
+import org.embulk.spi.PageOutput;
+import org.embulk.spi.Schema;
+import org.embulk.spi.type.Types;
+import org.msgpack.value.Value;
+import org.slf4j.Logger;
+import org.slf4j.bridge.SLF4JBridgeHandler;
+import studio.adtech.parquet.msgpack.read.MessagePackReadSupport;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.util.List;
+import java.util.logging.Level;
+
+public class ParquetHadoopInputPlugin
+        implements InputPlugin
+{
+    private static final Logger logger = Exec.getLogger(ParquetHadoopInputPlugin.class);
+
+    public interface PluginTask
+            extends Task, ConfigurationFactory.Task
+    {
+        @Config("path")
+        String getPath();
+
+        @Config("parquet_log_level")
+        @ConfigDefault("\"INFO\"")
+        String getParquetLogLevel();
+
+        List<String> getFiles();
+        void setFiles(List<String> files);
+    }
+
+    Schema newSchema()
+    {
+        return Schema.builder().add("record", Types.JSON).build();
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config,
+            InputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+        configureParquetLogger(task);
+
+        Path rootPath = new Path(task.getPath());
+
+        try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+            Configuration conf = ConfigurationFactory.create(task);
+
+            FileSystem fs = FileSystem.get(rootPath.toUri(), conf);
+            List<FileStatus> statusList = listFileStatuses(fs, rootPath);
+            if (statusList.isEmpty()) {
+                throw new PathNotFoundException(rootPath.toString());
+            }
+
+            for (FileStatus status : statusList) {
+                logger.debug("embulk-input-parquet_hadoop: Loading paths: {}, length: {}",
+                        status.getPath(), status.getLen());
+            }
+
+            List<String> files = Lists.transform(statusList, new Function<FileStatus, String>() {
+                @Nullable
+                @Override
+                public String apply(@Nullable FileStatus input) {
+                    return input.getPath().toString();
+                }
+            });
+            task.setFiles(files);
+        } catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+
+        Schema schema = newSchema();
+        int taskCount = task.getFiles().size();
+
+        return resume(task.dump(), schema, taskCount, control);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+            Schema schema, int taskCount,
+            InputPlugin.Control control)
+    {
+        control.run(taskSource, schema, taskCount);
+        return Exec.newConfigDiff();
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+            Schema schema, int taskCount,
+            List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TaskReport run(TaskSource taskSource,
+            Schema schema, int taskIndex,
+            PageOutput output)
+    {
+        PluginTask task = taskSource.loadTask(PluginTask.class);
+        configureParquetLogger(task);
+
+        final Column jsonColumn = schema.getColumn(0);
+
+        Configuration conf;
+        Path filePath;
+        try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+            conf = ConfigurationFactory.create(task);
+            filePath = new Path(task.getFiles().get(taskIndex));
+        }
+
+        try (PageBuilder pageBuilder = newPageBuilder(schema, output)) {
+            ParquetRowReader<Value> reader;
+            try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                reader = new ParquetRowReader<>(conf, filePath, new MessagePackReadSupport());
+            } catch (ParquetRuntimeException | IOException e) {
+                throw new DataException(e);
+            }
+
+            Value value;
+            while (true) {
+                try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                    value = reader.read();
+                } catch (ParquetRuntimeException | IOException e) {
+                    throw new DataException(e);
+                }
+                if (value == null) {
+                    break;
+                }
+
+                pageBuilder.setJson(jsonColumn, value);
+                pageBuilder.addRecord();
+            }
+
+            pageBuilder.finish();
+
+            try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                reader.close();
+            } catch (ParquetRuntimeException | IOException e) {
+                throw new DataException(e);
+            }
+        }
+
+        TaskReport report = Exec.newTaskReport();
+        return report;
+    }
+
+    @Override
+    public ConfigDiff guess(ConfigSource config)
+    {
+        return Exec.newConfigDiff();
+    }
+
+    private PageBuilder newPageBuilder(Schema schema, PageOutput output)
+    {
+        return new PageBuilder(Exec.getBufferAllocator(), schema, output);
+    }
+
+    private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException
+    {
+        List<FileStatus> fileStatuses = Lists.newArrayList();
+
+        FileStatus[] entries = fs.globStatus(rootPath, HiddenFileFilter.INSTANCE);
+        if (entries == null) {
+            return fileStatuses;
+        }
+
+        for (FileStatus entry : entries) {
+            if (entry.isDirectory()) {
+                List<FileStatus> subEntries = listRecursive(fs, entry);
+                fileStatuses.addAll(subEntries);
+            } else {
+                fileStatuses.add(entry);
+            }
+        }
+
+        return fileStatuses;
+    }
+
+    private List<FileStatus> listRecursive(FileSystem fs, FileStatus status) throws IOException
+    {
+        List<FileStatus> statusList = Lists.newArrayList();
+        if (status.isDirectory()) {
+            FileStatus[] entries = fs.listStatus(status.getPath(), HiddenFileFilter.INSTANCE);
+            for (FileStatus entry : entries) {
+                statusList.addAll(listRecursive(fs, entry));
+            }
+        } else {
+            statusList.add(status);
+        }
+        return statusList;
+    }
+
+    private static void configureParquetLogger(PluginTask task)
+    {
+        // delegate java.util.logging to slf4j.
+        java.util.logging.Logger parquetLogger = java.util.logging.Logger.getLogger("org.apache.parquet");
+        if (parquetLogger.getHandlers().length == 0) {
+            parquetLogger.addHandler(new SLF4JBridgeHandler());
+            parquetLogger.setUseParentHandlers(false);
+        }
+
+        Level level;
+        try {
+            level = Level.parse(task.getParquetLogLevel());
+        } catch (IllegalArgumentException e) {
+            logger.warn("embulk-input-parquet_hadoop: Invalid parquet_log_level", e);
+            level = Level.WARNING;
+        }
+        // invoke static initializer that overrides log level.
+        try {
+            Class.forName("org.apache.parquet.Log");
+        } catch (ClassNotFoundException e) {
+            logger.warn("", e);
+        }
+
+        parquetLogger.setLevel(level);
+    }
+}
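
run() above materializes each Parquet record as one msgpack Value and writes it to the single JSON column, so every output row carries the whole record as a JSON object. For the bundled simple test data, one row looks like this (cf. expected.csv near the end of this diff):

{"c_str":"foo","c_int":1,"c_double":1.5,"c_bool":true,"c_json":"{\"foo\":1}"}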
data/src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java
ADDED
@@ -0,0 +1,182 @@
+/*
+ * This class includes code from Apache Parquet MR.
+ * (org.apache.parquet.hadoop.InternalParquetRecordReader)
+ *
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.embulk.input.parquet_hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.filter2.compat.FilterCompat;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.UnmaterializableRecordCounter;
+import org.apache.parquet.hadoop.api.InitContext;
+import org.apache.parquet.hadoop.api.ReadSupport;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.io.ColumnIOFactory;
+import org.apache.parquet.io.MessageColumnIO;
+import org.apache.parquet.io.ParquetDecodingException;
+import org.apache.parquet.io.RecordReader;
+import org.apache.parquet.io.api.RecordMaterializer;
+import org.apache.parquet.schema.MessageType;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class ParquetRowReader<T>
+{
+    private static final Logger logger = Exec.getLogger(ParquetRowReader.class);
+
+    private final Path filePath;
+    private final ParquetFileReader reader;
+    private final long total;
+    private final ColumnIOFactory columnIOFactory;
+    private final RecordMaterializer<T> recordConverter;
+    private final MessageType requestedSchema;
+    private final MessageType fileSchema;
+    private final UnmaterializableRecordCounter unmaterializableRecordCounter;
+
+    private long current = 0;
+    private long totalCountLoadedSoFar = 0;
+    private int currentBlock = -1;
+    private RecordReader<T> recordReader;
+
+    // TODO: make configurable ?
+    private static final boolean strictTypeChecking = true;
+    private static final FilterCompat.Filter filter = FilterCompat.NOOP;
+
+    public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
+    {
+        this.filePath = filePath;
+
+        ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
+        List<BlockMetaData> blocks = parquetMetadata.getBlocks();
+
+        FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
+        this.fileSchema = fileMetadata.getSchema();
+        Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
+        ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
+                configuration, toSetMultiMap(keyValueMetadata), fileSchema));
+        this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());
+
+        this.requestedSchema = readContext.getRequestedSchema();
+        this.recordConverter = readSupport.prepareForRead(
+                configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);
+
+        List<ColumnDescriptor> columns = requestedSchema.getColumns();
+
+        reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);
+
+        long total = 0;
+        for (BlockMetaData block : blocks) {
+            total += block.getRowCount();
+        }
+        this.total = total;
+
+        this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
+        logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
+    }
+
+    private void checkRead() throws IOException
+    {
+        if (current == totalCountLoadedSoFar) {
+            PageReadStore pages = reader.readNextRowGroup();
+            if (pages == null) {
+                throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
+            }
+
+            MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
+            recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
+            totalCountLoadedSoFar += pages.getRowCount();
+            ++currentBlock;
+        }
+    }
+
+    /**
+     * @return the next record or null if finished
+     * @throws IOException
+     * @throws ParquetDecodingException
+     */
+    public T read() throws IOException
+    {
+        T currentValue = null;
+        boolean recordFound = false;
+        while (!recordFound) {
+            // no more records left
+            if (current >= total) {
+                return null;
+            }
+
+            try {
+                checkRead();
+                current++;
+
+                try {
+                    currentValue = recordReader.read();
+                } catch (RecordMaterializer.RecordMaterializationException e) {
+                    // this might throw, but it's fatal if it does.
+                    unmaterializableRecordCounter.incErrors(e);
+                    logger.debug("skipping a corrupt record");
+                    continue;
+                }
+
+                if (recordReader.shouldSkipCurrentRecord()) {
+                    // this record is being filtered via the filter2 package
+                    logger.debug("skipping record");
+                    continue;
+                }
+
+                if (currentValue == null) {
+                    // only happens with FilteredRecordReader at end of block
+                    current = totalCountLoadedSoFar;
+                    logger.debug("filtered record reader reached end of block");
+                    continue;
+                }
+
+                recordFound = true;
+
+                logger.debug("read value: {}", currentValue);
+            } catch (RuntimeException e) {
+                throw new ParquetDecodingException(
+                        String.format("Can not read value at %d in block %d in file %s", current, currentBlock, filePath), e);
+            }
+        }
+
+        return currentValue;
+    }
+
+    public void close() throws IOException
+    {
+        reader.close();
+    }
+
+    private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map)
+    {
+        Map<K, Set<V>> setMultiMap = new HashMap<>();
+        for (Map.Entry<K, V> entry : map.entrySet()) {
+            Set<V> set = new HashSet<>();
+            set.add(entry.getValue());
+            setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
+        }
+        return Collections.unmodifiableMap(setMultiMap);
+    }
+}
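
ParquetRowReader can also be driven directly, outside the Embulk task loop. A minimal sketch mirroring what run() does; the file path is hypothetical, and the class sits in the same package so ParquetRowReader is visible:

package org.embulk.input.parquet_hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.msgpack.value.Value;
import studio.adtech.parquet.msgpack.read.MessagePackReadSupport;

public class ReadOneParquetFile
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.parquet"); // hypothetical input file
        ParquetRowReader<Value> reader = new ParquetRowReader<>(conf, path, new MessagePackReadSupport());
        try {
            Value value;
            while ((value = reader.read()) != null) { // read() returns null at end of file
                System.out.println(value.toJson());   // each record prints as one JSON object
            }
        } finally {
            reader.close();
        }
    }
}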
data/src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java
ADDED
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.embulk.input.parquet_hadoop;
+
+/**
+ * Sets the context class loader to the plugin's class loader.
+ *
+ * {@link org.apache.hadoop.fs.FileSystem#loadFileSystems()} loads FileSystem implementations via
+ * {@link java.util.ServiceLoader}, which falls back to the system class loader when the context
+ * class loader is null. The system class loader cannot find the FileSystem implementations
+ * because the Hadoop jars are not on its classpath, so we need to set the context class loader
+ * to the plugin's class loader.
+ */
+class PluginClassLoaderScope implements AutoCloseable
+{
+    private static final ClassLoader PLUGIN_CLASS_LOADER =
+            ParquetHadoopInputPlugin.class.getClassLoader();
+
+    private final ClassLoader original;
+
+    public PluginClassLoaderScope()
+    {
+        Thread current = Thread.currentThread();
+        this.original = current.getContextClassLoader();
+        Thread.currentThread().setContextClassLoader(PLUGIN_CLASS_LOADER);
+    }
+
+    @Override
+    public void close()
+    {
+        Thread.currentThread().setContextClassLoader(original);
+    }
+}
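
The scope is meant to be used as a try-with-resources guard around any Hadoop call that may consult java.util.ServiceLoader, exactly as ParquetHadoopInputPlugin does above. A minimal sketch, with a hypothetical helper class in the same package:

package org.embulk.input.parquet_hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;
import java.net.URI;

class ClassLoaderScopeExample
{
    static FileSystem openFileSystem(URI uri, Configuration conf) throws IOException
    {
        // The constructor swaps the context class loader; close() restores the
        // previous one even if FileSystem.get() throws.
        try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
            return FileSystem.get(uri, conf);
        }
    }
}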
data/src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java
ADDED
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.embulk.input.parquet_hadoop;
+
+import com.google.common.io.Resources;
+import org.embulk.config.ConfigSource;
+import org.embulk.spi.InputPlugin;
+import org.embulk.test.TestingEmbulk;
+import org.junit.Rule;
+import org.junit.Test;
+
+import java.nio.file.Path;
+
+import static org.embulk.test.EmbulkTests.readFile;
+import static org.embulk.test.EmbulkTests.readResource;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+public class TestParquetHadoopInputPlugin
+{
+    private static final String RESOURCE_NAME_PREFIX = "test-data/";
+
+    @Rule
+    public TestingEmbulk embulk = TestingEmbulk.builder()
+            .registerPlugin(InputPlugin.class, "parquet_hadoop", ParquetHadoopInputPlugin.class)
+            .build();
+
+    @Test
+    public void testSimple() throws Exception
+    {
+        assertRecordsByResource(embulk, "simple/in.yml", "simple/data.parquet",
+                "simple/expected.csv");
+    }
+
+    @Test
+    public void testIncompatibleSchema() throws Exception
+    {
+        assertRecordsByResource(embulk, "incompatible-schema/in.yml", "incompatible-schema/data",
+                "incompatible-schema/expected.csv");
+    }
+
+    static void assertRecordsByResource(TestingEmbulk embulk,
+            String inConfigYamlResourceName,
+            String sourceResourceName, String resultCsvResourceName)
+            throws Exception
+    {
+        Path outputPath = embulk.createTempFile("csv");
+
+        // in: config
+        String inputPath = Resources.getResource(RESOURCE_NAME_PREFIX + sourceResourceName).toURI().toString();
+        ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
+                .set("path", inputPath);
+
+        TestingEmbulk.RunResult result = embulk.inputBuilder()
+                .in(inConfig)
+                .outputPath(outputPath)
+                .run();
+
+        assertThat(readFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
+    }
+}

data/src/test/resources/test-data (incompatible-schema/data/1.parquet, incompatible-schema/data/2.parquet, simple/data.parquet)
3 binary test files not shown.
data/src/test/resources/test-data/simple/expected.csv
ADDED
@@ -0,0 +1,3 @@
+"{""c_str"":""foo"",""c_int"":1,""c_double"":1.5,""c_bool"":true,""c_json"":""{\""foo\"":1}""}"
+"{""c_str"":""bar"",""c_int"":2,""c_double"":2.5,""c_bool"":false,""c_json"":""{\""bar\"":2}""}"
+"{""c_str"":""baz"",""c_int"":3,""c_double"":3.5,""c_bool"":true,""c_json"":""{\""baz\"":3}""}"
metadata
ADDED
@@ -0,0 +1,168 @@
+--- !ruby/object:Gem::Specification
+name: embulk-input-parquet_hadoop
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Koji AGAWA
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-03-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  name: bundler
+  prerelease: false
+  type: :development
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  name: rake
+  prerelease: false
+  type: :development
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '10.0'
+description: Loads records from Parquet files via Hadoop FileSystem.
+email:
+- agawa_koji@cyberagent.co.jp
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- build.gradle
+- classpath/activation-1.1.jar
+- classpath/apacheds-i18n-2.0.0-M15.jar
+- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
+- classpath/api-asn1-api-1.0.0-M20.jar
+- classpath/api-util-1.0.0-M20.jar
+- classpath/asm-3.1.jar
+- classpath/avro-1.7.4.jar
+- classpath/commons-beanutils-1.7.0.jar
+- classpath/commons-cli-1.2.jar
+- classpath/commons-codec-1.6.jar
+- classpath/commons-collections-3.2.2.jar
+- classpath/commons-compress-1.4.1.jar
+- classpath/commons-configuration-1.6.jar
+- classpath/commons-digester-1.8.jar
+- classpath/commons-httpclient-3.1.jar
+- classpath/commons-io-2.4.jar
+- classpath/commons-lang-2.6.jar
+- classpath/commons-logging-1.1.3.jar
+- classpath/commons-math3-3.1.1.jar
+- classpath/commons-net-3.1.jar
+- classpath/curator-client-2.7.1.jar
+- classpath/curator-framework-2.7.1.jar
+- classpath/curator-recipes-2.7.1.jar
+- classpath/embulk-input-parquet_hadoop-0.1.0.jar
+- classpath/gson-2.2.4.jar
+- classpath/hadoop-annotations-2.7.3.jar
+- classpath/hadoop-auth-2.7.3.jar
+- classpath/hadoop-client-2.7.3.jar
+- classpath/hadoop-common-2.7.3.jar
+- classpath/hadoop-hdfs-2.7.3.jar
+- classpath/hadoop-mapreduce-client-app-2.7.3.jar
+- classpath/hadoop-mapreduce-client-common-2.7.3.jar
+- classpath/hadoop-mapreduce-client-core-2.7.3.jar
+- classpath/hadoop-mapreduce-client-jobclient-2.7.3.jar
+- classpath/hadoop-mapreduce-client-shuffle-2.7.3.jar
+- classpath/hadoop-yarn-api-2.7.3.jar
+- classpath/hadoop-yarn-client-2.7.3.jar
+- classpath/hadoop-yarn-common-2.7.3.jar
+- classpath/hadoop-yarn-server-common-2.7.3.jar
+- classpath/hadoop-yarn-server-nodemanager-2.7.3.jar
+- classpath/htrace-core-3.1.0-incubating.jar
+- classpath/httpclient-4.2.5.jar
+- classpath/httpcore-4.2.4.jar
+- classpath/jackson-core-asl-1.9.13.jar
+- classpath/jackson-jaxrs-1.9.13.jar
+- classpath/jackson-mapper-asl-1.9.13.jar
+- classpath/jackson-xc-1.9.13.jar
+- classpath/jaxb-api-2.2.2.jar
+- classpath/jaxb-impl-2.2.3-1.jar
+- classpath/jersey-client-1.9.jar
+- classpath/jersey-core-1.9.jar
+- classpath/jersey-guice-1.9.jar
+- classpath/jersey-json-1.9.jar
+- classpath/jersey-server-1.9.jar
+- classpath/jettison-1.1.jar
+- classpath/jetty-util-6.1.26.jar
+- classpath/jline-0.9.94.jar
+- classpath/jsp-api-2.1.jar
+- classpath/jsr305-3.0.0.jar
+- classpath/jul-to-slf4j-1.7.24.jar
+- classpath/leveldbjni-all-1.8.jar
+- classpath/log4j-over-slf4j-1.7.24.jar
+- classpath/netty-3.7.0.Final.jar
+- classpath/netty-all-4.0.23.Final.jar
+- classpath/paranamer-2.3.jar
+- classpath/parquet-column-1.8.1.jar
+- classpath/parquet-common-1.8.1.jar
+- classpath/parquet-encoding-1.8.1.jar
+- classpath/parquet-format-2.3.0-incubating.jar
+- classpath/parquet-hadoop-1.8.1.jar
+- classpath/parquet-jackson-1.8.1.jar
+- classpath/parquet-msgpack-0.1.0.jar
+- classpath/protobuf-java-2.5.0.jar
+- classpath/servlet-api-2.5.jar
+- classpath/slf4j-api-1.7.24.jar
+- classpath/snappy-java-1.1.1.6.jar
+- classpath/stax-api-1.0-2.jar
+- classpath/xercesImpl-2.9.1.jar
+- classpath/xml-apis-1.3.04.jar
+- classpath/xmlenc-0.52.jar
+- classpath/xz-1.0.jar
+- classpath/zookeeper-3.4.6.jar
+- lib/embulk/input/parquet_hadoop.rb
+- src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java
+- src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java
+- src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java
+- src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java
+- src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java
+- src/test/resources/test-data/incompatible-schema/data/1.parquet
+- src/test/resources/test-data/incompatible-schema/data/2.parquet
+- src/test/resources/test-data/incompatible-schema/expected.csv
+- src/test/resources/test-data/incompatible-schema/in.yml
+- src/test/resources/test-data/simple/data.parquet
+- src/test/resources/test-data/simple/expected.csv
+- src/test/resources/test-data/simple/in.yml
+homepage: https://github.com/CyberAgent/embulk-input-parquet_hadoop
+licenses:
+- Apache 2.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: Parquet input plugin for Embulk
+test_files: []