embulk-input-parquet_hadoop 0.1.0

Files changed (98)
  1. checksums.yaml +7 -0
  2. data/build.gradle +53 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/asm-3.1.jar +0 -0
  9. data/classpath/avro-1.7.4.jar +0 -0
  10. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  11. data/classpath/commons-cli-1.2.jar +0 -0
  12. data/classpath/commons-codec-1.6.jar +0 -0
  13. data/classpath/commons-collections-3.2.2.jar +0 -0
  14. data/classpath/commons-compress-1.4.1.jar +0 -0
  15. data/classpath/commons-configuration-1.6.jar +0 -0
  16. data/classpath/commons-digester-1.8.jar +0 -0
  17. data/classpath/commons-httpclient-3.1.jar +0 -0
  18. data/classpath/commons-io-2.4.jar +0 -0
  19. data/classpath/commons-lang-2.6.jar +0 -0
  20. data/classpath/commons-logging-1.1.3.jar +0 -0
  21. data/classpath/commons-math3-3.1.1.jar +0 -0
  22. data/classpath/commons-net-3.1.jar +0 -0
  23. data/classpath/curator-client-2.7.1.jar +0 -0
  24. data/classpath/curator-framework-2.7.1.jar +0 -0
  25. data/classpath/curator-recipes-2.7.1.jar +0 -0
  26. data/classpath/embulk-input-parquet_hadoop-0.1.0.jar +0 -0
  27. data/classpath/gson-2.2.4.jar +0 -0
  28. data/classpath/hadoop-annotations-2.7.3.jar +0 -0
  29. data/classpath/hadoop-auth-2.7.3.jar +0 -0
  30. data/classpath/hadoop-client-2.7.3.jar +0 -0
  31. data/classpath/hadoop-common-2.7.3.jar +0 -0
  32. data/classpath/hadoop-hdfs-2.7.3.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-app-2.7.3.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-common-2.7.3.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-core-2.7.3.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-jobclient-2.7.3.jar +0 -0
  37. data/classpath/hadoop-mapreduce-client-shuffle-2.7.3.jar +0 -0
  38. data/classpath/hadoop-yarn-api-2.7.3.jar +0 -0
  39. data/classpath/hadoop-yarn-client-2.7.3.jar +0 -0
  40. data/classpath/hadoop-yarn-common-2.7.3.jar +0 -0
  41. data/classpath/hadoop-yarn-server-common-2.7.3.jar +0 -0
  42. data/classpath/hadoop-yarn-server-nodemanager-2.7.3.jar +0 -0
  43. data/classpath/htrace-core-3.1.0-incubating.jar +0 -0
  44. data/classpath/httpclient-4.2.5.jar +0 -0
  45. data/classpath/httpcore-4.2.4.jar +0 -0
  46. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  47. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  48. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  49. data/classpath/jackson-xc-1.9.13.jar +0 -0
  50. data/classpath/jaxb-api-2.2.2.jar +0 -0
  51. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  52. data/classpath/jersey-client-1.9.jar +0 -0
  53. data/classpath/jersey-core-1.9.jar +0 -0
  54. data/classpath/jersey-guice-1.9.jar +0 -0
  55. data/classpath/jersey-json-1.9.jar +0 -0
  56. data/classpath/jersey-server-1.9.jar +0 -0
  57. data/classpath/jettison-1.1.jar +0 -0
  58. data/classpath/jetty-util-6.1.26.jar +0 -0
  59. data/classpath/jline-0.9.94.jar +0 -0
  60. data/classpath/jsp-api-2.1.jar +0 -0
  61. data/classpath/jsr305-3.0.0.jar +0 -0
  62. data/classpath/jul-to-slf4j-1.7.24.jar +0 -0
  63. data/classpath/leveldbjni-all-1.8.jar +0 -0
  64. data/classpath/log4j-over-slf4j-1.7.24.jar +0 -0
  65. data/classpath/netty-3.7.0.Final.jar +0 -0
  66. data/classpath/netty-all-4.0.23.Final.jar +0 -0
  67. data/classpath/paranamer-2.3.jar +0 -0
  68. data/classpath/parquet-column-1.8.1.jar +0 -0
  69. data/classpath/parquet-common-1.8.1.jar +0 -0
  70. data/classpath/parquet-encoding-1.8.1.jar +0 -0
  71. data/classpath/parquet-format-2.3.0-incubating.jar +0 -0
  72. data/classpath/parquet-hadoop-1.8.1.jar +0 -0
  73. data/classpath/parquet-jackson-1.8.1.jar +0 -0
  74. data/classpath/parquet-msgpack-0.1.0.jar +0 -0
  75. data/classpath/protobuf-java-2.5.0.jar +0 -0
  76. data/classpath/servlet-api-2.5.jar +0 -0
  77. data/classpath/slf4j-api-1.7.24.jar +0 -0
  78. data/classpath/snappy-java-1.1.1.6.jar +0 -0
  79. data/classpath/stax-api-1.0-2.jar +0 -0
  80. data/classpath/xercesImpl-2.9.1.jar +0 -0
  81. data/classpath/xml-apis-1.3.04.jar +0 -0
  82. data/classpath/xmlenc-0.52.jar +0 -0
  83. data/classpath/xz-1.0.jar +0 -0
  84. data/classpath/zookeeper-3.4.6.jar +0 -0
  85. data/lib/embulk/input/parquet_hadoop.rb +18 -0
  86. data/src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java +84 -0
  87. data/src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java +257 -0
  88. data/src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java +182 -0
  89. data/src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java +44 -0
  90. data/src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java +74 -0
  91. data/src/test/resources/test-data/incompatible-schema/data/1.parquet +0 -0
  92. data/src/test/resources/test-data/incompatible-schema/data/2.parquet +0 -0
  93. data/src/test/resources/test-data/incompatible-schema/expected.csv +4 -0
  94. data/src/test/resources/test-data/incompatible-schema/in.yml +2 -0
  95. data/src/test/resources/test-data/simple/data.parquet +0 -0
  96. data/src/test/resources/test-data/simple/expected.csv +3 -0
  97. data/src/test/resources/test-data/simple/in.yml +2 -0
  98. metadata +168 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 11d3bfc5cf66805e9ce41966e90759d5acfbed8f
+   data.tar.gz: 234ecd00864d9c122f01a95ab224c18bdff3ccea
+ SHA512:
+   metadata.gz: 05e661e93e1e5c99edec29e2c83cd68d79f45e8c828afb0aeba822e44003057cf5deb1c69e14cf8eebd32755c19a06766c095e9dd0812bc3feee3f3ae4574c0a
+   data.tar.gz: 9b1119067ba7eaeb18ee4ddaac2322881b6177fb3eab92995784745b72d90e6e0c9e60d0bc552afd652f6556392b008628e9065cbd8762ab48a2275cb2a62944
data/build.gradle ADDED
@@ -0,0 +1,53 @@
+ import com.github.jrubygradle.JRubyExec
+
+ dependencies {
+     compile "org.embulk:embulk-core:0.8.16"
+     provided "org.embulk:embulk-core:0.8.16"
+
+     compile project(':parquet-msgpack')
+     // for hadoop
+     compile 'org.slf4j:log4j-over-slf4j:1.7.24'
+     // for parquet
+     compile 'org.slf4j:jul-to-slf4j:1.7.24'
+
+     testCompile "junit:junit:4.+"
+     testCompile 'org.embulk:embulk-standards:0.8.16'
+     testCompile "org.embulk:embulk-test:0.8.16"
+     testCompile 'org.assertj:assertj-core:2.6.+'
+ }
+
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
+     jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
+     scriptArgs "${project.projectDir.absolutePath}/build/gemspec"
+     doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "${project.projectDir}/pkg") }
+ }
+
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
+     jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
+     scriptArgs "pkg/${project.name}-${project.version}.gem"
+ }
+
+ task gemspec {
+     doLast {
+         file('build').mkdirs()
+         file('build/gemspec').write($/
+ Gem::Specification.new do |spec|
+   spec.name = "${project.name}"
+   spec.version = "${project.version}"
+   spec.authors = ["Koji AGAWA"]
+   spec.summary = %[Parquet input plugin for Embulk]
+   spec.description = %[Loads records from Parquet files via Hadoop FileSystem.]
+   spec.email = ["agawa_koji@cyberagent.co.jp"]
+   spec.licenses = ["Apache 2.0"]
+   spec.homepage = "https://github.com/CyberAgent/embulk-input-parquet_hadoop"
+
+   spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
+   spec.test_files = spec.files.grep(%r"^(test|spec)/")
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency 'bundler', ['~> 1.0']
+   spec.add_development_dependency 'rake', ['>= 10.0']
+ end
+ /$)
+     }
+ }
(Binary files: the bundled classpath JARs have no textual diff.)
data/lib/embulk/input/parquet_hadoop.rb ADDED
@@ -0,0 +1,18 @@
+ #
+ # Copyright 2017 CyberAgent, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ Embulk::JavaPlugin.register_input(
+   "parquet_hadoop", "org.embulk.input.parquet_hadoop.ParquetHadoopInputPlugin",
+   File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java ADDED
@@ -0,0 +1,84 @@
+ /*
+ * This class includes code from embulk-input-hdfs.
+ * (https://github.com/civitaspo/embulk-input-hdfs)
+ *
+ * The MIT License
+ * Copyright (c) 2015 Civitaspo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+ package org.embulk.input.parquet_hadoop;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.embulk.config.Config;
+ import org.embulk.config.ConfigDefault;
+ import org.embulk.config.ConfigException;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.net.MalformedURLException;
+ import java.util.List;
+ import java.util.Map;
+
+ public class ConfigurationFactory
+ {
+     private static final Logger logger = Exec.getLogger(ConfigurationFactory.class);
+
+     interface Task
+     {
+         @Config("config_files")
+         @ConfigDefault("[]")
+         List<String> getConfigFiles();
+
+         @Config("config")
+         @ConfigDefault("{}")
+         Map<String, String> getConfig();
+     }
+
+     private ConfigurationFactory()
+     {
+     }
+
+     public static Configuration create(Task task)
+     {
+         Configuration c = new Configuration();
+         for (String f : task.getConfigFiles()) {
+             try {
+                 logger.trace("embulk-input-parquet_hadoop: load a config file: {}", f);
+                 c.addResource(new File(f).toURI().toURL());
+             } catch (MalformedURLException e) {
+                 throw new ConfigException(e);
+             }
+         }
+
+         for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
+             logger.trace("embulk-input-parquet_hadoop: load a config: {}:{}", entry.getKey(), entry.getValue());
+             c.set(entry.getKey(), entry.getValue());
+         }
+
+         // For logging
+         for (Map.Entry<String, String> entry : c) {
+             logger.trace("embulk-input-parquet_hadoop: loaded: {}: {}", entry.getKey(), entry.getValue());
+         }
+         logger.trace("embulk-input-parquet_hadoop: loaded files: {}", c);
+
+         return c;
+     }
+ }
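For orientation, here is a minimal standalone sketch (not part of the gem) of what the two task options boil down to against Hadoop's Configuration API: config_files entries are added as XML resources and config entries are set on top of them. The class name, file path, and property values are hypothetical.

import org.apache.hadoop.conf.Configuration;

import java.io.File;

public class ConfigurationSketch
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();

        // config_files: each listed path is loaded as a Hadoop XML resource.
        conf.addResource(new File("/etc/hadoop/conf/core-site.xml").toURI().toURL());

        // config: each key/value pair is applied after (and overrides) the resource files.
        conf.set("fs.defaultFS", "hdfs://namenode:8020");

        System.out.println(conf.get("fs.defaultFS"));
    }
}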
data/src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java ADDED
@@ -0,0 +1,257 @@
+ /*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ package org.embulk.input.parquet_hadoop;
+
+ import com.google.common.base.Function;
+ import com.google.common.base.Throwables;
+ import com.google.common.collect.Lists;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileStatus;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.PathNotFoundException;
+ import org.apache.parquet.ParquetRuntimeException;
+ import org.apache.parquet.hadoop.util.HiddenFileFilter;
+ import org.embulk.config.Config;
+ import org.embulk.config.ConfigDefault;
+ import org.embulk.config.ConfigDiff;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.Task;
+ import org.embulk.config.TaskReport;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.Column;
+ import org.embulk.spi.DataException;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.InputPlugin;
+ import org.embulk.spi.PageBuilder;
+ import org.embulk.spi.PageOutput;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.type.Types;
+ import org.msgpack.value.Value;
+ import org.slf4j.Logger;
+ import org.slf4j.bridge.SLF4JBridgeHandler;
+ import studio.adtech.parquet.msgpack.read.MessagePackReadSupport;
+
+ import javax.annotation.Nullable;
+ import java.io.IOException;
+ import java.util.List;
+ import java.util.logging.Level;
+
+ public class ParquetHadoopInputPlugin
+         implements InputPlugin
+ {
+     private static final Logger logger = Exec.getLogger(ParquetHadoopInputPlugin.class);
+
+     public interface PluginTask
+             extends Task, ConfigurationFactory.Task
+     {
+         @Config("path")
+         String getPath();
+
+         @Config("parquet_log_level")
+         @ConfigDefault("\"INFO\"")
+         String getParquetLogLevel();
+
+         List<String> getFiles();
+         void setFiles(List<String> files);
+     }
+
+     Schema newSchema()
+     {
+         return Schema.builder().add("record", Types.JSON).build();
+     }
+
+     @Override
+     public ConfigDiff transaction(ConfigSource config,
+             InputPlugin.Control control)
+     {
+         PluginTask task = config.loadConfig(PluginTask.class);
+         configureParquetLogger(task);
+
+         Path rootPath = new Path(task.getPath());
+
+         try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+             Configuration conf = ConfigurationFactory.create(task);
+
+             FileSystem fs = FileSystem.get(rootPath.toUri(), conf);
+             List<FileStatus> statusList = listFileStatuses(fs, rootPath);
+             if (statusList.isEmpty()) {
+                 throw new PathNotFoundException(rootPath.toString());
+             }
+
+             for (FileStatus status : statusList) {
+                 logger.debug("embulk-input-parquet_hadoop: Loading paths: {}, length: {}",
+                         status.getPath(), status.getLen());
+             }
+
+             List<String> files = Lists.transform(statusList, new Function<FileStatus, String>() {
+                 @Nullable
+                 @Override
+                 public String apply(@Nullable FileStatus input) {
+                     return input.getPath().toString();
+                 }
+             });
+             task.setFiles(files);
+         } catch (IOException e) {
+             throw Throwables.propagate(e);
+         }
+
+         Schema schema = newSchema();
+         int taskCount = task.getFiles().size();
+
+         return resume(task.dump(), schema, taskCount, control);
+     }
+
+     @Override
+     public ConfigDiff resume(TaskSource taskSource,
+             Schema schema, int taskCount,
+             InputPlugin.Control control)
+     {
+         control.run(taskSource, schema, taskCount);
+         return Exec.newConfigDiff();
+     }
+
+     @Override
+     public void cleanup(TaskSource taskSource,
+             Schema schema, int taskCount,
+             List<TaskReport> successTaskReports)
+     {
+     }
+
+     @Override
+     public TaskReport run(TaskSource taskSource,
+             Schema schema, int taskIndex,
+             PageOutput output)
+     {
+         PluginTask task = taskSource.loadTask(PluginTask.class);
+         configureParquetLogger(task);
+
+         final Column jsonColumn = schema.getColumn(0);
+
+         Configuration conf;
+         Path filePath;
+         try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+             conf = ConfigurationFactory.create(task);
+             filePath = new Path(task.getFiles().get(taskIndex));
+         }
+
+         try (PageBuilder pageBuilder = newPageBuilder(schema, output)) {
+             ParquetRowReader<Value> reader;
+             try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                 reader = new ParquetRowReader<>(conf, filePath, new MessagePackReadSupport());
+             } catch (ParquetRuntimeException | IOException e) {
+                 throw new DataException(e);
+             }
+
+             Value value;
+             while (true) {
+                 try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                     value = reader.read();
+                 } catch (ParquetRuntimeException | IOException e) {
+                     throw new DataException(e);
+                 }
+                 if (value == null) {
+                     break;
+                 }
+
+                 pageBuilder.setJson(jsonColumn, value);
+                 pageBuilder.addRecord();
+             }
+
+             pageBuilder.finish();
+
+             try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
+                 reader.close();
+             } catch (ParquetRuntimeException | IOException e) {
+                 throw new DataException(e);
+             }
+         }
+
+         TaskReport report = Exec.newTaskReport();
+         return report;
+     }
+
+     @Override
+     public ConfigDiff guess(ConfigSource config)
+     {
+         return Exec.newConfigDiff();
+     }
+
+     private PageBuilder newPageBuilder(Schema schema, PageOutput output)
+     {
+         return new PageBuilder(Exec.getBufferAllocator(), schema, output);
+     }
+
+     private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException {
+         List<FileStatus> fileStatuses = Lists.newArrayList();
+
+         FileStatus[] entries = fs.globStatus(rootPath, HiddenFileFilter.INSTANCE);
+         if (entries == null) {
+             return fileStatuses;
+         }
+
+         for (FileStatus entry : entries) {
+             if (entry.isDirectory()) {
+                 List<FileStatus> subEntries = listRecursive(fs, entry);
+                 fileStatuses.addAll(subEntries);
+             } else {
+                 fileStatuses.add(entry);
+             }
+         }
+
+         return fileStatuses;
+     }
+
+     private List<FileStatus> listRecursive(FileSystem fs, FileStatus status) throws IOException
+     {
+         List<FileStatus> statusList = Lists.newArrayList();
+         if (status.isDirectory()) {
+             FileStatus[] entries = fs.listStatus(status.getPath(), HiddenFileFilter.INSTANCE);
+             for (FileStatus entry : entries) {
+                 statusList.addAll(listRecursive(fs, entry));
+             }
+         } else {
+             statusList.add(status);
+         }
+         return statusList;
+     }
+
+     private static void configureParquetLogger(PluginTask task)
+     {
+         // delegate java.util.logging to slf4j.
+         java.util.logging.Logger parquetLogger = java.util.logging.Logger.getLogger("org.apache.parquet");
+         if (parquetLogger.getHandlers().length == 0) {
+             parquetLogger.addHandler(new SLF4JBridgeHandler());
+             parquetLogger.setUseParentHandlers(false);
+         }
+
+         Level level;
+         try {
+             level = Level.parse(task.getParquetLogLevel());
+         } catch (IllegalArgumentException e) {
+             logger.warn("embulk-input-parquet_hadoop: Invalid parquet_log_level", e);
+             level = Level.WARNING;
+         }
+         // invoke static initializer that overrides log level.
+         try {
+             Class.forName("org.apache.parquet.Log");
+         } catch (ClassNotFoundException e) {
+             logger.warn("", e);
+         }
+
+         parquetLogger.setLevel(level);
+     }
+ }
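The configureParquetLogger method above routes Parquet's java.util.logging output through SLF4J. Stripped of the Embulk specifics, the same bridging pattern looks roughly like this sketch; the logger namespace and level come from the plugin, while the class name is illustrative.

import org.slf4j.bridge.SLF4JBridgeHandler;

import java.util.logging.Level;
import java.util.logging.Logger;

public class JulBridgeSketch
{
    public static void main(String[] args)
    {
        // Install the SLF4J bridge on the parquet logger namespace once and
        // detach it from the default JUL handlers.
        Logger parquetLogger = Logger.getLogger("org.apache.parquet");
        if (parquetLogger.getHandlers().length == 0) {
            parquetLogger.addHandler(new SLF4JBridgeHandler());
            parquetLogger.setUseParentHandlers(false);
        }

        // Records at or above this level are forwarded to SLF4J.
        parquetLogger.setLevel(Level.WARNING);
    }
}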
data/src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java ADDED
@@ -0,0 +1,182 @@
+ /*
+ * This class includes code from Apache Parquet MR.
+ * (org.apache.parquet.hadoop.InternalParquetRecordReader)
+ *
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ package org.embulk.input.parquet_hadoop;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.parquet.column.ColumnDescriptor;
+ import org.apache.parquet.column.page.PageReadStore;
+ import org.apache.parquet.filter2.compat.FilterCompat;
+ import org.apache.parquet.format.converter.ParquetMetadataConverter;
+ import org.apache.parquet.hadoop.ParquetFileReader;
+ import org.apache.parquet.hadoop.UnmaterializableRecordCounter;
+ import org.apache.parquet.hadoop.api.InitContext;
+ import org.apache.parquet.hadoop.api.ReadSupport;
+ import org.apache.parquet.hadoop.metadata.BlockMetaData;
+ import org.apache.parquet.hadoop.metadata.FileMetaData;
+ import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+ import org.apache.parquet.io.ColumnIOFactory;
+ import org.apache.parquet.io.MessageColumnIO;
+ import org.apache.parquet.io.ParquetDecodingException;
+ import org.apache.parquet.io.RecordReader;
+ import org.apache.parquet.io.api.RecordMaterializer;
+ import org.apache.parquet.schema.MessageType;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.IOException;
+ import java.util.Collections;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.Set;
+
+ public class ParquetRowReader<T> {
+     private static final Logger logger = Exec.getLogger(ParquetRowReader.class);
+
+     private final Path filePath;
+     private final ParquetFileReader reader;
+     private final long total;
+     private final ColumnIOFactory columnIOFactory;
+     private final RecordMaterializer<T> recordConverter;
+     private final MessageType requestedSchema;
+     private final MessageType fileSchema;
+     private final UnmaterializableRecordCounter unmaterializableRecordCounter;
+
+     private long current = 0;
+     private long totalCountLoadedSoFar = 0;
+     private int currentBlock = -1;
+     private RecordReader<T> recordReader;
+
+     // TODO: make configurable ?
+     private static final boolean strictTypeChecking = true;
+     private static final FilterCompat.Filter filter = FilterCompat.NOOP;
+
+     public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException {
+         this.filePath = filePath;
+
+         ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
+         List<BlockMetaData> blocks = parquetMetadata.getBlocks();
+
+         FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
+         this.fileSchema = fileMetadata.getSchema();
+         Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
+         ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
+                 configuration, toSetMultiMap(keyValueMetadata), fileSchema));
+         this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());
+
+         this.requestedSchema = readContext.getRequestedSchema();
+         this.recordConverter = readSupport.prepareForRead(
+                 configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);
+
+         List<ColumnDescriptor> columns = requestedSchema.getColumns();
+
+         reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);
+
+         long total = 0;
+         for (BlockMetaData block : blocks) {
+             total += block.getRowCount();
+         }
+         this.total = total;
+
+         this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
+         logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
+     }
+
+     private void checkRead() throws IOException {
+         if (current == totalCountLoadedSoFar) {
+             PageReadStore pages = reader.readNextRowGroup();
+             if (pages == null) {
+                 throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
+             }
+
+             MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
+             recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
+             totalCountLoadedSoFar += pages.getRowCount();
+             ++ currentBlock;
+         }
+     }
+
+     /**
+      * @return the next record or null if finished
+      * @throws IOException
+      * @throws ParquetDecodingException
+      */
+     public T read() throws IOException {
+         T currentValue = null;
+         boolean recordFound = false;
+         while (!recordFound) {
+             // no more records left
+             if (current >= total) {
+                 return null;
+             }
+
+             try {
+                 checkRead();
+                 current++;
+
+                 try {
+                     currentValue = recordReader.read();
+                 } catch (RecordMaterializer.RecordMaterializationException e) {
+                     // this might throw, but it's fatal if it does.
+                     unmaterializableRecordCounter.incErrors(e);
+                     logger.debug("skipping a corrupt record");
+                     continue;
+                 }
+
+                 if (recordReader.shouldSkipCurrentRecord()) {
+                     // this record is being filtered via the filter2 package
+                     logger.debug("skipping record");
+                     continue;
+                 }
+
+                 if (currentValue == null) {
+                     // only happens with FilteredRecordReader at end of block
+                     current = totalCountLoadedSoFar;
+                     logger.debug("filtered record reader reached end of block");
+                     continue;
+                 }
+
+                 recordFound = true;
+
+                 logger.debug("read value: {}", currentValue);
+             } catch (RuntimeException e) {
+                 throw new ParquetDecodingException(
+                         String.format("Can not read value at %d in block %d in file %s", current, currentBlock, filePath), e);
+             }
+         }
+
+         return currentValue;
+     }
+
+     public void close() throws IOException {
+         reader.close();
+     }
+
+     private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
+         Map<K, Set<V>> setMultiMap = new HashMap<>();
+         for (Map.Entry<K, V> entry : map.entrySet()) {
+             Set<V> set = new HashSet<>();
+             set.add(entry.getValue());
+             setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
+         }
+         return Collections.unmodifiableMap(setMultiMap);
+     }
+ }
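A minimal sketch of driving ParquetRowReader directly, following the same pattern the plugin's run() method uses: construct it with a Hadoop Configuration, a Path, and the MessagePackReadSupport from parquet-msgpack, then call read() until it returns null. Because the reader obtains its logger through Exec.getLogger, this only works inside a running Embulk session; the helper below is hypothetical and assumed to live in the plugin's package.

package org.embulk.input.parquet_hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.msgpack.value.Value;
import studio.adtech.parquet.msgpack.read.MessagePackReadSupport;

import java.io.IOException;

class ReaderSketch
{
    // Reads every row of one Parquet file and returns the row count.
    static long countRows(Configuration conf, Path path) throws IOException
    {
        ParquetRowReader<Value> reader = new ParquetRowReader<>(conf, path, new MessagePackReadSupport());
        long rows = 0;
        try {
            // read() returns one row materialized as a msgpack Value, or null once the file is exhausted.
            while (reader.read() != null) {
                rows++;
            }
        }
        finally {
            reader.close();
        }
        return rows;
    }
}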
data/src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java ADDED
@@ -0,0 +1,44 @@
+ /*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ package org.embulk.input.parquet_hadoop;
+
+ /**
+ * Sets the thread's context class loader to the plugin's class loader.
+ *
+ * {@link org.apache.hadoop.fs.FileSystem#loadFileSystems()} loads FileSystem implementations
+ * via {@link java.util.ServiceLoader}, which falls back to the system class loader when the
+ * context class loader is null. The system class loader cannot find the FileSystem
+ * implementations because the Hadoop jars are not on its classpath.
+ * The context class loader therefore has to be switched to the plugin's class loader
+ * around Hadoop calls.
+ */
+ class PluginClassLoaderScope implements AutoCloseable {
+     private static final ClassLoader PLUGIN_CLASS_LOADER =
+             ParquetHadoopInputPlugin.class.getClassLoader();
+
+     private final ClassLoader original;
+
+     public PluginClassLoaderScope() {
+         Thread current = Thread.currentThread();
+         this.original = current.getContextClassLoader();
+         Thread.currentThread().setContextClassLoader(PLUGIN_CLASS_LOADER);
+     }
+
+     @Override
+     public void close() {
+         Thread.currentThread().setContextClassLoader(original);
+     }
+ }
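The scope is used as a try-with-resources guard around every Hadoop call in the plugin. A minimal usage sketch follows; the class and method names are hypothetical, and it is placed in the same package because PluginClassLoaderScope is package-private.

package org.embulk.input.parquet_hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

class ScopeUsageSketch
{
    static void printDefaultFileSystem() throws IOException
    {
        // Hadoop's ServiceLoader lookups run against the plugin class loader
        // while the scope is open.
        try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
            FileSystem fs = FileSystem.get(new Configuration());
            System.out.println(fs.getUri());
        }
        // The previous context class loader has been restored at this point.
    }
}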
data/src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java ADDED
@@ -0,0 +1,74 @@
+ /*
+ * Copyright 2017 CyberAgent, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ package org.embulk.input.parquet_hadoop;
+
+ import com.google.common.io.Resources;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.spi.InputPlugin;
+ import org.embulk.test.TestingEmbulk;
+ import org.junit.Rule;
+ import org.junit.Test;
+
+ import java.nio.file.Path;
+
+ import static org.embulk.test.EmbulkTests.readFile;
+ import static org.embulk.test.EmbulkTests.readResource;
+ import static org.hamcrest.CoreMatchers.is;
+ import static org.junit.Assert.assertThat;
+
+ public class TestParquetHadoopInputPlugin
+ {
+     private static final String RESOURCE_NAME_PREFIX = "test-data/";
+
+     @Rule
+     public TestingEmbulk embulk = TestingEmbulk.builder()
+             .registerPlugin(InputPlugin.class, "parquet_hadoop", ParquetHadoopInputPlugin.class)
+             .build();
+
+     @Test
+     public void testSimple() throws Exception
+     {
+         assertRecordsByResource(embulk, "simple/in.yml", "simple/data.parquet",
+                 "simple/expected.csv");
+     }
+
+     @Test
+     public void testIncompatibleSchema() throws Exception
+     {
+         assertRecordsByResource(embulk, "incompatible-schema/in.yml", "incompatible-schema/data",
+                 "incompatible-schema/expected.csv");
+     }
+
+     static void assertRecordsByResource(TestingEmbulk embulk,
+             String inConfigYamlResourceName,
+             String sourceResourceName, String resultCsvResourceName)
+             throws Exception
+     {
+         Path outputPath = embulk.createTempFile("csv");
+
+         // in: config
+         String inputPath = Resources.getResource(RESOURCE_NAME_PREFIX + sourceResourceName).toURI().toString();
+         ConfigSource inConfig = embulk.loadYamlResource(RESOURCE_NAME_PREFIX + inConfigYamlResourceName)
+                 .set("path", inputPath);
+
+         TestingEmbulk.RunResult result = embulk.inputBuilder()
+                 .in(inConfig)
+                 .outputPath(outputPath)
+                 .run();
+
+         assertThat(readFile(outputPath), is(readResource(RESOURCE_NAME_PREFIX + resultCsvResourceName)));
+     }
+ }
data/src/test/resources/test-data/incompatible-schema/expected.csv ADDED
@@ -0,0 +1,4 @@
+ "{""c_int"":1}"
+ "{""c_int"":2}"
+ "{""c_str"":""hoge"",""c_bool"":true}"
+ "{""c_str"":""fuga"",""c_bool"":false}"
data/src/test/resources/test-data/incompatible-schema/in.yml ADDED
@@ -0,0 +1,2 @@
+ type: parquet_hadoop
+ parquet_log_level: WARNING
data/src/test/resources/test-data/simple/expected.csv ADDED
@@ -0,0 +1,3 @@
+ "{""c_str"":""foo"",""c_int"":1,""c_double"":1.5,""c_bool"":true,""c_json"":""{\""foo\"":1}""}"
+ "{""c_str"":""bar"",""c_int"":2,""c_double"":2.5,""c_bool"":false,""c_json"":""{\""bar\"":2}""}"
+ "{""c_str"":""baz"",""c_int"":3,""c_double"":3.5,""c_bool"":true,""c_json"":""{\""baz\"":3}""}"
data/src/test/resources/test-data/simple/in.yml ADDED
@@ -0,0 +1,2 @@
+ type: parquet_hadoop
+ parquet_log_level: WARNING
metadata ADDED
@@ -0,0 +1,168 @@
+ --- !ruby/object:Gem::Specification
+ name: embulk-input-parquet_hadoop
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Koji AGAWA
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2017-03-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   name: bundler
+   prerelease: false
+   type: :development
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   name: rake
+   prerelease: false
+   type: :development
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ description: Loads records from Parquet files via Hadoop FileSystem.
+ email:
+ - agawa_koji@cyberagent.co.jp
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - build.gradle
+ - classpath/activation-1.1.jar
+ - classpath/apacheds-i18n-2.0.0-M15.jar
+ - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
+ - classpath/api-asn1-api-1.0.0-M20.jar
+ - classpath/api-util-1.0.0-M20.jar
+ - classpath/asm-3.1.jar
+ - classpath/avro-1.7.4.jar
+ - classpath/commons-beanutils-1.7.0.jar
+ - classpath/commons-cli-1.2.jar
+ - classpath/commons-codec-1.6.jar
+ - classpath/commons-collections-3.2.2.jar
+ - classpath/commons-compress-1.4.1.jar
+ - classpath/commons-configuration-1.6.jar
+ - classpath/commons-digester-1.8.jar
+ - classpath/commons-httpclient-3.1.jar
+ - classpath/commons-io-2.4.jar
+ - classpath/commons-lang-2.6.jar
+ - classpath/commons-logging-1.1.3.jar
+ - classpath/commons-math3-3.1.1.jar
+ - classpath/commons-net-3.1.jar
+ - classpath/curator-client-2.7.1.jar
+ - classpath/curator-framework-2.7.1.jar
+ - classpath/curator-recipes-2.7.1.jar
+ - classpath/embulk-input-parquet_hadoop-0.1.0.jar
+ - classpath/gson-2.2.4.jar
+ - classpath/hadoop-annotations-2.7.3.jar
+ - classpath/hadoop-auth-2.7.3.jar
+ - classpath/hadoop-client-2.7.3.jar
+ - classpath/hadoop-common-2.7.3.jar
+ - classpath/hadoop-hdfs-2.7.3.jar
+ - classpath/hadoop-mapreduce-client-app-2.7.3.jar
+ - classpath/hadoop-mapreduce-client-common-2.7.3.jar
+ - classpath/hadoop-mapreduce-client-core-2.7.3.jar
+ - classpath/hadoop-mapreduce-client-jobclient-2.7.3.jar
+ - classpath/hadoop-mapreduce-client-shuffle-2.7.3.jar
+ - classpath/hadoop-yarn-api-2.7.3.jar
+ - classpath/hadoop-yarn-client-2.7.3.jar
+ - classpath/hadoop-yarn-common-2.7.3.jar
+ - classpath/hadoop-yarn-server-common-2.7.3.jar
+ - classpath/hadoop-yarn-server-nodemanager-2.7.3.jar
+ - classpath/htrace-core-3.1.0-incubating.jar
+ - classpath/httpclient-4.2.5.jar
+ - classpath/httpcore-4.2.4.jar
+ - classpath/jackson-core-asl-1.9.13.jar
+ - classpath/jackson-jaxrs-1.9.13.jar
+ - classpath/jackson-mapper-asl-1.9.13.jar
+ - classpath/jackson-xc-1.9.13.jar
+ - classpath/jaxb-api-2.2.2.jar
+ - classpath/jaxb-impl-2.2.3-1.jar
+ - classpath/jersey-client-1.9.jar
+ - classpath/jersey-core-1.9.jar
+ - classpath/jersey-guice-1.9.jar
+ - classpath/jersey-json-1.9.jar
+ - classpath/jersey-server-1.9.jar
+ - classpath/jettison-1.1.jar
+ - classpath/jetty-util-6.1.26.jar
+ - classpath/jline-0.9.94.jar
+ - classpath/jsp-api-2.1.jar
+ - classpath/jsr305-3.0.0.jar
+ - classpath/jul-to-slf4j-1.7.24.jar
+ - classpath/leveldbjni-all-1.8.jar
+ - classpath/log4j-over-slf4j-1.7.24.jar
+ - classpath/netty-3.7.0.Final.jar
+ - classpath/netty-all-4.0.23.Final.jar
+ - classpath/paranamer-2.3.jar
+ - classpath/parquet-column-1.8.1.jar
+ - classpath/parquet-common-1.8.1.jar
+ - classpath/parquet-encoding-1.8.1.jar
+ - classpath/parquet-format-2.3.0-incubating.jar
+ - classpath/parquet-hadoop-1.8.1.jar
+ - classpath/parquet-jackson-1.8.1.jar
+ - classpath/parquet-msgpack-0.1.0.jar
+ - classpath/protobuf-java-2.5.0.jar
+ - classpath/servlet-api-2.5.jar
+ - classpath/slf4j-api-1.7.24.jar
+ - classpath/snappy-java-1.1.1.6.jar
+ - classpath/stax-api-1.0-2.jar
+ - classpath/xercesImpl-2.9.1.jar
+ - classpath/xml-apis-1.3.04.jar
+ - classpath/xmlenc-0.52.jar
+ - classpath/xz-1.0.jar
+ - classpath/zookeeper-3.4.6.jar
+ - lib/embulk/input/parquet_hadoop.rb
+ - src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java
+ - src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java
+ - src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java
+ - src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java
+ - src/test/java/org/embulk/input/parquet_hadoop/TestParquetHadoopInputPlugin.java
+ - src/test/resources/test-data/incompatible-schema/data/1.parquet
+ - src/test/resources/test-data/incompatible-schema/data/2.parquet
+ - src/test/resources/test-data/incompatible-schema/expected.csv
+ - src/test/resources/test-data/incompatible-schema/in.yml
+ - src/test/resources/test-data/simple/data.parquet
+ - src/test/resources/test-data/simple/expected.csv
+ - src/test/resources/test-data/simple/in.yml
+ homepage: https://github.com/CyberAgent/embulk-input-parquet_hadoop
+ licenses:
+ - Apache 2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: Parquet input plugin for Embulk
+ test_files: []