embulk-input-parquet_hadoop 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/embulk-input-parquet_hadoop-0.1.1.jar +0 -0
- data/classpath/parquet-msgpack-0.1.1.jar +0 -0
- data/src/main/java/org/embulk/input/parquet_hadoop/ConfigurationFactory.java +2 -1
- data/src/main/java/org/embulk/input/parquet_hadoop/ParquetHadoopInputPlugin.java +22 -11
- data/src/main/java/org/embulk/input/parquet_hadoop/ParquetRowReader.java +16 -8
- data/src/main/java/org/embulk/input/parquet_hadoop/PluginClassLoaderScope.java +6 -3
- metadata +4 -4
- data/classpath/embulk-input-parquet_hadoop-0.1.0.jar +0 -0
- data/classpath/parquet-msgpack-0.1.0.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26ed2eaecbcd68dc340a28050283d99e8d4328d8
|
4
|
+
data.tar.gz: cb043093611c02591c8cd013bef9bde4d17ad410
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccd763d5484bbd3e34ea45c217dc3363b5229848194e5061981fba202300b76790fa2a453388ca51ffe81410e8950d8c8647855c784a49461b4537fffc9fa909
|
7
|
+
data.tar.gz: 6f12ead633dba51521b33154c5ae8b237e6372aeb00d689414e8fae89168e17cac5c08059580fca7532f012b7769f13ad804e82b55187c73cebe2377ee03a017
|
Binary file
|
Binary file
|
@@ -63,7 +63,8 @@ public class ConfigurationFactory
|
|
63
63
|
try {
|
64
64
|
logger.trace("embulk-input-parquet_hadoop: load a config file: {}", f);
|
65
65
|
c.addResource(new File(f).toURI().toURL());
|
66
|
-
}
|
66
|
+
}
|
67
|
+
catch (MalformedURLException e) {
|
67
68
|
throw new ConfigException(e);
|
68
69
|
}
|
69
70
|
}
|
@@ -18,6 +18,7 @@ package org.embulk.input.parquet_hadoop;
|
|
18
18
|
import com.google.common.base.Function;
|
19
19
|
import com.google.common.base.Throwables;
|
20
20
|
import com.google.common.collect.Lists;
|
21
|
+
import jp.co.cyberagent.parquet.msgpack.read.MessagePackReadSupport;
|
21
22
|
import org.apache.hadoop.conf.Configuration;
|
22
23
|
import org.apache.hadoop.fs.FileStatus;
|
23
24
|
import org.apache.hadoop.fs.FileSystem;
|
@@ -43,9 +44,9 @@ import org.embulk.spi.type.Types;
|
|
43
44
|
import org.msgpack.value.Value;
|
44
45
|
import org.slf4j.Logger;
|
45
46
|
import org.slf4j.bridge.SLF4JBridgeHandler;
|
46
|
-
import studio.adtech.parquet.msgpack.read.MessagePackReadSupport;
|
47
47
|
|
48
48
|
import javax.annotation.Nullable;
|
49
|
+
|
49
50
|
import java.io.IOException;
|
50
51
|
import java.util.List;
|
51
52
|
import java.util.logging.Level;
|
@@ -100,12 +101,14 @@ public class ParquetHadoopInputPlugin
|
|
100
101
|
List<String> files = Lists.transform(statusList, new Function<FileStatus, String>() {
|
101
102
|
@Nullable
|
102
103
|
@Override
|
103
|
-
public String apply(@Nullable FileStatus input)
|
104
|
+
public String apply(@Nullable FileStatus input)
|
105
|
+
{
|
104
106
|
return input.getPath().toString();
|
105
107
|
}
|
106
108
|
});
|
107
109
|
task.setFiles(files);
|
108
|
-
}
|
110
|
+
}
|
111
|
+
catch (IOException e) {
|
109
112
|
throw Throwables.propagate(e);
|
110
113
|
}
|
111
114
|
|
@@ -152,7 +155,8 @@ public class ParquetHadoopInputPlugin
|
|
152
155
|
ParquetRowReader<Value> reader;
|
153
156
|
try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
|
154
157
|
reader = new ParquetRowReader<>(conf, filePath, new MessagePackReadSupport());
|
155
|
-
}
|
158
|
+
}
|
159
|
+
catch (ParquetRuntimeException | IOException e) {
|
156
160
|
throw new DataException(e);
|
157
161
|
}
|
158
162
|
|
@@ -160,7 +164,8 @@ public class ParquetHadoopInputPlugin
|
|
160
164
|
while (true) {
|
161
165
|
try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
|
162
166
|
value = reader.read();
|
163
|
-
}
|
167
|
+
}
|
168
|
+
catch (ParquetRuntimeException | IOException e) {
|
164
169
|
throw new DataException(e);
|
165
170
|
}
|
166
171
|
if (value == null) {
|
@@ -175,7 +180,8 @@ public class ParquetHadoopInputPlugin
|
|
175
180
|
|
176
181
|
try (PluginClassLoaderScope ignored = new PluginClassLoaderScope()) {
|
177
182
|
reader.close();
|
178
|
-
}
|
183
|
+
}
|
184
|
+
catch (ParquetRuntimeException | IOException e) {
|
179
185
|
throw new DataException(e);
|
180
186
|
}
|
181
187
|
}
|
@@ -195,7 +201,8 @@ public class ParquetHadoopInputPlugin
|
|
195
201
|
return new PageBuilder(Exec.getBufferAllocator(), schema, output);
|
196
202
|
}
|
197
203
|
|
198
|
-
private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException
|
204
|
+
private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException
|
205
|
+
{
|
199
206
|
List<FileStatus> fileStatuses = Lists.newArrayList();
|
200
207
|
|
201
208
|
FileStatus[] entries = fs.globStatus(rootPath, HiddenFileFilter.INSTANCE);
|
@@ -207,7 +214,8 @@ public class ParquetHadoopInputPlugin
|
|
207
214
|
if (entry.isDirectory()) {
|
208
215
|
List<FileStatus> subEntries = listRecursive(fs, entry);
|
209
216
|
fileStatuses.addAll(subEntries);
|
210
|
-
}
|
217
|
+
}
|
218
|
+
else {
|
211
219
|
fileStatuses.add(entry);
|
212
220
|
}
|
213
221
|
}
|
@@ -223,7 +231,8 @@ public class ParquetHadoopInputPlugin
|
|
223
231
|
for (FileStatus entry : entries) {
|
224
232
|
statusList.addAll(listRecursive(fs, entry));
|
225
233
|
}
|
226
|
-
}
|
234
|
+
}
|
235
|
+
else {
|
227
236
|
statusList.add(status);
|
228
237
|
}
|
229
238
|
return statusList;
|
@@ -241,14 +250,16 @@ public class ParquetHadoopInputPlugin
|
|
241
250
|
Level level;
|
242
251
|
try {
|
243
252
|
level = Level.parse(task.getParquetLogLevel());
|
244
|
-
}
|
253
|
+
}
|
254
|
+
catch (IllegalArgumentException e) {
|
245
255
|
logger.warn("embulk-input-parquet_hadoop: Invalid parquet_log_level", e);
|
246
256
|
level = Level.WARNING;
|
247
257
|
}
|
248
258
|
// invoke static initializer that overrides log level.
|
249
259
|
try {
|
250
260
|
Class.forName("org.apache.parquet.Log");
|
251
|
-
}
|
261
|
+
}
|
262
|
+
catch (ClassNotFoundException e) {
|
252
263
|
logger.warn("", e);
|
253
264
|
}
|
254
265
|
|
@@ -48,7 +48,8 @@ import java.util.List;
|
|
48
48
|
import java.util.Map;
|
49
49
|
import java.util.Set;
|
50
50
|
|
51
|
-
public class ParquetRowReader<T>
|
51
|
+
public class ParquetRowReader<T>
|
52
|
+
{
|
52
53
|
private static final Logger logger = Exec.getLogger(ParquetRowReader.class);
|
53
54
|
|
54
55
|
private final Path filePath;
|
@@ -69,7 +70,8 @@ public class ParquetRowReader<T> {
|
|
69
70
|
private static final boolean strictTypeChecking = true;
|
70
71
|
private static final FilterCompat.Filter filter = FilterCompat.NOOP;
|
71
72
|
|
72
|
-
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
|
73
|
+
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
|
74
|
+
{
|
73
75
|
this.filePath = filePath;
|
74
76
|
|
75
77
|
ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
|
@@ -100,7 +102,8 @@ public class ParquetRowReader<T> {
|
|
100
102
|
logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
|
101
103
|
}
|
102
104
|
|
103
|
-
private void checkRead() throws IOException
|
105
|
+
private void checkRead() throws IOException
|
106
|
+
{
|
104
107
|
if (current == totalCountLoadedSoFar) {
|
105
108
|
PageReadStore pages = reader.readNextRowGroup();
|
106
109
|
if (pages == null) {
|
@@ -119,7 +122,8 @@ public class ParquetRowReader<T> {
|
|
119
122
|
* @throws IOException
|
120
123
|
* @throws ParquetDecodingException
|
121
124
|
*/
|
122
|
-
public T read() throws IOException
|
125
|
+
public T read() throws IOException
|
126
|
+
{
|
123
127
|
T currentValue = null;
|
124
128
|
boolean recordFound = false;
|
125
129
|
while (!recordFound) {
|
@@ -134,7 +138,8 @@ public class ParquetRowReader<T> {
|
|
134
138
|
|
135
139
|
try {
|
136
140
|
currentValue = recordReader.read();
|
137
|
-
}
|
141
|
+
}
|
142
|
+
catch (RecordMaterializer.RecordMaterializationException e) {
|
138
143
|
// this might throw, but it's fatal if it does.
|
139
144
|
unmaterializableRecordCounter.incErrors(e);
|
140
145
|
logger.debug("skipping a corrupt record");
|
@@ -157,7 +162,8 @@ public class ParquetRowReader<T> {
|
|
157
162
|
recordFound = true;
|
158
163
|
|
159
164
|
logger.debug("read value: {}", currentValue);
|
160
|
-
}
|
165
|
+
}
|
166
|
+
catch (RuntimeException e) {
|
161
167
|
throw new ParquetDecodingException(
|
162
168
|
String.format("Can not read value at %d in block %d in file %s", current, currentBlock, filePath), e);
|
163
169
|
}
|
@@ -166,11 +172,13 @@ public class ParquetRowReader<T> {
|
|
166
172
|
return currentValue;
|
167
173
|
}
|
168
174
|
|
169
|
-
public void close() throws IOException
|
175
|
+
public void close() throws IOException
|
176
|
+
{
|
170
177
|
reader.close();
|
171
178
|
}
|
172
179
|
|
173
|
-
private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map)
|
180
|
+
private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map)
|
181
|
+
{
|
174
182
|
Map<K, Set<V>> setMultiMap = new HashMap<>();
|
175
183
|
for (Map.Entry<K, V> entry : map.entrySet()) {
|
176
184
|
Set<V> set = new HashSet<>();
|
@@ -25,20 +25,23 @@ package org.embulk.input.parquet_hadoop;
|
|
25
25
|
* hadoop jars is not in classpath of system class loader.
|
26
26
|
* So we need to set context class loader to plugins' class loader.
|
27
27
|
*/
|
28
|
-
class PluginClassLoaderScope implements AutoCloseable
|
28
|
+
class PluginClassLoaderScope implements AutoCloseable
|
29
|
+
{
|
29
30
|
private static final ClassLoader PLUGIN_CLASS_LOADER =
|
30
31
|
ParquetHadoopInputPlugin.class.getClassLoader();
|
31
32
|
|
32
33
|
private final ClassLoader original;
|
33
34
|
|
34
|
-
public PluginClassLoaderScope()
|
35
|
+
public PluginClassLoaderScope()
|
36
|
+
{
|
35
37
|
Thread current = Thread.currentThread();
|
36
38
|
this.original = current.getContextClassLoader();
|
37
39
|
Thread.currentThread().setContextClassLoader(PLUGIN_CLASS_LOADER);
|
38
40
|
}
|
39
41
|
|
40
42
|
@Override
|
41
|
-
public void close()
|
43
|
+
public void close()
|
44
|
+
{
|
42
45
|
Thread.currentThread().setContextClassLoader(original);
|
43
46
|
}
|
44
47
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-parquet_hadoop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Koji AGAWA
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -69,7 +69,7 @@ files:
|
|
69
69
|
- classpath/curator-client-2.7.1.jar
|
70
70
|
- classpath/curator-framework-2.7.1.jar
|
71
71
|
- classpath/curator-recipes-2.7.1.jar
|
72
|
-
- classpath/embulk-input-parquet_hadoop-0.1.0.jar
|
72
|
+
- classpath/embulk-input-parquet_hadoop-0.1.1.jar
|
73
73
|
- classpath/gson-2.2.4.jar
|
74
74
|
- classpath/hadoop-annotations-2.7.3.jar
|
75
75
|
- classpath/hadoop-auth-2.7.3.jar
|
@@ -117,7 +117,7 @@ files:
|
|
117
117
|
- classpath/parquet-format-2.3.0-incubating.jar
|
118
118
|
- classpath/parquet-hadoop-1.8.1.jar
|
119
119
|
- classpath/parquet-jackson-1.8.1.jar
|
120
|
-
- classpath/parquet-msgpack-0.1.0.jar
|
120
|
+
- classpath/parquet-msgpack-0.1.1.jar
|
121
121
|
- classpath/protobuf-java-2.5.0.jar
|
122
122
|
- classpath/servlet-api-2.5.jar
|
123
123
|
- classpath/slf4j-api-1.7.24.jar
|
Binary file
|
Binary file
|