embulk-input-hdfs 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +20 -17
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java +60 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +107 -200
- data/src/main/java/org/embulk/input/hdfs/Strftime.java +34 -0
- data/src/main/java/org/embulk/input/hdfs/TargetFileInfo.java +174 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileList.java → TargetFileInfoList.java} +73 -90
- data/src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java +128 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileInputStream.java → TargetFilePartialInputStream.java} +4 -6
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +6 -8
- metadata +19 -18
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +0 -82
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +0 -48
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +0 -125
data/src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java
ADDED

@@ -0,0 +1,128 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+
+public class TargetFileInputStreamFactory
+{
+    private static final Logger logger = Exec.getLogger(TargetFileInputStreamFactory.class);
+    private final FileSystem fs;
+
+    public TargetFileInputStreamFactory(FileSystem fs)
+    {
+        this.fs = fs;
+    }
+
+    public InputStream create(TargetFileInfo t)
+            throws IOException
+    {
+        InputStream is = createSuitableInputStream(t);
+        return createInputStreamWithHeaders(is, t);
+    }
+
+    private InputStream createSuitableInputStream(TargetFileInfo t)
+            throws IOException
+    {
+        if (t.getIsDecompressible()) {
+            logger.debug("embulk-input-hdfs: createDecompressedInputStream: {}", t.getPathString());
+            return createDecompressedInputStream(t);
+        }
+        else if (t.getIsPartitionable()) {
+            logger.debug("embulk-input-hdfs: createPartialInputStream: {}, start:{}, end:{}",
+                    t.getPathString(), t.getStart(), t.getEnd());
+            return createPartialInputStream(t);
+        }
+        else {
+            logger.debug("embulk-input-hdfs: createOriginalInputStream: {}", t.getPathString());
+            return createOriginalInputStream(t);
+        }
+    }
+
+    private InputStream createInputStreamWithHeaders(InputStream original, TargetFileInfo t)
+            throws IOException
+    {
+        if (t.getStart() > 0 && t.getNumHeaderLines() > 0) {
+            logger.debug("embulk-input-hdfs: createInputStreamWithHeaders: {}", t.getPathString());
+            InputStream headers = createHeadersInputStream(t);
+            return new SequenceInputStream(headers, original);
+        }
+        else {
+            return original;
+        }
+    }
+
+    private InputStream createOriginalInputStream(TargetFileInfo t)
+            throws IOException
+    {
+        return fs.open(new Path(t.getPathString()));
+    }
+
+    private InputStream createDecompressedInputStream(TargetFileInfo t)
+            throws IOException
+    {
+        InputStream original = createOriginalInputStream(t);
+        CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
+        CompressionCodec codec = factory.getCodec(new Path(t.getPathString()));
+        if (codec == null) {
+            logger.debug("embulk-input-hdfs: CompressionCodec: null: {}", t.getPathString());
+            return original;
+        }
+        else {
+            logger.debug("embulk-input-hdfs: CompressionCodec: {}: {}", codec, t.getPathString());
+            return codec.createInputStream(original);
+        }
+    }
+
+    private InputStream createPartialInputStream(TargetFileInfo t)
+            throws IOException
+    {
+        InputStream original = createOriginalInputStream(t);
+        return new TargetFilePartialInputStream(original, t.getStart(), t.getEnd());
+    }
+
+    private InputStream createHeadersInputStream(TargetFileInfo t)
+            throws IOException
+    {
+        ByteArrayOutputStream header = new ByteArrayOutputStream();
+        int skippedHeaders = 0;
+        InputStream is = createOriginalInputStream(t);
+        try (BufferedInputStream in = new BufferedInputStream(is)) {
+            while (true) {
+                int c = in.read();
+                if (c < 0) {
+                    break;
+                }
+
+                header.write(c);
+
+                if (c == '\n') {
+                    skippedHeaders++;
+                }
+                else if (c == '\r') {
+                    int c2 = in.read();
+                    if (c2 == '\n') {
+                        header.write(c2);
+                    }
+                    skippedHeaders++;
+                }
+
+                if (skippedHeaders >= t.getNumHeaderLines()) {
+                    break;
+                }
+            }
+        }
+        header.close();
+        return new ByteArrayInputStream(header.toByteArray());
+    }
+}
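The interesting part of this new factory is the header handling: when a task's byte range starts mid-file (start > 0) and skip_header_lines is set, the factory re-reads the header lines from the top of the file and stitches them in front of the partial stream with a SequenceInputStream. Below is a self-contained sketch of that technique using only JDK streams; the class name, byte offsets, and sample data are illustrative, not the plugin's API.

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.StandardCharsets;

public class HeaderPrependSketch
{
    // Buffer the first numHeaderLines lines of a stream, handling \n, \r and \r\n endings.
    static InputStream headersOf(InputStream file, int numHeaderLines) throws IOException
    {
        ByteArrayOutputStream header = new ByteArrayOutputStream();
        int seen = 0;
        try (BufferedInputStream in = new BufferedInputStream(file)) {
            int c;
            while (seen < numHeaderLines && (c = in.read()) >= 0) {
                header.write(c);
                if (c == '\n') {
                    seen++;
                }
                else if (c == '\r') { // lone \r: peek one byte for a following \n, as the factory's loop does
                    int c2 = in.read();
                    if (c2 == '\n') {
                        header.write(c2);
                    }
                    seen++;
                }
            }
        }
        return new ByteArrayInputStream(header.toByteArray());
    }

    public static void main(String[] args) throws IOException
    {
        byte[] file = "id,name\n1,foo\n2,bar\n".getBytes(StandardCharsets.UTF_8);
        // Pretend this task was assigned only the last record, bytes [14, 20).
        InputStream partial = new ByteArrayInputStream(file, 14, 6);
        InputStream stitched = new SequenceInputStream(headersOf(new ByteArrayInputStream(file), 1), partial);
        System.out.print(new String(stitched.readAllBytes(), StandardCharsets.UTF_8)); // prints "id,name\n2,bar\n"
    }
}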
data/src/main/java/org/embulk/input/hdfs/{PartialFileInputStream.java → TargetFilePartialInputStream.java}
RENAMED

@@ -1,15 +1,13 @@
 package org.embulk.input.hdfs;
 
+// Ported from https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 
-/**
- * Created by takahiro.nakayama on 2/13/16.
- * ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
- */
-public class PartialFileInputStream
+public class TargetFilePartialInputStream
     extends InputStream
 {
     private final PushbackInputStream original;

@@ -18,7 +16,7 @@ public class PartialFileInputStream
     private long current;
     private boolean eof;
 
-    public PartialFileInputStream(InputStream original, long start, long end)
+    public TargetFilePartialInputStream(InputStream original, long start, long end)
    {
         this.original = new PushbackInputStream(new BufferedInputStream(original));
         this.start = start;
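The rename keeps the ported logic intact: each task serves roughly the byte range [start, end) of one file, with both edges adjusted to newline boundaries so no record is split across tasks. Stripped of the boundary handling, the core is a position-bounded stream wrapper; here is a minimal illustrative version (not the ported class itself):

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

// Illustrative only: reads just the byte range [start, end) of an underlying stream.
// The real TargetFilePartialInputStream additionally snaps both edges to line boundaries.
public class ByteRangeInputStream extends InputStream
{
    private final InputStream in;
    private final long end;
    private long pos;

    public ByteRangeInputStream(InputStream in, long start, long end) throws IOException
    {
        this.in = new BufferedInputStream(in);
        this.end = end;
        this.pos = start;
        long toSkip = start;
        while (toSkip > 0) {
            long skipped = this.in.skip(toSkip);
            if (skipped <= 0) {
                break; // underlying stream exhausted before start
            }
            toSkip -= skipped;
        }
    }

    @Override
    public int read() throws IOException // single-byte read keeps the sketch short
    {
        if (pos >= end) {
            return -1;
        }
        int c = in.read();
        if (c >= 0) {
            pos++;
        }
        return c;
    }

    @Override
    public void close() throws IOException
    {
        in.close();
    }
}

Per the embulk-input-filesplit source it was ported from, the real class also skips ahead to the first newline after start and keeps reading past end until the current line completes, which is how adjacent tasks' slices join up cleanly.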
data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
CHANGED

@@ -30,7 +30,6 @@ import javax.annotation.Nullable;
 
 import java.io.File;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;

@@ -43,7 +42,6 @@ public class TestHdfsFileInputPlugin
     @Rule
     public ExpectedException exception = ExpectedException.none();
 
-    private Logger logger = runtime.getExec().getLogger(TestHdfsFileInputPlugin.class);
     private HdfsFileInputPlugin plugin;
     private FileInputRunner runner;
     private MockPageOutput output;

@@ -67,11 +65,11 @@ public class TestHdfsFileInputPlugin
         assertEquals(path.toString(), task.getPath());
         assertEquals(Lists.newArrayList(), task.getConfigFiles());
         assertEquals(Maps.newHashMap(), task.getConfig());
-        assertEquals(true, task.
+        assertEquals(true, task.getWillPartition());
         assertEquals(0, task.getRewindSeconds());
         assertEquals(-1, task.getApproximateNumPartitions());
         assertEquals(0, task.getSkipHeaderLines());
-        assertEquals(false, task.
+        assertEquals(false, task.getWillDecompress());
     }
 
     @Test(expected = ConfigException.class)

@@ -103,9 +101,9 @@ public class TestHdfsFileInputPlugin
         });
 
         List<String> resultFList = Lists.newArrayList();
-        for (int i = 0; i < task.
-        for (
-            resultFList.add(
+        for (int i = 0; i < task.getTargetFileInfoList().getTaskCount();i++) {
+            for (TargetFileInfo targetFileInfo : task.getTargetFileInfoList().get(i)) {
+                resultFList.add(targetFileInfo.getPathString());
             }
         }
         assertEquals(fileList.size(), resultFList.size());

@@ -152,7 +150,7 @@ public class TestHdfsFileInputPlugin
         config.set("path", "/tmp/%Y-%m-%d");
         config.set("rewind_seconds", 86400);
         PluginTask task = config.loadConfig(PluginTask.class);
-        String result =
+        String result = new Strftime(task).format(task.getPath());
         String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
         assertEquals(expected, result);
     }
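The last hunk exercises the new Strftime helper: the path option is treated as an strftime pattern and evaluated against now minus rewind_seconds, and the test checks the result against JRuby's Time#strftime. The real helper delegates to JRuby, so the plain-JDK rendering below, covering only the tokens used in this test, is illustrative rather than the plugin's implementation:

import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;

public class StrftimeSketch
{
    // Resolve a handful of strftime tokens the way the plugin's path option uses them.
    static String format(String pattern, long rewindSeconds)
    {
        ZonedDateTime t = ZonedDateTime.now(ZoneId.systemDefault()).minusSeconds(rewindSeconds);
        return pattern
                .replace("%Y", t.format(DateTimeFormatter.ofPattern("yyyy")))
                .replace("%m", t.format(DateTimeFormatter.ofPattern("MM")))
                .replace("%d", t.format(DateTimeFormatter.ofPattern("dd")))
                .replace("%H", t.format(DateTimeFormatter.ofPattern("HH")));
    }

    public static void main(String[] args)
    {
        // e.g. prints /tmp/2016-09-20 when run on 2016-09-21 with a one-day rewind
        System.out.println(format("/tmp/%Y-%m-%d", 86400));
    }
}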
metadata
CHANGED

@@ -1,43 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.3.0
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-09-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: bundler
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '1.0'
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
         version: '1.0'
+  name: bundler
   prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: rake
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: '1.0'
+- !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - '>='
       - !ruby/object:Gem::Version
         version: '10.0'
+  name: rake
   prerelease: false
   type: :development
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '10.0'
 description: Reads files stored on Hdfs.
 email:
 - civitaspo@gmail.com

@@ -61,12 +61,13 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
-- src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
+- src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java
 - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
-- src/main/java/org/embulk/input/hdfs/PartialFile.java
-- src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
-- src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
-- src/main/java/org/embulk/input/hdfs/PartialFileList.java
+- src/main/java/org/embulk/input/hdfs/Strftime.java
+- src/main/java/org/embulk/input/hdfs/TargetFileInfo.java
+- src/main/java/org/embulk/input/hdfs/TargetFileInfoList.java
+- src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java
+- src/main/java/org/embulk/input/hdfs/TargetFilePartialInputStream.java
 - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - src/test/resources/sample_01.csv
 - src/test/resources/sample_02.csv

@@ -94,7 +95,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.2.1.jar
+- classpath/embulk-input-hdfs-0.3.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.4.jar
 - classpath/hadoop-auth-2.6.4.jar
data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
DELETED

@@ -1,82 +0,0 @@
-package org.embulk.input.hdfs;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import org.apache.hadoop.conf.Configuration;
-import org.embulk.config.ConfigException;
-import org.embulk.spi.Exec;
-import org.slf4j.Logger;
-
-import java.io.File;
-import java.net.MalformedURLException;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Created by takahiro.nakayama on 2/22/16.
- */
-public class ConfigurationBuilder
-{
-    private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
-    private final ImmutableList.Builder<String> configFilesBuilder;
-    private final ImmutableMap.Builder<String, String> configMapBuilder;
-
-    public ConfigurationBuilder()
-    {
-        this.configFilesBuilder = ImmutableList.builder();
-        this.configMapBuilder = ImmutableMap.builder();
-    }
-
-    public ConfigurationBuilder addConfigFiles(List<String> configFiles)
-    {
-        for (String configFile : configFiles) {
-            addConfigFile(configFile);
-        }
-        return this;
-    }
-
-    public ConfigurationBuilder addConfigFile(String configFile)
-    {
-        configFilesBuilder.add(configFile);
-        return this;
-    }
-
-    public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
-    {
-        for (Map.Entry<String, String> entry : configMap.entrySet()) {
-            addConfig(entry.getKey(), entry.getValue());
-        }
-        return this;
-    }
-
-    public ConfigurationBuilder addConfig(String key, String value)
-    {
-        configMapBuilder.put(key, value);
-        return this;
-    }
-
-    public Configuration build()
-    {
-        Configuration configuration = new Configuration();
-        for (String configFile : configFilesBuilder.build()) {
-            File file = new File(configFile);
-            try {
-                configuration.addResource(file.toURI().toURL());
-            }
-            catch (MalformedURLException e) {
-                throw new ConfigException(e);
-            }
-        }
-        for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-        // For debug
-        for (Map.Entry<String, String> entry : configuration) {
-            logger.trace("{}: {}", entry.getKey(), entry.getValue());
-        }
-        logger.trace("Resource Files: {}", configuration);
-        return configuration;
-    }
-}
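ConfigurationBuilder is gone; its job of folding the config_files list and the config map into one Hadoop Configuration presumably moves to the new ConfigurationFactory added in this release (not shown in this diff). A minimal sketch of the same technique against the stock Hadoop API, assuming hadoop-common on the classpath:

import org.apache.hadoop.conf.Configuration;

import java.io.File;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Map;

public class ConfigurationSketch
{
    // Build a Hadoop Configuration from XML resource files plus key/value overrides,
    // mirroring what ConfigurationBuilder.build() did above.
    static Configuration build(List<String> configFiles, Map<String, String> overrides)
            throws MalformedURLException
    {
        Configuration conf = new Configuration();
        for (String path : configFiles) {
            conf.addResource(new File(path).toURI().toURL()); // e.g. core-site.xml, hdfs-site.xml
        }
        for (Map.Entry<String, String> e : overrides.entrySet()) {
            conf.set(e.getKey(), e.getValue()); // explicit options win over file values
        }
        return conf;
    }
}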
data/src/main/java/org/embulk/input/hdfs/PartialFile.java
DELETED

@@ -1,48 +0,0 @@
-package org.embulk.input.hdfs;
-
-import org.apache.hadoop.fs.Path;
-
-/**
- * Created by takahiro.nakayama on 2/20/16.
- * is the same as PartialFileList.Entry, so this class does not need?
- */
-public class PartialFile
-{
-    private final Path path;
-    private final long start;
-    private final long end;
-    private final boolean canDecompress;
-
-    public PartialFile(String path, long start, long end, boolean canDecompress)
-    {
-        this(new Path(path), start, end, canDecompress);
-    }
-
-    public PartialFile(Path path, long start, long end, boolean canDecompress)
-    {
-        this.path = path;
-        this.start = start;
-        this.end = end;
-        this.canDecompress = canDecompress;
-    }
-
-    public Path getPath()
-    {
-        return path;
-    }
-
-    public long getStart()
-    {
-        return start;
-    }
-
-    public long getEnd()
-    {
-        return end;
-    }
-
-    public boolean getCanDecompress()
-    {
-        return canDecompress;
-    }
-}
data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
DELETED

@@ -1,125 +0,0 @@
-package org.embulk.input.hdfs;
-
-import com.google.common.base.Optional;
-import com.google.common.base.Throwables;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.compress.CodecPool;
-import org.apache.hadoop.io.compress.CompressionCodec;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
-import org.apache.hadoop.io.compress.Decompressor;
-import org.embulk.spi.Exec;
-import org.slf4j.Logger;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.SequenceInputStream;
-
-/**
- * Created by takahiro.nakayama on 2/21/16.
- */
-public class PartialFileInputStreamBuilder
-{
-    private static final Logger logger = Exec.getLogger(PartialFileInputStreamBuilder.class);
-    private final FileSystem fs;
-    private final PartialFile partialFile;
-    private int numHeaderLines = 0;
-
-    public PartialFileInputStreamBuilder(FileSystem fs, PartialFile partialFile)
-    {
-        this.fs = fs;
-        this.partialFile = partialFile;
-    }
-
-    public InputStream build()
-            throws IOException
-    {
-        logger.trace("path: {}, start: {}, end: {}, num_header_lines: {}",
-                partialFile.getPath(), partialFile.getStart(), partialFile.getEnd(), numHeaderLines);
-        if (partialFile.getStart() > 0 && numHeaderLines > 0) {
-            return new SequenceInputStream(createHeadersInputStream(), createPartialFileInputStream());
-        }
-        else {
-            return createPartialFileInputStream();
-        }
-    }
-
-    public PartialFileInputStreamBuilder withHeaders(int numHeaderLines)
-    {
-        this.numHeaderLines = numHeaderLines;
-        return this;
-    }
-
-    private InputStream createOriginalFileWrappedInputStream()
-    {
-        InputStream original = createOriginalFileInputStream();
-        CompressionCodec codec = new CompressionCodecFactory(fs.getConf()).getCodec(partialFile.getPath());
-        if (partialFile.getCanDecompress() && codec != null) {
-            try {
-                return codec.createInputStream(original);
-            }
-            catch (IOException e) {
-                throw Throwables.propagate(e);
-            }
-        }
-        else {
-            return original;
-        }
-    }
-
-    private InputStream createOriginalFileInputStream()
-    {
-        try {
-            return fs.open(partialFile.getPath());
-        }
-        catch (IOException e) {
-            throw Throwables.propagate(e);
-        }
-    }
-
-    // memo: maybe there should also be one that creates an InputStream using a CompressionCodec...
-    // otherwise the headers would get messed up too... or would they?
-
-    private InputStream createPartialFileInputStream()
-    {
-        InputStream original = createOriginalFileWrappedInputStream();
-        return new PartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
-    }
-
-    private InputStream createHeadersInputStream()
-            throws IOException
-    {
-        ByteArrayOutputStream header = new ByteArrayOutputStream();
-        int skippedHeaders = 0;
-        InputStream original = createOriginalFileWrappedInputStream();
-        try (BufferedInputStream in = new BufferedInputStream(original)) {
-            while (true) {
-                int c = in.read();
-                if (c < 0) {
-                    break;
-                }
-
-                header.write(c);
-
-                if (c == '\n') {
-                    skippedHeaders++;
-                }
-                else if (c == '\r') {
-                    int c2 = in.read();
-                    if (c2 == '\n') {
-                        header.write(c2);
-                    }
-                    skippedHeaders++;
-                }
-
-                if (skippedHeaders >= numHeaderLines) {
-                    break;
-                }
-            }
-        }
-        header.close();
-        return new ByteArrayInputStream(header.toByteArray());
-    }
-}
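Both this removed builder and the TargetFileInputStreamFactory that replaces it detect compression from the file extension via Hadoop's CompressionCodecFactory, whose getCodec returns null for an unrecognized extension; that null is why both versions fall back to the raw stream. A tiny lookup sketch (hadoop-common assumed; the path is made up):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupSketch
{
    public static void main(String[] args)
    {
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        // Resolved by extension: .gz -> GzipCodec, .bz2 -> BZip2Codec, unknown -> null.
        CompressionCodec codec = factory.getCodec(new Path("/logs/access.log.gz"));
        System.out.println(codec == null ? "not compressed" : codec.getClass().getSimpleName());
    }
}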