embulk-input-hdfs 0.2.1 → 0.3.0

src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java ADDED
@@ -0,0 +1,128 @@
+ package org.embulk.input.hdfs;
+
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.io.compress.CompressionCodec;
+ import org.apache.hadoop.io.compress.CompressionCodecFactory;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.BufferedInputStream;
+ import java.io.ByteArrayInputStream;
+ import java.io.ByteArrayOutputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.SequenceInputStream;
+
+ public class TargetFileInputStreamFactory
+ {
+     private static final Logger logger = Exec.getLogger(TargetFileInputStreamFactory.class);
+     private final FileSystem fs;
+
+     public TargetFileInputStreamFactory(FileSystem fs)
+     {
+         this.fs = fs;
+     }
+
+     public InputStream create(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream is = createSuitableInputStream(t);
+         return createInputStreamWithHeaders(is, t);
+     }
+
+     private InputStream createSuitableInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         if (t.getIsDecompressible()) {
+             logger.debug("embulk-input-hdfs: createDecompressedInputStream: {}", t.getPathString());
+             return createDecompressedInputStream(t);
+         }
+         else if (t.getIsPartitionable()) {
+             logger.debug("embulk-input-hdfs: createPartialInputStream: {}, start:{}, end:{}",
+                     t.getPathString(), t.getStart(), t.getEnd());
+             return createPartialInputStream(t);
+         }
+         else {
+             logger.debug("embulk-input-hdfs: createOriginalInputStream: {}", t.getPathString());
+             return createOriginalInputStream(t);
+         }
+     }
+
+     private InputStream createInputStreamWithHeaders(InputStream original, TargetFileInfo t)
+             throws IOException
+     {
+         if (t.getStart() > 0 && t.getNumHeaderLines() > 0) {
+             logger.debug("embulk-input-hdfs: createInputStreamWithHeaders: {}", t.getPathString());
+             InputStream headers = createHeadersInputStream(t);
+             return new SequenceInputStream(headers, original);
+         }
+         else {
+             return original;
+         }
+     }
+
+     private InputStream createOriginalInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         return fs.open(new Path(t.getPathString()));
+     }
+
+     private InputStream createDecompressedInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream original = createOriginalInputStream(t);
+         CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
+         CompressionCodec codec = factory.getCodec(new Path(t.getPathString()));
+         if (codec == null) {
+             logger.debug("embulk-input-hdfs: CompressionCodec: null: {}", t.getPathString());
+             return original;
+         }
+         else {
+             logger.debug("embulk-input-hdfs: CompressionCodec: {}: {}", codec, t.getPathString());
+             return codec.createInputStream(original);
+         }
+     }
+
+     private InputStream createPartialInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream original = createOriginalInputStream(t);
+         return new TargetFilePartialInputStream(original, t.getStart(), t.getEnd());
+     }
+
+     private InputStream createHeadersInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         ByteArrayOutputStream header = new ByteArrayOutputStream();
+         int skippedHeaders = 0;
+         InputStream is = createOriginalInputStream(t);
+         try (BufferedInputStream in = new BufferedInputStream(is)) {
+             while (true) {
+                 int c = in.read();
+                 if (c < 0) {
+                     break;
+                 }
+
+                 header.write(c);
+
+                 if (c == '\n') {
+                     skippedHeaders++;
+                 }
+                 else if (c == '\r') {
+                     int c2 = in.read();
+                     if (c2 == '\n') {
+                         header.write(c2);
+                     }
+                     skippedHeaders++;
+                 }
+
+                 if (skippedHeaders >= t.getNumHeaderLines()) {
+                     break;
+                 }
+             }
+         }
+         header.close();
+         return new ByteArrayInputStream(header.toByteArray());
+     }
+ }
src/main/java/org/embulk/input/hdfs/TargetFilePartialInputStream.java CHANGED (renamed from PartialFileInputStream.java)
@@ -1,15 +1,13 @@
  package org.embulk.input.hdfs;
 
+ // Ported from https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+
  import java.io.BufferedInputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.PushbackInputStream;
 
- /**
-  * Created by takahiro.nakayama on 2/13/16.
-  * ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
-  */
- public class PartialFileInputStream
+ public class TargetFilePartialInputStream
          extends InputStream
  {
      private final PushbackInputStream original;
@@ -18,7 +16,7 @@ public class PartialFileInputStream
      private long current;
      private boolean eof;
 
-     public PartialFileInputStream(InputStream original, long start, long end)
+     public TargetFilePartialInputStream(InputStream original, long start, long end)
      {
          this.original = new PushbackInputStream(new BufferedInputStream(original));
          this.start = start;
src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java CHANGED
@@ -30,7 +30,6 @@ import javax.annotation.Nullable;
 
  import java.io.File;
  import java.util.ArrayList;
- import java.util.Iterator;
  import java.util.List;
 
  import static org.junit.Assert.assertEquals;
@@ -43,7 +42,6 @@ public class TestHdfsFileInputPlugin
      @Rule
      public ExpectedException exception = ExpectedException.none();
 
-     private Logger logger = runtime.getExec().getLogger(TestHdfsFileInputPlugin.class);
      private HdfsFileInputPlugin plugin;
      private FileInputRunner runner;
      private MockPageOutput output;
@@ -67,11 +65,11 @@ public class TestHdfsFileInputPlugin
          assertEquals(path.toString(), task.getPath());
          assertEquals(Lists.newArrayList(), task.getConfigFiles());
          assertEquals(Maps.newHashMap(), task.getConfig());
-         assertEquals(true, task.getPartition());
+         assertEquals(true, task.getWillPartition());
          assertEquals(0, task.getRewindSeconds());
          assertEquals(-1, task.getApproximateNumPartitions());
          assertEquals(0, task.getSkipHeaderLines());
-         assertEquals(false, task.getDecompression());
+         assertEquals(false, task.getWillDecompress());
      }
 
      @Test(expected = ConfigException.class)
@@ -103,9 +101,9 @@ public class TestHdfsFileInputPlugin
          });
 
          List<String> resultFList = Lists.newArrayList();
-         for (int i = 0; i < task.getPartialFileList().getTaskCount();i++) {
-             for (PartialFile partialFile : task.getPartialFileList().get(i)) {
-                 resultFList.add(partialFile.getPath().toString());
+         for (int i = 0; i < task.getTargetFileInfoList().getTaskCount();i++) {
+             for (TargetFileInfo targetFileInfo : task.getTargetFileInfoList().get(i)) {
+                 resultFList.add(targetFileInfo.getPathString());
              }
          }
          assertEquals(fileList.size(), resultFList.size());
@@ -152,7 +150,7 @@ public class TestHdfsFileInputPlugin
          config.set("path", "/tmp/%Y-%m-%d");
          config.set("rewind_seconds", 86400);
          PluginTask task = config.loadConfig(PluginTask.class);
-         String result = plugin.strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+         String result = new Strftime(task).format(task.getPath());
          String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
          assertEquals(expected, result);
      }
metadata CHANGED
@@ -1,43 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: embulk-input-hdfs
  version: !ruby/object:Gem::Version
-   version: 0.2.1
+   version: 0.3.0
  platform: ruby
  authors:
  - Civitaspo
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-25 00:00:00.000000000 Z
+ date: 2016-09-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
-   name: bundler
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ~>
-       - !ruby/object:Gem::Version
-         version: '1.0'
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - ~>
        - !ruby/object:Gem::Version
          version: '1.0'
+   name: bundler
    prerelease: false
    type: :development
- - !ruby/object:Gem::Dependency
-   name: rake
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - '>='
+     - - ~>
        - !ruby/object:Gem::Version
-         version: '10.0'
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - '>='
        - !ruby/object:Gem::Version
          version: '10.0'
+   name: rake
    prerelease: false
    type: :development
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '10.0'
  description: Reads files stored on Hdfs.
  email:
  - civitaspo@gmail.com
@@ -61,12 +61,13 @@ files:
  - gradlew
  - gradlew.bat
  - lib/embulk/input/hdfs.rb
- - src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
+ - src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java
  - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
- - src/main/java/org/embulk/input/hdfs/PartialFile.java
- - src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
- - src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
- - src/main/java/org/embulk/input/hdfs/PartialFileList.java
+ - src/main/java/org/embulk/input/hdfs/Strftime.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInfo.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInfoList.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java
+ - src/main/java/org/embulk/input/hdfs/TargetFilePartialInputStream.java
  - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
  - src/test/resources/sample_01.csv
  - src/test/resources/sample_02.csv
@@ -94,7 +95,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-input-hdfs-0.2.1.jar
+ - classpath/embulk-input-hdfs-0.3.0.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.4.jar
  - classpath/hadoop-auth-2.6.4.jar
src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java DELETED
@@ -1,82 +0,0 @@
- package org.embulk.input.hdfs;
-
- import com.google.common.collect.ImmutableList;
- import com.google.common.collect.ImmutableMap;
- import com.google.common.collect.Lists;
- import com.google.common.collect.Maps;
- import org.apache.hadoop.conf.Configuration;
- import org.embulk.config.ConfigException;
- import org.embulk.spi.Exec;
- import org.slf4j.Logger;
-
- import java.io.File;
- import java.net.MalformedURLException;
- import java.util.List;
- import java.util.Map;
-
- /**
-  * Created by takahiro.nakayama on 2/22/16.
-  */
- public class ConfigurationBuilder
- {
-     private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
-     private final ImmutableList.Builder<String> configFilesBuilder;
-     private final ImmutableMap.Builder<String, String> configMapBuilder;
-
-     public ConfigurationBuilder()
-     {
-         this.configFilesBuilder = ImmutableList.builder();
-         this.configMapBuilder = ImmutableMap.builder();
-     }
-
-     public ConfigurationBuilder addConfigFiles(List<String> configFiles)
-     {
-         for (String configFile : configFiles) {
-             addConfigFile(configFile);
-         }
-         return this;
-     }
-
-     public ConfigurationBuilder addConfigFile(String configFile)
-     {
-         configFilesBuilder.add(configFile);
-         return this;
-     }
-
-     public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
-     {
-         for (Map.Entry<String, String> entry : configMap.entrySet()) {
-             addConfig(entry.getKey(), entry.getValue());
-         }
-         return this;
-     }
-
-     public ConfigurationBuilder addConfig(String key, String value)
-     {
-         configMapBuilder.put(key, value);
-         return this;
-     }
-
-     public Configuration build()
-     {
-         Configuration configuration = new Configuration();
-         for (String configFile : configFilesBuilder.build()) {
-             File file = new File(configFile);
-             try {
-                 configuration.addResource(file.toURI().toURL());
-             }
-             catch (MalformedURLException e) {
-                 throw new ConfigException(e);
-             }
-         }
-         for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
-             configuration.set(entry.getKey(), entry.getValue());
-         }
-         // For debug
-         for (Map.Entry<String, String> entry : configuration) {
-             logger.trace("{}: {}", entry.getKey(), entry.getValue());
-         }
-         logger.trace("Resource Files: {}", configuration);
-         return configuration;
-     }
- }
src/main/java/org/embulk/input/hdfs/PartialFile.java DELETED
@@ -1,48 +0,0 @@
- package org.embulk.input.hdfs;
-
- import org.apache.hadoop.fs.Path;
-
- /**
-  * Created by takahiro.nakayama on 2/20/16.
-  * is the same as PartialFileList.Entry, so this class does not need?
-  */
- public class PartialFile
- {
-     private final Path path;
-     private final long start;
-     private final long end;
-     private final boolean canDecompress;
-
-     public PartialFile(String path, long start, long end, boolean canDecompress)
-     {
-         this(new Path(path), start, end, canDecompress);
-     }
-
-     public PartialFile(Path path, long start, long end, boolean canDecompress)
-     {
-         this.path = path;
-         this.start = start;
-         this.end = end;
-         this.canDecompress = canDecompress;
-     }
-
-     public Path getPath()
-     {
-         return path;
-     }
-
-     public long getStart()
-     {
-         return start;
-     }
-
-     public long getEnd()
-     {
-         return end;
-     }
-
-     public boolean getCanDecompress()
-     {
-         return canDecompress;
-     }
- }
src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java DELETED
@@ -1,125 +0,0 @@
- package org.embulk.input.hdfs;
-
- import com.google.common.base.Optional;
- import com.google.common.base.Throwables;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.io.compress.CodecPool;
- import org.apache.hadoop.io.compress.CompressionCodec;
- import org.apache.hadoop.io.compress.CompressionCodecFactory;
- import org.apache.hadoop.io.compress.Decompressor;
- import org.embulk.spi.Exec;
- import org.slf4j.Logger;
-
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.SequenceInputStream;
-
- /**
-  * Created by takahiro.nakayama on 2/21/16.
-  */
- public class PartialFileInputStreamBuilder
- {
-     private static final Logger logger = Exec.getLogger(PartialFileInputStreamBuilder.class);
-     private final FileSystem fs;
-     private final PartialFile partialFile;
-     private int numHeaderLines = 0;
-
-     public PartialFileInputStreamBuilder(FileSystem fs, PartialFile partialFile)
-     {
-         this.fs = fs;
-         this.partialFile = partialFile;
-     }
-
-     public InputStream build()
-             throws IOException
-     {
-         logger.trace("path: {}, start: {}, end: {}, num_header_lines: {}",
-                 partialFile.getPath(), partialFile.getStart(), partialFile.getEnd(), numHeaderLines);
-         if (partialFile.getStart() > 0 && numHeaderLines > 0) {
-             return new SequenceInputStream(createHeadersInputStream(), createPartialFileInputStream());
-         }
-         else {
-             return createPartialFileInputStream();
-         }
-     }
-
-     public PartialFileInputStreamBuilder withHeaders(int numHeaderLines)
-     {
-         this.numHeaderLines = numHeaderLines;
-         return this;
-     }
-
-     private InputStream createOriginalFileWrappedInputStream()
-     {
-         InputStream original = createOriginalFileInputStream();
-         CompressionCodec codec = new CompressionCodecFactory(fs.getConf()).getCodec(partialFile.getPath());
-         if (partialFile.getCanDecompress() && codec != null) {
-             try {
-                 return codec.createInputStream(original);
-             }
-             catch (IOException e) {
-                 throw Throwables.propagate(e);
-             }
-         }
-         else {
-             return original;
-         }
-     }
-
-     private InputStream createOriginalFileInputStream()
-     {
-         try {
-             return fs.open(partialFile.getPath());
-         }
-         catch (IOException e) {
-             throw Throwables.propagate(e);
-         }
-     }
-
-     // memo: maybe there should also be something that creates an InputStream using the CompressionCodec...
-     // otherwise the headers would get messed up too... or maybe not
-
-     private InputStream createPartialFileInputStream()
-     {
-         InputStream original = createOriginalFileWrappedInputStream();
-         return new PartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
-     }
-
-     private InputStream createHeadersInputStream()
-             throws IOException
-     {
-         ByteArrayOutputStream header = new ByteArrayOutputStream();
-         int skippedHeaders = 0;
-         InputStream original = createOriginalFileWrappedInputStream();
-         try (BufferedInputStream in = new BufferedInputStream(original)) {
-             while (true) {
-                 int c = in.read();
-                 if (c < 0) {
-                     break;
-                 }
-
-                 header.write(c);
-
-                 if (c == '\n') {
-                     skippedHeaders++;
-                 }
-                 else if (c == '\r') {
-                     int c2 = in.read();
-                     if (c2 == '\n') {
-                         header.write(c2);
-                     }
-                     skippedHeaders++;
-                 }
-
-                 if (skippedHeaders >= numHeaderLines) {
-                     break;
-                 }
-             }
-         }
-         header.close();
-         return new ByteArrayInputStream(header.toByteArray());
-     }
- }