embulk-input-hdfs 0.2.1 → 0.3.0

This diff shows the changes between publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only.
src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java ADDED
@@ -0,0 +1,128 @@
+ package org.embulk.input.hdfs;
+
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.io.compress.CompressionCodec;
+ import org.apache.hadoop.io.compress.CompressionCodecFactory;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.BufferedInputStream;
+ import java.io.ByteArrayInputStream;
+ import java.io.ByteArrayOutputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.SequenceInputStream;
+
+ public class TargetFileInputStreamFactory
+ {
+     private static final Logger logger = Exec.getLogger(TargetFileInputStreamFactory.class);
+     private final FileSystem fs;
+
+     public TargetFileInputStreamFactory(FileSystem fs)
+     {
+         this.fs = fs;
+     }
+
+     public InputStream create(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream is = createSuitableInputStream(t);
+         return createInputStreamWithHeaders(is, t);
+     }
+
+     private InputStream createSuitableInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         if (t.getIsDecompressible()) {
+             logger.debug("embulk-input-hdfs: createDecompressedInputStream: {}", t.getPathString());
+             return createDecompressedInputStream(t);
+         }
+         else if (t.getIsPartitionable()) {
+             logger.debug("embulk-input-hdfs: createPartialInputStream: {}, start:{}, end:{}",
+                     t.getPathString(), t.getStart(), t.getEnd());
+             return createPartialInputStream(t);
+         }
+         else {
+             logger.debug("embulk-input-hdfs: createOriginalInputStream: {}", t.getPathString());
+             return createOriginalInputStream(t);
+         }
+     }
+
+     private InputStream createInputStreamWithHeaders(InputStream original, TargetFileInfo t)
+             throws IOException
+     {
+         if (t.getStart() > 0 && t.getNumHeaderLines() > 0) {
+             logger.debug("embulk-input-hdfs: createInputStreamWithHeaders: {}", t.getPathString());
+             InputStream headers = createHeadersInputStream(t);
+             return new SequenceInputStream(headers, original);
+         }
+         else {
+             return original;
+         }
+     }
+
+     private InputStream createOriginalInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         return fs.open(new Path(t.getPathString()));
+     }
+
+     private InputStream createDecompressedInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream original = createOriginalInputStream(t);
+         CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
+         CompressionCodec codec = factory.getCodec(new Path(t.getPathString()));
+         if (codec == null) {
+             logger.debug("embulk-input-hdfs: CompressionCodec: null: {}", t.getPathString());
+             return original;
+         }
+         else {
+             logger.debug("embulk-input-hdfs: CompressionCodec: {}: {}", codec, t.getPathString());
+             return codec.createInputStream(original);
+         }
+     }
+
+     private InputStream createPartialInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         InputStream original = createOriginalInputStream(t);
+         return new TargetFilePartialInputStream(original, t.getStart(), t.getEnd());
+     }
+
+     private InputStream createHeadersInputStream(TargetFileInfo t)
+             throws IOException
+     {
+         ByteArrayOutputStream header = new ByteArrayOutputStream();
+         int skippedHeaders = 0;
+         InputStream is = createOriginalInputStream(t);
+         try (BufferedInputStream in = new BufferedInputStream(is)) {
+             while (true) {
+                 int c = in.read();
+                 if (c < 0) {
+                     break;
+                 }
+
+                 header.write(c);
+
+                 if (c == '\n') {
+                     skippedHeaders++;
+                 }
+                 else if (c == '\r') {
+                     int c2 = in.read();
+                     if (c2 == '\n') {
+                         header.write(c2);
+                     }
+                     skippedHeaders++;
+                 }
+
+                 if (skippedHeaders >= t.getNumHeaderLines()) {
+                     break;
+                 }
+             }
+         }
+         header.close();
+         return new ByteArrayInputStream(header.toByteArray());
+     }
+ }
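Note: TargetFileInputStreamFactory, added above, replaces the removed PartialFileInputStreamBuilder (see the deletion at the end of this diff). create() first picks a decompressed, partial, or raw stream for the file, then, for a partition that starts mid-file, re-prepends the file's first num_header_lines so downstream parsers still see the header. A minimal usage sketch, assuming a FileSystem handle and a TargetFileInfo obtained from the plugin's task (neither setup is shown in this diff):

package org.embulk.input.hdfs;

// Sketch only: drives TargetFileInputStreamFactory roughly the way
// HdfsFileInputPlugin presumably does when opening one task's file.
// The FileSystem handle and the origin of `info` are assumptions.
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;
import java.io.InputStream;

class TargetFileReadSketch
{
    static long countBytes(FileSystem fs, TargetFileInfo info)
            throws IOException
    {
        TargetFileInputStreamFactory factory = new TargetFileInputStreamFactory(fs);
        // create() decompresses, slices, or re-prepends headers as the
        // TargetFileInfo flags dictate.
        try (InputStream in = factory.create(info)) {
            byte[] buf = new byte[8192];
            long total = 0;
            int n;
            while ((n = in.read(buf)) > 0) {
                total += n;
            }
            return total;
        }
    }
}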
src/main/java/org/embulk/input/hdfs/TargetFilePartialInputStream.java CHANGED (renamed from PartialFileInputStream.java)
@@ -1,15 +1,13 @@
  package org.embulk.input.hdfs;
 
+ // Ported from https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+
  import java.io.BufferedInputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.PushbackInputStream;
 
- /**
-  * Created by takahiro.nakayama on 2/13/16.
-  * ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
-  */
- public class PartialFileInputStream
+ public class TargetFilePartialInputStream
          extends InputStream
  {
      private final PushbackInputStream original;
@@ -18,7 +16,7 @@ public class PartialFileInputStream
      private long current;
      private boolean eof;
 
-     public PartialFileInputStream(InputStream original, long start, long end)
+     public TargetFilePartialInputStream(InputStream original, long start, long end)
      {
          this.original = new PushbackInputStream(new BufferedInputStream(original));
          this.start = start;
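Note: TargetFilePartialInputStream is the renamed PartialFileInputStream ported from embulk-input-filesplit; it emits only the bytes of one partition of a file, breaking at line boundaries so each record lands in exactly one partition. A sketch of exercising it against an in-memory stream; the boundary behavior (skip to the line after `start`, read through the line crossing `end`) follows the ported class and is an assumption here, since its full body is not part of this diff:

package org.embulk.input.hdfs;

// Sketch only: feeds an in-memory "file" through TargetFilePartialInputStream.
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

class PartialReadSketch
{
    public static void main(String[] args) throws IOException
    {
        byte[] data = "id,name\n1,foo\n2,bar\n3,baz\n".getBytes(StandardCharsets.UTF_8);
        // Read the second half of the file as one partition; it starts
        // mid-record, so output should begin at the next line boundary.
        try (InputStream in = new TargetFilePartialInputStream(
                new ByteArrayInputStream(data), data.length / 2, data.length)) {
            int c;
            while ((c = in.read()) >= 0) {
                System.out.print((char) c);
            }
        }
    }
}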
src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java CHANGED
@@ -30,7 +30,6 @@ import javax.annotation.Nullable;
 
  import java.io.File;
  import java.util.ArrayList;
- import java.util.Iterator;
  import java.util.List;
 
  import static org.junit.Assert.assertEquals;
@@ -43,7 +42,6 @@ public class TestHdfsFileInputPlugin
      @Rule
      public ExpectedException exception = ExpectedException.none();
 
-     private Logger logger = runtime.getExec().getLogger(TestHdfsFileInputPlugin.class);
      private HdfsFileInputPlugin plugin;
      private FileInputRunner runner;
      private MockPageOutput output;
@@ -67,11 +65,11 @@ public class TestHdfsFileInputPlugin
          assertEquals(path.toString(), task.getPath());
          assertEquals(Lists.newArrayList(), task.getConfigFiles());
          assertEquals(Maps.newHashMap(), task.getConfig());
-         assertEquals(true, task.getPartition());
+         assertEquals(true, task.getWillPartition());
          assertEquals(0, task.getRewindSeconds());
          assertEquals(-1, task.getApproximateNumPartitions());
          assertEquals(0, task.getSkipHeaderLines());
-         assertEquals(false, task.getDecompression());
+         assertEquals(false, task.getWillDecompress());
      }
 
      @Test(expected = ConfigException.class)
@@ -103,9 +101,9 @@ public class TestHdfsFileInputPlugin
          });
 
          List<String> resultFList = Lists.newArrayList();
-         for (int i = 0; i < task.getPartialFileList().getTaskCount();i++) {
-             for (PartialFile partialFile : task.getPartialFileList().get(i)) {
-                 resultFList.add(partialFile.getPath().toString());
+         for (int i = 0; i < task.getTargetFileInfoList().getTaskCount();i++) {
+             for (TargetFileInfo targetFileInfo : task.getTargetFileInfoList().get(i)) {
+                 resultFList.add(targetFileInfo.getPathString());
              }
          }
          assertEquals(fileList.size(), resultFList.size());
@@ -152,7 +150,7 @@ public class TestHdfsFileInputPlugin
          config.set("path", "/tmp/%Y-%m-%d");
          config.set("rewind_seconds", 86400);
          PluginTask task = config.loadConfig(PluginTask.class);
-         String result = plugin.strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+         String result = new Strftime(task).format(task.getPath());
          String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
          assertEquals(expected, result);
      }
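Note: path resolution with rewind_seconds moved from plugin.strftime() into the new Strftime class, but the contract this test pins down is unchanged: the strftime-style path is formatted against the current time shifted back by rewind_seconds. A sketch of the same expectation in plain java.time, as an illustration only (the plugin itself formats via JRuby, and Strftime's internals are not shown in this diff):

// Sketch only: %Y-%m-%d from the test maps to the pattern yyyy-MM-dd here.
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;

class RewindSketch
{
    public static void main(String[] args)
    {
        long rewindSeconds = 86400; // config: rewind_seconds
        String resolved = ZonedDateTime.now()
                .minusSeconds(rewindSeconds)
                .format(DateTimeFormatter.ofPattern("'/tmp/'yyyy-MM-dd"));
        System.out.println(resolved); // yesterday's path, e.g. /tmp/2016-09-20
    }
}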
metadata CHANGED
@@ -1,43 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: embulk-input-hdfs
  version: !ruby/object:Gem::Version
-   version: 0.2.1
+   version: 0.3.0
  platform: ruby
  authors:
  - Civitaspo
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-25 00:00:00.000000000 Z
+ date: 2016-09-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
-   name: bundler
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ~>
-       - !ruby/object:Gem::Version
-         version: '1.0'
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - ~>
        - !ruby/object:Gem::Version
          version: '1.0'
+   name: bundler
    prerelease: false
    type: :development
- - !ruby/object:Gem::Dependency
-   name: rake
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - '>='
+     - - ~>
        - !ruby/object:Gem::Version
-         version: '10.0'
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - '>='
        - !ruby/object:Gem::Version
          version: '10.0'
+   name: rake
    prerelease: false
    type: :development
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '10.0'
  description: Reads files stored on Hdfs.
  email:
  - civitaspo@gmail.com
@@ -61,12 +61,13 @@ files:
  - gradlew
  - gradlew.bat
  - lib/embulk/input/hdfs.rb
- - src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
+ - src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java
  - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
- - src/main/java/org/embulk/input/hdfs/PartialFile.java
- - src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
- - src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
- - src/main/java/org/embulk/input/hdfs/PartialFileList.java
+ - src/main/java/org/embulk/input/hdfs/Strftime.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInfo.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInfoList.java
+ - src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java
+ - src/main/java/org/embulk/input/hdfs/TargetFilePartialInputStream.java
  - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
  - src/test/resources/sample_01.csv
  - src/test/resources/sample_02.csv
@@ -94,7 +95,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-input-hdfs-0.2.1.jar
+ - classpath/embulk-input-hdfs-0.3.0.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.4.jar
  - classpath/hadoop-auth-2.6.4.jar
src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java DELETED
@@ -1,82 +0,0 @@
- package org.embulk.input.hdfs;
-
- import com.google.common.collect.ImmutableList;
- import com.google.common.collect.ImmutableMap;
- import com.google.common.collect.Lists;
- import com.google.common.collect.Maps;
- import org.apache.hadoop.conf.Configuration;
- import org.embulk.config.ConfigException;
- import org.embulk.spi.Exec;
- import org.slf4j.Logger;
-
- import java.io.File;
- import java.net.MalformedURLException;
- import java.util.List;
- import java.util.Map;
-
- /**
-  * Created by takahiro.nakayama on 2/22/16.
-  */
- public class ConfigurationBuilder
- {
-     private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
-     private final ImmutableList.Builder<String> configFilesBuilder;
-     private final ImmutableMap.Builder<String, String> configMapBuilder;
-
-     public ConfigurationBuilder()
-     {
-         this.configFilesBuilder = ImmutableList.builder();
-         this.configMapBuilder = ImmutableMap.builder();
-     }
-
-     public ConfigurationBuilder addConfigFiles(List<String> configFiles)
-     {
-         for (String configFile : configFiles) {
-             addConfigFile(configFile);
-         }
-         return this;
-     }
-
-     public ConfigurationBuilder addConfigFile(String configFile)
-     {
-         configFilesBuilder.add(configFile);
-         return this;
-     }
-
-     public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
-     {
-         for (Map.Entry<String, String> entry : configMap.entrySet()) {
-             addConfig(entry.getKey(), entry.getValue());
-         }
-         return this;
-     }
-
-     public ConfigurationBuilder addConfig(String key, String value)
-     {
-         configMapBuilder.put(key, value);
-         return this;
-     }
-
-     public Configuration build()
-     {
-         Configuration configuration = new Configuration();
-         for (String configFile : configFilesBuilder.build()) {
-             File file = new File(configFile);
-             try {
-                 configuration.addResource(file.toURI().toURL());
-             }
-             catch (MalformedURLException e) {
-                 throw new ConfigException(e);
-             }
-         }
-         for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
-             configuration.set(entry.getKey(), entry.getValue());
-         }
-         // For debug
-         for (Map.Entry<String, String> entry : configuration) {
-             logger.trace("{}: {}", entry.getKey(), entry.getValue());
-         }
-         logger.trace("Resource Files: {}", configuration);
-         return configuration;
-     }
- }
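Note: the removed ConfigurationBuilder assembled a Hadoop Configuration from config_files plus inline key/value overrides; its replacement, ConfigurationFactory, is listed in the files section but its body is not shown in this diff. For reference, a sketch of how the removed builder was presumably driven (paths and values are illustrative):

package org.embulk.input.hdfs;

// Sketch only: the actual call site (HdfsFileInputPlugin) is not part of this diff.
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;

class ConfigurationBuilderSketch
{
    static Configuration build()
    {
        return new ConfigurationBuilder()
                .addConfigFiles(Lists.newArrayList(
                        "/etc/hadoop/conf/core-site.xml",
                        "/etc/hadoop/conf/hdfs-site.xml"))
                .addConfigMap(ImmutableMap.of("fs.defaultFS", "hdfs://namenode:8020"))
                .build();
    }
}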
src/main/java/org/embulk/input/hdfs/PartialFile.java DELETED
@@ -1,48 +0,0 @@
- package org.embulk.input.hdfs;
-
- import org.apache.hadoop.fs.Path;
-
- /**
-  * Created by takahiro.nakayama on 2/20/16.
-  * is the same as PartialFileList.Entry, so this class does not need?
-  */
- public class PartialFile
- {
-     private final Path path;
-     private final long start;
-     private final long end;
-     private final boolean canDecompress;
-
-     public PartialFile(String path, long start, long end, boolean canDecompress)
-     {
-         this(new Path(path), start, end, canDecompress);
-     }
-
-     public PartialFile(Path path, long start, long end, boolean canDecompress)
-     {
-         this.path = path;
-         this.start = start;
-         this.end = end;
-         this.canDecompress = canDecompress;
-     }
-
-     public Path getPath()
-     {
-         return path;
-     }
-
-     public long getStart()
-     {
-         return start;
-     }
-
-     public long getEnd()
-     {
-         return end;
-     }
-
-     public boolean getCanDecompress()
-     {
-         return canDecompress;
-     }
- }
src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java DELETED
@@ -1,125 +0,0 @@
- package org.embulk.input.hdfs;
-
- import com.google.common.base.Optional;
- import com.google.common.base.Throwables;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.io.compress.CodecPool;
- import org.apache.hadoop.io.compress.CompressionCodec;
- import org.apache.hadoop.io.compress.CompressionCodecFactory;
- import org.apache.hadoop.io.compress.Decompressor;
- import org.embulk.spi.Exec;
- import org.slf4j.Logger;
-
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.SequenceInputStream;
-
- /**
-  * Created by takahiro.nakayama on 2/21/16.
-  */
- public class PartialFileInputStreamBuilder
- {
-     private static final Logger logger = Exec.getLogger(PartialFileInputStreamBuilder.class);
-     private final FileSystem fs;
-     private final PartialFile partialFile;
-     private int numHeaderLines = 0;
-
-     public PartialFileInputStreamBuilder(FileSystem fs, PartialFile partialFile)
-     {
-         this.fs = fs;
-         this.partialFile = partialFile;
-     }
-
-     public InputStream build()
-             throws IOException
-     {
-         logger.trace("path: {}, start: {}, end: {}, num_header_lines: {}",
-                 partialFile.getPath(), partialFile.getStart(), partialFile.getEnd(), numHeaderLines);
-         if (partialFile.getStart() > 0 && numHeaderLines > 0) {
-             return new SequenceInputStream(createHeadersInputStream(), createPartialFileInputStream());
-         }
-         else {
-             return createPartialFileInputStream();
-         }
-     }
-
-     public PartialFileInputStreamBuilder withHeaders(int numHeaderLines)
-     {
-         this.numHeaderLines = numHeaderLines;
-         return this;
-     }
-
-     private InputStream createOriginalFileWrappedInputStream()
-     {
-         InputStream original = createOriginalFileInputStream();
-         CompressionCodec codec = new CompressionCodecFactory(fs.getConf()).getCodec(partialFile.getPath());
-         if (partialFile.getCanDecompress() && codec != null) {
-             try {
-                 return codec.createInputStream(original);
-             }
-             catch (IOException e) {
-                 throw Throwables.propagate(e);
-             }
-         }
-         else {
-             return original;
-         }
-     }
-
-     private InputStream createOriginalFileInputStream()
-     {
-         try {
-             return fs.open(partialFile.getPath());
-         }
-         catch (IOException e) {
-             throw Throwables.propagate(e);
-         }
-     }
-
-     // memo: it might be good to have a dedicated InputStream that uses the CompressionCodec...
-     // otherwise the headers could get corrupted too... or maybe not
-
-     private InputStream createPartialFileInputStream()
-     {
-         InputStream original = createOriginalFileWrappedInputStream();
-         return new PartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
-     }
-
-     private InputStream createHeadersInputStream()
-             throws IOException
-     {
-         ByteArrayOutputStream header = new ByteArrayOutputStream();
-         int skippedHeaders = 0;
-         InputStream original = createOriginalFileWrappedInputStream();
-         try (BufferedInputStream in = new BufferedInputStream(original)) {
-             while (true) {
-                 int c = in.read();
-                 if (c < 0) {
-                     break;
-                 }
-
-                 header.write(c);
-
-                 if (c == '\n') {
-                     skippedHeaders++;
-                 }
-                 else if (c == '\r') {
-                     int c2 = in.read();
-                     if (c2 == '\n') {
-                         header.write(c2);
-                     }
-                     skippedHeaders++;
-                 }
-
-                 if (skippedHeaders >= numHeaderLines) {
-                     break;
-                 }
-             }
-         }
-         header.close();
-         return new ByteArrayInputStream(header.toByteArray());
-     }
- }