embulk-input-hdfs 0.1.9 → 0.2.1
- checksums.yaml +4 -4
- data/CHENGELOG.md +7 -0
- data/README.md +18 -15
- data/build.gradle +1 -1
- data/example/config.yml +4 -1
- data/example/data2.csv.gz +0 -0
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +82 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +248 -212
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +48 -0
- data/src/main/java/org/embulk/input/hdfs/{HdfsPartialFileInputStream.java → PartialFileInputStream.java} +9 -4
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +125 -0
- data/src/main/java/org/embulk/input/hdfs/PartialFileList.java +360 -0
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +38 -14
- data/src/test/resources/sample_03.csv.gz +0 -0
- metadata +26 -21
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +0 -40
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +0 -39
data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java CHANGED

@@ -30,6 +30,7 @@ import javax.annotation.Nullable;
 
 import java.io.File;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
@@ -69,6 +70,8 @@ public class TestHdfsFileInputPlugin
         assertEquals(true, task.getPartition());
         assertEquals(0, task.getRewindSeconds());
         assertEquals(-1, task.getApproximateNumPartitions());
+        assertEquals(0, task.getSkipHeaderLines());
+        assertEquals(false, task.getDecompression());
     }
 
     @Test(expected = ConfigException.class)
@@ -99,17 +102,14 @@ public class TestHdfsFileInputPlugin
                 }
             });
 
-            List<String> resultFList = Lists.
-            {
-
-
-                public String apply(@Nullable HdfsPartialFile input)
-                {
-                    assert input != null;
-                    return input.getPath();
+            List<String> resultFList = Lists.newArrayList();
+            for (int i = 0; i < task.getPartialFileList().getTaskCount();i++) {
+                for (PartialFile partialFile : task.getPartialFileList().get(i)) {
+                    resultFList.add(partialFile.getPath().toString());
                 }
-            }
-            assertEquals(fileList, resultFList);
+            }
+            assertEquals(fileList.size(), resultFList.size());
+            assert fileList.containsAll(resultFList);
             return emptyTaskReports(taskCount);
         }
     });
@@ -120,8 +120,9 @@ public class TestHdfsFileInputPlugin
     {
         ConfigSource config = getConfigWithDefaultValues();
         config.set("num_partitions", 10);
+        config.set("decompression", true);
         runner.transaction(config, new Control());
-        assertRecords(config, output);
+        assertRecords(config, output, 12);
     }
 
     @Test
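The hunks above and below exercise the new `decompression` option against the gzipped fixtures added in this release (example/data2.csv.gz, src/test/resources/sample_03.csv.gz). As a rough sketch of what such an option entails — assuming, as the test name `testHdfsFileInputByOpenWithoutCompressionCodec` below suggests, that a Hadoop `CompressionCodec` is resolved from the file extension — the following hypothetical helper shows one way to do it; it is not the plugin's actual code:

```java
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Hypothetical helper: open an HDFS file, decompressing when a codec matches.
public class DecompressionSketch
{
    public static InputStream open(FileSystem fs, Path path, boolean decompression)
            throws IOException
    {
        InputStream in = fs.open(path);
        if (decompression) {
            // Picks a codec by file extension, e.g. sample_03.csv.gz -> GzipCodec
            CompressionCodec codec =
                    new CompressionCodecFactory(new Configuration()).getCodec(path);
            if (codec != null) {
                return codec.createInputStream(in);
            }
        }
        return in; // no codec matched (e.g. plain sample_01.csv): read as-is
    }
}
```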
@@ -129,8 +130,31 @@ public class TestHdfsFileInputPlugin
     {
         ConfigSource config = getConfigWithDefaultValues();
         config.set("partition", false);
+        config.set("decompression", true);
+        runner.transaction(config, new Control());
+        assertRecords(config, output, 12);
+    }
+
+    @Test
+    public void testHdfsFileInputByOpenWithoutCompressionCodec()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("partition", false);
+        config.set("path", getClass().getResource("/sample_01.csv").getPath());
         runner.transaction(config, new Control());
-        assertRecords(config, output);
+        assertRecords(config, output, 4);
+    }
+
+    @Test
+    public void testStrftime()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("path", "/tmp/%Y-%m-%d");
+        config.set("rewind_seconds", 86400);
+        PluginTask task = config.loadConfig(PluginTask.class);
+        String result = plugin.strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+        String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
+        assertEquals(expected, result);
     }
 
     private class Control
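The new `testStrftime` pins down how `%`-style date placeholders in `path` are resolved against `Time.now - rewind_seconds` through embedded JRuby. A minimal standalone sketch of the same scriptlet trick (the `StrftimeDemo` class is hypothetical; the scriptlet itself mirrors the one in the test):

```java
import org.jruby.embed.ScriptingContainer;

// Hypothetical demo class; not part of the plugin.
public class StrftimeDemo
{
    public static void main(String[] args)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        long rewindSeconds = 86400; // resolve the path as of one day ago
        String resolved = jruby
                .runScriptlet("(Time.now - " + rewindSeconds + ").strftime('/tmp/%Y-%m-%d')")
                .toString();
        System.out.println(resolved); // e.g. /tmp/2016-02-24
    }
}
```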
@@ -201,10 +225,10 @@ public class TestHdfsFileInputPlugin
         return builder.build();
     }
 
-    private void assertRecords(ConfigSource config, MockPageOutput output)
+    private void assertRecords(ConfigSource config, MockPageOutput output, long size)
     {
         List<Object[]> records = getRecords(config, output);
-        assertEquals(
+        assertEquals(size, records.size());
         {
             Object[] record = records.get(0);
             assertEquals(1L, record[0]);
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.9
+  version: 0.2.1
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -47,6 +47,7 @@ extra_rdoc_files: []
 files:
 - .gitignore
 - .travis.yml
+- CHENGELOG.md
 - LICENSE.txt
 - README.md
 - build.gradle
@@ -54,18 +55,22 @@ files:
 - config/checkstyle/default.xml
 - example/config.yml
 - example/data.csv
+- example/data2.csv.gz
 - gradle/wrapper/gradle-wrapper.jar
 - gradle/wrapper/gradle-wrapper.properties
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
+- src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
 - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
-- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFile.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
+- src/main/java/org/embulk/input/hdfs/PartialFileList.java
 - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - src/test/resources/sample_01.csv
 - src/test/resources/sample_02.csv
+- src/test/resources/sample_03.csv.gz
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -89,23 +94,23 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.9.jar
+- classpath/embulk-input-hdfs-0.2.1.jar
 - classpath/gson-2.2.4.jar
-- classpath/hadoop-annotations-2.6.
-- classpath/hadoop-auth-2.6.
-- classpath/hadoop-client-2.6.
-- classpath/hadoop-common-2.6.
-- classpath/hadoop-hdfs-2.6.
-- classpath/hadoop-mapreduce-client-app-2.6.
-- classpath/hadoop-mapreduce-client-common-2.6.
-- classpath/hadoop-mapreduce-client-core-2.6.
-- classpath/hadoop-mapreduce-client-jobclient-2.6.
-- classpath/hadoop-mapreduce-client-shuffle-2.6.
-- classpath/hadoop-yarn-api-2.6.
-- classpath/hadoop-yarn-client-2.6.
-- classpath/hadoop-yarn-common-2.6.
-- classpath/hadoop-yarn-server-common-2.6.
-- classpath/hadoop-yarn-server-nodemanager-2.6.
+- classpath/hadoop-annotations-2.6.4.jar
+- classpath/hadoop-auth-2.6.4.jar
+- classpath/hadoop-client-2.6.4.jar
+- classpath/hadoop-common-2.6.4.jar
+- classpath/hadoop-hdfs-2.6.4.jar
+- classpath/hadoop-mapreduce-client-app-2.6.4.jar
+- classpath/hadoop-mapreduce-client-common-2.6.4.jar
+- classpath/hadoop-mapreduce-client-core-2.6.4.jar
+- classpath/hadoop-mapreduce-client-jobclient-2.6.4.jar
+- classpath/hadoop-mapreduce-client-shuffle-2.6.4.jar
+- classpath/hadoop-yarn-api-2.6.4.jar
+- classpath/hadoop-yarn-client-2.6.4.jar
+- classpath/hadoop-yarn-common-2.6.4.jar
+- classpath/hadoop-yarn-server-common-2.6.4.jar
+- classpath/hadoop-yarn-server-nodemanager-2.6.4.jar
 - classpath/htrace-core-3.0.4.jar
 - classpath/httpclient-4.2.5.jar
 - classpath/httpcore-4.2.4.jar
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java DELETED

@@ -1,40 +0,0 @@
-package org.embulk.input.hdfs;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-public class HdfsFilePartitioner
-{
-    private FileSystem fs;
-    private Path path;
-    private long numPartitions;
-
-    public HdfsFilePartitioner(FileSystem fs, Path path, long numPartitions)
-    {
-        this.fs = fs;
-        this.path = path;
-        this.numPartitions = numPartitions;
-    }
-
-    public List<HdfsPartialFile> getHdfsPartialFiles()
-            throws IOException
-    {
-        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
-        long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < numPartitions; i++) {
-            long start = size * i / numPartitions;
-            long end = size * (i + 1) / numPartitions;
-            if (start < end) {
-                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
-            }
-        }
-        return hdfsPartialFiles;
-    }
-}
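For reference, the arithmetic this deleted class implemented splits a file of `size` bytes into `numPartitions` near-equal, contiguous byte ranges, dropping empty ranges when there are more partitions than bytes. A self-contained sketch of just that calculation (the `PartitionDemo` class is hypothetical; the formulas are copied from the deleted code):

```java
// Hypothetical demo of the byte-range split HdfsFilePartitioner performed.
public class PartitionDemo
{
    public static void main(String[] args)
    {
        long size = 100;        // file length in bytes
        long numPartitions = 3; // requested split count
        for (int i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) { // skip empty ranges
                System.out.println("partition " + i + ": [" + start + ", " + end + ")");
            }
        }
        // prints: [0, 33), [33, 66), [66, 100)
        // integer division keeps the ranges contiguous and covering the whole file
    }
}
```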
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java DELETED

@@ -1,39 +0,0 @@
-package org.embulk.input.hdfs;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
-public class HdfsPartialFile
-{
-    private String path;
-    private long start;
-    private long end;
-
-    public HdfsPartialFile(String path, long start, long end)
-    {
-        this.path = path;
-        this.start = start;
-        this.end = end;
-    }
-
-    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
-    public HdfsPartialFile()
-    {
-    }
-
-    public String getPath()
-    {
-        return path;
-    }
-
-    public long getStart()
-    {
-        return start;
-    }
-
-    public long getEnd()
-    {
-        return end;
-    }
-}