embulk-input-hdfs 0.1.9 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHENGELOG.md +7 -0
- data/README.md +18 -15
- data/build.gradle +1 -1
- data/example/config.yml +4 -1
- data/example/data2.csv.gz +0 -0
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +82 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +248 -212
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +48 -0
- data/src/main/java/org/embulk/input/hdfs/{HdfsPartialFileInputStream.java → PartialFileInputStream.java} +9 -4
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +125 -0
- data/src/main/java/org/embulk/input/hdfs/PartialFileList.java +360 -0
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +38 -14
- data/src/test/resources/sample_03.csv.gz +0 -0
- metadata +26 -21
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +0 -40
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +0 -39
data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java CHANGED
@@ -30,6 +30,7 @@ import javax.annotation.Nullable;
 
 import java.io.File;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
@@ -69,6 +70,8 @@ public class TestHdfsFileInputPlugin
         assertEquals(true, task.getPartition());
         assertEquals(0, task.getRewindSeconds());
         assertEquals(-1, task.getApproximateNumPartitions());
+        assertEquals(0, task.getSkipHeaderLines());
+        assertEquals(false, task.getDecompression());
     }
 
     @Test(expected = ConfigException.class)
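The two defaults asserted above belong to options introduced in 0.2.x: skip_header_lines and decompression. When decompression is enabled, a Hadoop-based input typically resolves a codec from the file extension through CompressionCodecFactory; the sketch below shows that standard pattern. The helper name and wiring are assumptions for illustration, not this plugin's actual code.

```java
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

class DecompressionSketch
{
    // Hypothetical helper: wraps the raw HDFS stream with a decompressor
    // when the path's extension (e.g. .gz) maps to a registered codec,
    // and falls back to the raw stream otherwise.
    static InputStream openMaybeDecompressed(FileSystem fs, Path path, Configuration conf)
            throws IOException
    {
        InputStream raw = fs.open(path);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        return codec == null ? raw : codec.createInputStream(raw);
    }
}
```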
@@ -99,17 +102,14 @@ public class TestHdfsFileInputPlugin
                 }
             });
 
-            List<String> resultFList = Lists.
-            {
-                @Nullable
-                @Override
-                public String apply(@Nullable HdfsPartialFile input)
-                {
-                    assert input != null;
-                    return input.getPath();
+            List<String> resultFList = Lists.newArrayList();
+            for (int i = 0; i < task.getPartialFileList().getTaskCount();i++) {
+                for (PartialFile partialFile : task.getPartialFileList().get(i)) {
+                    resultFList.add(partialFile.getPath().toString());
                 }
-            }
-            assertEquals(fileList, resultFList);
+            }
+            assertEquals(fileList.size(), resultFList.size());
+            assert fileList.containsAll(resultFList);
             return emptyTaskReports(taskCount);
         }
     });
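The rewritten assertion block reflects an API change: 0.1.x handed the test a flat list of HdfsPartialFile objects, while 0.2.x groups partial files per Embulk task behind the new PartialFileList, hence the nested loop. A toy model of the shape the test exercises, with internals assumed and String standing in for PartialFile:

```java
import java.util.ArrayList;
import java.util.List;

// Toy model of the PartialFileList shape implied by the test: partial
// files grouped per Embulk task. Only the accessors the test exercises
// (getTaskCount, get) are modeled; String stands in for PartialFile.
class PartialFileListSketch
{
    private final List<List<String>> tasks = new ArrayList<>();

    void addTask(List<String> partialFilePaths)
    {
        tasks.add(partialFilePaths);
    }

    int getTaskCount()
    {
        return tasks.size();
    }

    List<String> get(int taskIndex)
    {
        return tasks.get(taskIndex);
    }
}
```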
@@ -120,8 +120,9 @@ public class TestHdfsFileInputPlugin
     {
         ConfigSource config = getConfigWithDefaultValues();
         config.set("num_partitions", 10);
+        config.set("decompression", true);
         runner.transaction(config, new Control());
-        assertRecords(config, output);
+        assertRecords(config, output, 12);
     }
 
     @Test
@@ -129,8 +130,31 @@ public class TestHdfsFileInputPlugin
     {
         ConfigSource config = getConfigWithDefaultValues();
         config.set("partition", false);
+        config.set("decompression", true);
+        runner.transaction(config, new Control());
+        assertRecords(config, output, 12);
+    }
+
+    @Test
+    public void testHdfsFileInputByOpenWithoutCompressionCodec()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("partition", false);
+        config.set("path", getClass().getResource("/sample_01.csv").getPath());
         runner.transaction(config, new Control());
-        assertRecords(config, output);
+        assertRecords(config, output, 4);
+    }
+
+    @Test
+    public void testStrftime()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("path", "/tmp/%Y-%m-%d");
+        config.set("rewind_seconds", 86400);
+        PluginTask task = config.loadConfig(PluginTask.class);
+        String result = plugin.strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+        String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
+        assertEquals(expected, result);
     }
 
     private class Control
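The new testStrftime pins down how path interacts with rewind_seconds: the path is rendered with Ruby's Time#strftime against now minus rewind_seconds, so /tmp/%Y-%m-%d with rewind_seconds: 86400 resolves to yesterday's directory. For illustration, here is the same computation in plain Java; the plugin itself evaluates it through JRuby, as the test shows:

```java
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;

class StrftimeSketch
{
    public static void main(String[] args)
    {
        // Java equivalent of (Time.now - 86400).strftime('/tmp/%Y-%m-%d'):
        // shifting "now" back by rewind_seconds selects yesterday's directory
        DateTimeFormatter fmt = DateTimeFormatter.ofPattern("'/tmp/'yyyy-MM-dd");
        String path = ZonedDateTime.now().minusSeconds(86400).format(fmt);
        System.out.println(path); // e.g. /tmp/2016-02-24
    }
}
```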
@@ -201,10 +225,10 @@ public class TestHdfsFileInputPlugin
         return builder.build();
     }
 
-    private void assertRecords(ConfigSource config, MockPageOutput output)
+    private void assertRecords(ConfigSource config, MockPageOutput output, long size)
     {
         List<Object[]> records = getRecords(config, output);
-        assertEquals(
+        assertEquals(size, records.size());
         {
             Object[] record = records.get(0);
             assertEquals(1L, record[0]);
data/src/test/resources/sample_03.csv.gz CHANGED (binary file; no text diff shown)

metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.9
+  version: 0.2.1
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -47,6 +47,7 @@ extra_rdoc_files: []
 files:
 - .gitignore
 - .travis.yml
+- CHENGELOG.md
 - LICENSE.txt
 - README.md
 - build.gradle
@@ -54,18 +55,22 @@ files:
 - config/checkstyle/default.xml
 - example/config.yml
 - example/data.csv
+- example/data2.csv.gz
 - gradle/wrapper/gradle-wrapper.jar
 - gradle/wrapper/gradle-wrapper.properties
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
+- src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
 - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
-- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFile.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
+- src/main/java/org/embulk/input/hdfs/PartialFileList.java
 - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - src/test/resources/sample_01.csv
 - src/test/resources/sample_02.csv
+- src/test/resources/sample_03.csv.gz
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -89,23 +94,23 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.9.jar
+- classpath/embulk-input-hdfs-0.2.1.jar
 - classpath/gson-2.2.4.jar
-- classpath/hadoop-annotations-2.6.
-- classpath/hadoop-auth-2.6.
-- classpath/hadoop-client-2.6.
-- classpath/hadoop-common-2.6.
-- classpath/hadoop-hdfs-2.6.
-- classpath/hadoop-mapreduce-client-app-2.6.
-- classpath/hadoop-mapreduce-client-common-2.6.
-- classpath/hadoop-mapreduce-client-core-2.6.
-- classpath/hadoop-mapreduce-client-jobclient-2.6.
-- classpath/hadoop-mapreduce-client-shuffle-2.6.
-- classpath/hadoop-yarn-api-2.6.
-- classpath/hadoop-yarn-client-2.6.
-- classpath/hadoop-yarn-common-2.6.
-- classpath/hadoop-yarn-server-common-2.6.
-- classpath/hadoop-yarn-server-nodemanager-2.6.
+- classpath/hadoop-annotations-2.6.4.jar
+- classpath/hadoop-auth-2.6.4.jar
+- classpath/hadoop-client-2.6.4.jar
+- classpath/hadoop-common-2.6.4.jar
+- classpath/hadoop-hdfs-2.6.4.jar
+- classpath/hadoop-mapreduce-client-app-2.6.4.jar
+- classpath/hadoop-mapreduce-client-common-2.6.4.jar
+- classpath/hadoop-mapreduce-client-core-2.6.4.jar
+- classpath/hadoop-mapreduce-client-jobclient-2.6.4.jar
+- classpath/hadoop-mapreduce-client-shuffle-2.6.4.jar
+- classpath/hadoop-yarn-api-2.6.4.jar
+- classpath/hadoop-yarn-client-2.6.4.jar
+- classpath/hadoop-yarn-common-2.6.4.jar
+- classpath/hadoop-yarn-server-common-2.6.4.jar
+- classpath/hadoop-yarn-server-nodemanager-2.6.4.jar
 - classpath/htrace-core-3.0.4.jar
 - classpath/httpclient-4.2.5.jar
 - classpath/httpcore-4.2.4.jar
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java DELETED
@@ -1,40 +0,0 @@
-package org.embulk.input.hdfs;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-public class HdfsFilePartitioner
-{
-    private FileSystem fs;
-    private Path path;
-    private long numPartitions;
-
-    public HdfsFilePartitioner(FileSystem fs, Path path, long numPartitions)
-    {
-        this.fs = fs;
-        this.path = path;
-        this.numPartitions = numPartitions;
-    }
-
-    public List<HdfsPartialFile> getHdfsPartialFiles()
-            throws IOException
-    {
-        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
-        long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < numPartitions; i++) {
-            long start = size * i / numPartitions;
-            long end = size * (i + 1) / numPartitions;
-            if (start < end) {
-                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
-            }
-        }
-        return hdfsPartialFiles;
-    }
-}
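The deleted partitioner's integer arithmetic produces contiguous, gap-free byte ranges that cover the whole file, and the start < end guard drops empty slices when numPartitions exceeds the file size. A quick worked example of that arithmetic:

```java
class PartitionSketch
{
    public static void main(String[] args)
    {
        // A 10-byte file split into 3 partitions, using the same arithmetic
        // as the removed getHdfsPartialFiles(): prints [0, 3) [3, 6) [6, 10)
        long size = 10;
        long numPartitions = 3;
        for (int i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) { // skips empty slices when numPartitions > size
                System.out.printf("[%d, %d)%n", start, end);
            }
        }
    }
}
```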
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java DELETED
@@ -1,39 +0,0 @@
-package org.embulk.input.hdfs;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
-public class HdfsPartialFile
-{
-    private String path;
-    private long start;
-    private long end;
-
-    public HdfsPartialFile(String path, long start, long end)
-    {
-        this.path = path;
-        this.start = start;
-        this.end = end;
-    }
-
-    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
-    public HdfsPartialFile()
-    {
-    }
-
-    public String getPath()
-    {
-        return path;
-    }
-
-    public long getStart()
-    {
-        return start;
-    }
-
-    public long getEnd()
-    {
-        return end;
-    }
-}
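HdfsPartialFile (like its PartialFile replacement) only carries a path and a [start, end) byte range; turning that range into whole records is the job of the renamed PartialFileInputStream, following the embulk-input-filesplit approach referenced in the comment above: a reader that starts mid-file skips forward to the first record boundary, and every reader finishes the record it is inside when it passes end. A self-contained sketch of that newline-alignment idea under that assumption; the method name is hypothetical, not the plugin's API:

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;

class PartialReadSketch
{
    // Reads records from [start, end): a reader starting mid-file first
    // advances to the byte after the next '\n' (the record it landed in
    // belongs to the previous chunk), then reads until 'end', continuing
    // past it to finish the record in progress so no record is split.
    static byte[] readAligned(byte[] data, int start, int end) throws IOException
    {
        int pos = start;
        if (start > 0) {
            while (pos < data.length && data[pos - 1] != '\n') {
                pos++;
            }
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        while (pos < data.length && (pos < end || data[pos - 1] != '\n')) {
            out.write(data[pos++]);
        }
        return out.toByteArray();
    }

    public static void main(String[] args) throws IOException
    {
        byte[] data = "a,1\nb,2\nc,3\n".getBytes("UTF-8");
        // Two halves of a 12-byte file: no record is split or duplicated.
        System.out.print(new String(readAligned(data, 0, 6), "UTF-8"));  // a,1 and b,2
        System.out.print(new String(readAligned(data, 6, 12), "UTF-8")); // c,3
    }
}
```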