embulk-input-hdfs 0.1.9 → 0.2.1

@@ -30,6 +30,7 @@ import javax.annotation.Nullable;

 import java.io.File;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;

 import static org.junit.Assert.assertEquals;
@@ -69,6 +70,8 @@ public class TestHdfsFileInputPlugin
 assertEquals(true, task.getPartition());
 assertEquals(0, task.getRewindSeconds());
 assertEquals(-1, task.getApproximateNumPartitions());
+assertEquals(0, task.getSkipHeaderLines());
+assertEquals(false, task.getDecompression());
 }

 @Test(expected = ConfigException.class)
@@ -99,17 +102,14 @@ public class TestHdfsFileInputPlugin
 }
 });

-List<String> resultFList = Lists.transform(task.getFiles(), new Function<HdfsPartialFile, String>()
-{
-@Nullable
-@Override
-public String apply(@Nullable HdfsPartialFile input)
-{
-assert input != null;
-return input.getPath();
+List<String> resultFList = Lists.newArrayList();
+for (int i = 0; i < task.getPartialFileList().getTaskCount();i++) {
+for (PartialFile partialFile : task.getPartialFileList().get(i)) {
+resultFList.add(partialFile.getPath().toString());
 }
-});
-assertEquals(fileList, resultFList);
+}
+assertEquals(fileList.size(), resultFList.size());
+assert fileList.containsAll(resultFList);
 return emptyTaskReports(taskCount);
 }
 });
@@ -120,8 +120,9 @@ public class TestHdfsFileInputPlugin
 {
 ConfigSource config = getConfigWithDefaultValues();
 config.set("num_partitions", 10);
+config.set("decompression", true);
 runner.transaction(config, new Control());
-assertRecords(config, output);
+assertRecords(config, output, 12);
 }

 @Test
@@ -129,8 +130,31 @@ public class TestHdfsFileInputPlugin
 {
 ConfigSource config = getConfigWithDefaultValues();
 config.set("partition", false);
+config.set("decompression", true);
+runner.transaction(config, new Control());
+assertRecords(config, output, 12);
+}
+
+@Test
+public void testHdfsFileInputByOpenWithoutCompressionCodec()
+{
+ConfigSource config = getConfigWithDefaultValues();
+config.set("partition", false);
+config.set("path", getClass().getResource("/sample_01.csv").getPath());
 runner.transaction(config, new Control());
-assertRecords(config, output);
+assertRecords(config, output, 4);
+}
+
+@Test
+public void testStrftime()
+{
+ConfigSource config = getConfigWithDefaultValues();
+config.set("path", "/tmp/%Y-%m-%d");
+config.set("rewind_seconds", 86400);
+PluginTask task = config.loadConfig(PluginTask.class);
+String result = plugin.strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+String expected = task.getJRuby().runScriptlet("(Time.now - 86400).strftime('/tmp/%Y-%m-%d')").toString();
+assertEquals(expected, result);
 }

 private class Control
@@ -201,10 +225,10 @@ public class TestHdfsFileInputPlugin
 return builder.build();
 }

-private void assertRecords(ConfigSource config, MockPageOutput output)
+private void assertRecords(ConfigSource config, MockPageOutput output, long size)
 {
 List<Object[]> records = getRecords(config, output);
-assertEquals(8, records.size());
+assertEquals(size, records.size());
 {
 Object[] record = records.get(0);
 assertEquals(1L, record[0]);
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-version: 0.1.9
+version: 0.2.1
 platform: ruby
 authors:
 - Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-08 00:00:00.000000000 Z
+date: 2016-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: bundler
@@ -47,6 +47,7 @@ extra_rdoc_files: []
 files:
 - .gitignore
 - .travis.yml
+- CHENGELOG.md
 - LICENSE.txt
 - README.md
 - build.gradle
@@ -54,18 +55,22 @@ files:
 - config/checkstyle/default.xml
 - example/config.yml
 - example/data.csv
+- example/data2.csv.gz
 - gradle/wrapper/gradle-wrapper.jar
 - gradle/wrapper/gradle-wrapper.properties
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
+- src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
 - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
-- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
-- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFile.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStream.java
+- src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java
+- src/main/java/org/embulk/input/hdfs/PartialFileList.java
 - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - src/test/resources/sample_01.csv
 - src/test/resources/sample_02.csv
+- src/test/resources/sample_03.csv.gz
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -89,23 +94,23 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.9.jar
+- classpath/embulk-input-hdfs-0.2.1.jar
 - classpath/gson-2.2.4.jar
-- classpath/hadoop-annotations-2.6.3.jar
-- classpath/hadoop-auth-2.6.3.jar
-- classpath/hadoop-client-2.6.3.jar
-- classpath/hadoop-common-2.6.3.jar
-- classpath/hadoop-hdfs-2.6.3.jar
-- classpath/hadoop-mapreduce-client-app-2.6.3.jar
-- classpath/hadoop-mapreduce-client-common-2.6.3.jar
-- classpath/hadoop-mapreduce-client-core-2.6.3.jar
-- classpath/hadoop-mapreduce-client-jobclient-2.6.3.jar
-- classpath/hadoop-mapreduce-client-shuffle-2.6.3.jar
-- classpath/hadoop-yarn-api-2.6.3.jar
-- classpath/hadoop-yarn-client-2.6.3.jar
-- classpath/hadoop-yarn-common-2.6.3.jar
-- classpath/hadoop-yarn-server-common-2.6.3.jar
-- classpath/hadoop-yarn-server-nodemanager-2.6.3.jar
+- classpath/hadoop-annotations-2.6.4.jar
+- classpath/hadoop-auth-2.6.4.jar
+- classpath/hadoop-client-2.6.4.jar
+- classpath/hadoop-common-2.6.4.jar
+- classpath/hadoop-hdfs-2.6.4.jar
+- classpath/hadoop-mapreduce-client-app-2.6.4.jar
+- classpath/hadoop-mapreduce-client-common-2.6.4.jar
+- classpath/hadoop-mapreduce-client-core-2.6.4.jar
+- classpath/hadoop-mapreduce-client-jobclient-2.6.4.jar
+- classpath/hadoop-mapreduce-client-shuffle-2.6.4.jar
+- classpath/hadoop-yarn-api-2.6.4.jar
+- classpath/hadoop-yarn-client-2.6.4.jar
+- classpath/hadoop-yarn-common-2.6.4.jar
+- classpath/hadoop-yarn-server-common-2.6.4.jar
+- classpath/hadoop-yarn-server-nodemanager-2.6.4.jar
 - classpath/htrace-core-3.0.4.jar
 - classpath/httpclient-4.2.5.jar
 - classpath/httpcore-4.2.4.jar
src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java DELETED
@@ -1,40 +0,0 @@
-package org.embulk.input.hdfs;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-public class HdfsFilePartitioner
-{
-    private FileSystem fs;
-    private Path path;
-    private long numPartitions;
-
-    public HdfsFilePartitioner(FileSystem fs, Path path, long numPartitions)
-    {
-        this.fs = fs;
-        this.path = path;
-        this.numPartitions = numPartitions;
-    }
-
-    public List<HdfsPartialFile> getHdfsPartialFiles()
-            throws IOException
-    {
-        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
-        long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < numPartitions; i++) {
-            long start = size * i / numPartitions;
-            long end = size * (i + 1) / numPartitions;
-            if (start < end) {
-                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
-            }
-        }
-        return hdfsPartialFiles;
-    }
-}
src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java DELETED
@@ -1,39 +0,0 @@
-package org.embulk.input.hdfs;
-
-/**
- * Created by takahiro.nakayama on 8/20/15.
- */
-// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
-public class HdfsPartialFile
-{
-    private String path;
-    private long start;
-    private long end;
-
-    public HdfsPartialFile(String path, long start, long end)
-    {
-        this.path = path;
-        this.start = start;
-        this.end = end;
-    }
-
-    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
-    public HdfsPartialFile()
-    {
-    }
-
-    public String getPath()
-    {
-        return path;
-    }
-
-    public long getStart()
-    {
-        return start;
-    }
-
-    public long getEnd()
-    {
-        return end;
-    }
-}
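
For context on the removed classes: HdfsFilePartitioner split a file's byte length into roughly equal [start, end) ranges, one per partition, and HdfsPartialFile carried those offsets; in 0.2.1 that role presumably moves to the new PartialFile/PartialFileList classes, whose sources are not part of this diff. Below is a minimal, dependency-free sketch of the split arithmetic the removed code performed; the class and method names are illustrative only, not part of the plugin.

```java
import java.util.ArrayList;
import java.util.List;

// Illustrative sketch (hypothetical class name), not plugin code:
// recomputes the byte ranges that the removed HdfsFilePartitioner produced.
// Each range is [start, end); empty ranges (start == end) are skipped,
// mirroring the old getHdfsPartialFiles() loop.
public class PartitionSketch
{
    static List<long[]> partition(long size, long numPartitions)
    {
        List<long[]> ranges = new ArrayList<>();
        for (long i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) {
                ranges.add(new long[] {start, end});
            }
        }
        return ranges;
    }

    public static void main(String[] args)
    {
        // A 10-byte file split into 3 partitions yields [0,3), [3,6), [6,10).
        for (long[] r : partition(10, 3)) {
            System.out.println(r[0] + " - " + r[1]);
        }
    }
}
```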