embulk-input-hdfs 0.0.3 → 0.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 730a82c7b3d06734b462824ca3c6036513f2cb60
-  data.tar.gz: 23cb0dfbda2edfc51a24d743dfaf30557b26a86f
+  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
+  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
 SHA512:
-  metadata.gz: 1a92443bbb2df59ab50cb68b532522c4cc73d5d5466a1a7e7aace2309885e9dc09dfaa5f51845fcfeeb34e4ed0dcfc0b8a2d2fe25893d60bfbc48e67d285718e
-  data.tar.gz: 57e0aff97e4a14d45fcaecfb6e3b33dbaa001fa282bd2907e146615392b6676580193a7cf3def5e9923184c97c18cddb8bde07e4c9a87dfc905c23f1b7b639a5
+  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
+  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
data/.gitignore CHANGED
@@ -7,3 +7,5 @@
 build/
 .idea
 *.iml
+.ruby-version
+
data/README.md CHANGED
@@ -14,6 +14,7 @@ Read files on Hdfs.
 - **config** overwrites configuration parameters (hash, default: `{}`)
 - **input_path** file path on HDFS. You can use glob patterns and a date format like `%Y%m%d/%s`.
 - **rewind_seconds** when you use a date format in the input_path property, it is evaluated against the current time minus this number of seconds.
+- **partition** when true, input files are partitioned to increase the task count. (default: `true`)

 ## Example

@@ -24,12 +25,13 @@ in:
     - /opt/analytics/etc/hadoop/conf/core-site.xml
     - /opt/analytics/etc/hadoop/conf/hdfs-site.xml
   config:
-    fs.defaultFS: 'hdfs://hdp-nn1:8020'
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     dfs.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
+  partition: true
   decoders:
   - {type: gzip}
   parser:
@@ -50,6 +52,15 @@ in:
     - {name: c3, type: long}
 ```

+## Note
+- the partition feature supports only 3 line terminators:
+  - `\n`
+  - `\r`
+  - `\r\n`
+
+## The Reference Implementation
+- [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+
 ## Build

 ```
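Editor's note: the `%Y-%m-%d`-style placeholders and `rewind_seconds` described in the README above are resolved by the plugin through JRuby's `strftime` (see the `strftime` method in `HdfsFileInputPlugin.java` below). A minimal standalone sketch of that resolution, assuming JRuby's `ScriptingContainer` is available as it is inside Embulk; the class name and values are hypothetical:

```java
import org.jruby.embed.ScriptingContainer;

public class StrftimeSketch
{
    public static void main(String[] args)
    {
        // Mirrors the README example: with rewind_seconds = 86400,
        // a run on 2015-09-08 resolves to /user/embulk/test/2015-09-07/*.
        String raw = "/user/embulk/test/%Y-%m-%d/*";
        int rewindSeconds = 86400;

        ScriptingContainer jruby = new ScriptingContainer();
        // Same scriptlet the plugin builds: (Time.now - 86400).strftime('...')
        Object resolved = jruby.runScriptlet(
                String.format("(Time.now - %s).strftime('%s')", rewindSeconds, raw));
        System.out.println(resolved);
    }
}
```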
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
   provided
 }

-version = "0.0.3"
+version = "0.1.0"

 sourceCompatibility = 1.7
 targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
     provided "org.embulk:embulk-core:0.7.0"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
-    compile 'com.google.guava:guava:14.0'
+    compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
 }

data/lib/embulk/input/hdfs.rb CHANGED
@@ -1,3 +1,3 @@
 Embulk::JavaPlugin.register_input(
-  "hdfs", "org.embulk.input.HdfsFileInputPlugin",
+  "hdfs", "org.embulk.input.hdfs.HdfsFileInputPlugin",
   File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java ADDED
@@ -0,0 +1,267 @@
+package org.embulk.input.hdfs;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+
+import com.google.common.base.Function;
+import com.google.common.base.Optional;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigInject;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.*;
+import org.embulk.spi.util.InputStreamFileInput;
+import org.embulk.spi.util.InputStreamTransactionalFileInput;
+import org.jruby.embed.ScriptingContainer;
+import org.slf4j.Logger;
+
+import javax.annotation.Nullable;
+
+public class HdfsFileInputPlugin implements FileInputPlugin
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+
+    public interface PluginTask extends Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        public List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        public Map<String, String> getConfig();
+
+        @Config("input_path")
+        public String getInputPath();
+
+        @Config("rewind_seconds")
+        @ConfigDefault("0")
+        public int getRewindSeconds();
+
+        @Config("partition")
+        @ConfigDefault("true")
+        public boolean getPartition();
+
+        // this parameter is experimental.
+        @Config("partition_level")
+        @ConfigDefault("3")
+        public int getPartitonLevel();
+
+        public List<HdfsPartialFile> getFiles();
+        public void setFiles(List<HdfsPartialFile> hdfsFiles);
+
+        @ConfigInject
+        public BufferAllocator getBufferAllocator();
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+
+        // listing Files
+        String pathString = strftime(task.getInputPath(), task.getRewindSeconds());
+        try {
+            List<String> originalFileList = buildFileList(getFs(task), pathString);
+            task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+            logger.info("Loading target files: {}", originalFileList);
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        // log the detail of partial files.
+        for (HdfsPartialFile partialFile : task.getFiles()) {
+            logger.info("target file: {}, start: {}, end: {}",
+                    partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
+        }
+
+        // number of processors is same with number of targets
+        int taskCount = task.getFiles().size();
+        logger.info("task size: {}", taskCount);
+
+        return resume(task.dump(), taskCount, control);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileInputPlugin.Control control)
+    {
+        control.run(taskSource, taskCount);
+
+        ConfigDiff configDiff = Exec.newConfigDiff();
+
+        // usually, yo use last_path
+        //if (task.getFiles().isEmpty()) {
+        //    if (task.getLastPath().isPresent()) {
+        //        configDiff.set("last_path", task.getLastPath().get());
+        //    }
+        //} else {
+        //    List<String> files = new ArrayList<String>(task.getFiles());
+        //    Collections.sort(files);
+        //    configDiff.set("last_path", files.get(files.size() - 1));
+        //}
+
+        return configDiff;
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        InputStream input;
+        try {
+            input = openInputStream(task, task.getFiles().get(taskIndex));
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input) {
+            @Override
+            public void abort()
+            { }
+
+            @Override
+            public TaskReport commit()
+            {
+                return Exec.newTaskReport();
+            }
+        };
+    }
+
+    private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
+            throws IOException
+    {
+        FileSystem fs = getFs(task);
+        InputStream original = fs.open(new Path(partialFile.getPath()));
+        return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
+    }
+
+    private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        Configuration configuration = new Configuration();
+
+        for (Object configFile : task.getConfigFiles()) {
+            configuration.addResource(configFile.toString());
+        }
+        configuration.reloadConfiguration();
+
+        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+
+        return FileSystem.get(configuration);
+    }
+
+    private String strftime(final String raw, final int rewind_seconds)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object resolved = jruby.runScriptlet(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+        return resolved.toString();
+    }
+
+    private List<String> buildFileList(final FileSystem fs, final String pathString)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        for (FileStatus entry : fs.globStatus(new Path(pathString))) {
+            if (entry.isDirectory()) {
+                fileList.addAll(lsr(fs, entry));
+            } else {
+                fileList.add(entry.getPath().toString());
+            }
+        }
+        return fileList;
+    }
+
+    private List<String> lsr(final FileSystem fs, FileStatus status)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        if (status.isDirectory()) {
+            for (FileStatus entry : fs.listStatus(status.getPath())) {
+                fileList.addAll(lsr(fs, entry));
+            }
+        }
+        else {
+            fileList.add(status.getPath().toString());
+        }
+        return fileList;
+    }
+
+    private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
+            throws IOException
+    {
+        List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
+        {
+            @Nullable
+            @Override
+            public Path apply(@Nullable String input)
+            {
+                return new Path(input);
+            }
+        });
+
+        int totalFileLength = 0;
+        for (Path path : pathList) {
+            totalFileLength += fs.getFileStatus(path).getLen();
+        }
+
+        // TODO: optimum allocation of resources
+        int partitionCountParameter = task.getPartitonLevel();
+        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        for (Path path : pathList) {
+            int partitionCount;
+
+            if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+                partitionCount = 1;
+            }
+            else if (!task.getPartition()) {
+                partitionCount = 1;
+            }
+            else {
+                int fileLength = (int) fs.getFileStatus(path).getLen();
+                partitionCount = fileLength / partitionSizeByOneTask;
+                int remainder = fileLength % partitionSizeByOneTask;
+
+                if (remainder > 0) {
+                    partitionCount++;
+                }
+            }
+
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+        }
+
+        return hdfsPartialFiles;
+    }
+}
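Editor's note: a small sketch (not part of the gem) of the arithmetic `allocateHdfsFilesToTasks` applies above. The file sizes, processor count, and class name are hypothetical; `partition_level` uses the plugin's default of 3.

```java
public class PartitionMathSketch
{
    public static void main(String[] args)
    {
        // Two plain-text input files: 900 MB and 300 MB (neither .gz, .bz2 nor .lzo).
        long totalFileLength = 1200L * 1024 * 1024;
        int availableProcessors = 4;   // hypothetical host
        int partitionLevel = 3;        // plugin default

        // Target size of one partition: total bytes / (cores * partition_level).
        long partitionSizeByOneTask = totalFileLength / (availableProcessors * partitionLevel);
        System.out.println(partitionSizeByOneTask); // 104857600 (~100 MB)

        // Per-file partition count: size / target, rounded up when there is a remainder.
        long file1 = 900L * 1024 * 1024;
        long file2 = 300L * 1024 * 1024;
        long partitions1 = file1 / partitionSizeByOneTask + (file1 % partitionSizeByOneTask > 0 ? 1 : 0);
        long partitions2 = file2 / partitionSizeByOneTask + (file2 % partitionSizeByOneTask > 0 ? 1 : 0);
        System.out.println(partitions1 + " + " + partitions2); // 9 + 3: Embulk runs 12 tasks instead of 2
    }
}
```

Compressed files (`.gz`, `.bz2`, `.lzo`) and `partition: false` both fall back to a single partition per file, as the branches in the plugin show.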
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java ADDED
@@ -0,0 +1,39 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+public class HdfsFilePartitioner
+{
+    private FileSystem fs;
+    private Path path;
+    private int partitionCount;
+
+    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    {
+        this.fs = fs;
+        this.path = path;
+        this.partitionCount = partitionCount;
+    }
+
+    public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
+    {
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        long size = fs.getFileStatus(path).getLen();
+        for (int i = 0; i < partitionCount; i++) {
+            long start = size * i / partitionCount;
+            long end = size * (i + 1) / partitionCount;
+            if (start < end) {
+                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
+            }
+        }
+        return hdfsPartialFiles;
+    }
+}
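Editor's note: a tiny sketch (not part of the gem) of the boundary math `getHdfsPartialFiles` uses, shown without a real `FileSystem`. A hypothetical 1000-byte file split into 3 partitions yields the ranges 0–333, 333–666 and 666–1000; `HdfsPartialFileInputStream` (next file) then nudges those raw byte offsets to line-terminator boundaries.

```java
public class BoundarySketch
{
    public static void main(String[] args)
    {
        long size = 1000;       // hypothetical file length in bytes
        int partitionCount = 3;
        for (int i = 0; i < partitionCount; i++) {
            long start = size * i / partitionCount;      // 0, 333, 666
            long end = size * (i + 1) / partitionCount;  // 333, 666, 1000
            System.out.println(start + " - " + end);
        }
    }
}
```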
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java ADDED
@@ -0,0 +1,40 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
+public class HdfsPartialFile
+{
+    private String path;
+    private long start;
+    private long end;
+
+    public HdfsPartialFile(String path, long start, long end)
+    {
+        this.path = path;
+        this.start = start;
+        this.end = end;
+    }
+
+    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
+    public HdfsPartialFile() { }
+
+    public String getPath()
+    {
+        return path;
+    }
+
+    public long getStart()
+    {
+        return start;
+    }
+
+    public long getEnd()
+    {
+        return end;
+    }
+
+}
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java ADDED
@@ -0,0 +1,154 @@
+package org.embulk.input.hdfs;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+public class HdfsPartialFileInputStream extends InputStream
+{
+    private final PushbackInputStream original;
+    private long start;
+    private long end;
+    private long current;
+    private boolean eof;
+
+    public HdfsPartialFileInputStream(InputStream original, long start, long end)
+    {
+        this.original = new PushbackInputStream(new BufferedInputStream(original));
+        this.start = start;
+        this.end = end;
+        current = -1;
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException
+    {
+        return read(b, 0, b.length);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read(b, off, len);
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        current += read;
+        if (current >= end) {
+            for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
+                if (b[off + i] == '\n') {
+                    eof = true;
+                    return i + 1;
+                }
+
+                if (b[off + i] == '\r') {
+                    int next = (i < read ? b[off + i + 1] : prefetch());
+                    if (next != '\n') {
+                        eof = true;
+                        return i + 1;
+                    }
+                }
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public int read() throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read();
+        current++;
+
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        if (current >= end) {
+            if (read == '\n' || read == '\r' && prefetch() != '\n') {
+                eof = true;
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public long skip(long n) throws IOException
+    {
+        throw new IOException("Skip not supported.");
+        /*
+        long skip = original.skip(n);
+        current += skip;
+        return skip;
+        */
+    }
+
+    @Override
+    public int available() throws IOException
+    {
+        return 0;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        original.close();
+    }
+
+    private void initializeIfNeeded() throws IOException
+    {
+        if (current >= start) {
+            return;
+
+        }
+        if (start == 0) {
+            current = 0;
+        } else {
+            current = original.skip(--start);
+            if (current != start) {
+                throw new IOException("Cannot skip.");
+            }
+
+            int c;
+            while ((c = original.read()) >= 0) {
+                start++;
+                current++;
+
+                if (c == '\n' || c == '\r' && prefetch() != '\n') {
+                    break;
+                }
+            }
+        }
+
+        if (start >= end) {
+            eof = true;
+        }
+    }
+
+    private int prefetch() throws IOException
+    {
+        int c = original.read();
+        if (c >= 0) {
+            original.unread(c);
+        }
+        return c;
+    }
+}
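Editor's note: a self-contained sketch (not part of the gem) showing the effect of the stream above: two adjacent byte ranges over the same data still produce whole lines, because a reader past its `end` continues to the next line terminator and a reader with a non-zero `start` first skips forward to one. The class name and input data are hypothetical; `HdfsPartialFileInputStream` is assumed to be on the classpath.

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.embulk.input.hdfs.HdfsPartialFileInputStream;

public class PartialReadSketch
{
    private static String readAll(InputStream in) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = in.read()) >= 0) {
            sb.append((char) c);
        }
        return sb.toString();
    }

    public static void main(String[] args) throws IOException
    {
        byte[] data = "apple\nbanana\ncherry\n".getBytes(StandardCharsets.UTF_8); // 20 bytes

        // The raw offsets 0-8 and 8-20 cut "banana\n" in the middle, but each
        // partial stream adjusts to line-terminator boundaries:
        String first  = readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 0, 8));
        String second = readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 8, 20));

        System.out.print(first);   // "apple\nbanana\n"  (reads past offset 8 to finish the line)
        System.out.print(second);  // "cherry\n"         (skips forward past the split line)
    }
}
```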
data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java CHANGED
@@ -1,4 +1,4 @@
-package org.embulk.input;
+package org.embulk.input.hdfs;

 public class TestHdfsFileInputPlugin
 {
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.1.0
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-19 00:00:00.000000000 Z
+date: 2015-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -54,8 +54,11 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
-- src/main/java/org/embulk/input/HdfsFileInputPlugin.java
-- src/test/java/org/embulk/input/TestHdfsFileInputPlugin.java
+- src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
+- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.0.3.jar
+- classpath/embulk-input-hdfs-0.1.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
Binary file
data/src/main/java/org/embulk/input/HdfsFileInputPlugin.java DELETED
@@ -1,231 +0,0 @@
-package org.embulk.input;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.embulk.config.*;
-import org.embulk.spi.BufferAllocator;
-import org.embulk.spi.Exec;
-import org.embulk.spi.FileInputPlugin;
-import org.embulk.spi.TransactionalFileInput;
-import org.embulk.spi.util.InputStreamFileInput;
-import org.jruby.embed.ScriptingContainer;
-import org.slf4j.Logger;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-public class HdfsFileInputPlugin implements FileInputPlugin
-{
-    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-
-    public interface PluginTask extends Task
-    {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        public List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        public Map<String, String> getConfig();
-
-        @Config("input_path")
-        public String getInputPath();
-
-        @Config("rewind_seconds")
-        @ConfigDefault("0")
-        public int getRewindSeconds();
-
-        public List<String> getTargetFiles();
-        public void setTargetFiles(List<String> targetFiles);
-
-        @ConfigInject
-        public BufferAllocator getBufferAllocator();
-    }
-
-    @Override
-    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-        Path inputPath = new Path(strftime(task.getInputPath(), task.getRewindSeconds()));
-
-        // listing
-        List<String> targetFiles;
-        try {
-            targetFiles = globRecursive(fs, inputPath);
-        } catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-        logger.info("Loading target files: {}", targetFiles);
-        task.setTargetFiles(targetFiles);
-
-        // number of processors is same with number of targets
-        int taskCount = targetFiles.size();
-
-        return resume(task.dump(), taskCount, control);
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileInputPlugin.Control control)
-    {
-        control.run(taskSource, taskCount);
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-                        int taskCount,
-                        List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-
-        return new HdfsFileInput(task, fs, taskIndex);
-    }
-
-    private Configuration getHdfsConfiguration(final PluginTask task)
-    {
-        Configuration configuration = new Configuration();
-
-        for (Object configFile : task.getConfigFiles()) {
-            configuration.addResource(configFile.toString());
-        }
-        configuration.reloadConfiguration();
-
-        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-
-        return configuration;
-    }
-
-    private FileSystem getFs(final Configuration configuration)
-    {
-        try {
-            FileSystem fs = FileSystem.get(configuration);
-            return fs;
-        }
-        catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-    }
-
-    private String strftime(final String raw, final int rewind_seconds)
-    {
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object resolved = jruby.runScriptlet(
-                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
-        return resolved.toString();
-    }
-
-    private List<String> globRecursive(final FileSystem fs, final Path hdfsPath) throws IOException
-    {
-        List<String> container = new ArrayList<String>();
-        for (FileStatus entry : fs.globStatus(hdfsPath)) {
-            if (entry.isDirectory()) {
-                container.addAll(listRecursive(fs, entry));
-            }
-            else {
-                container.add(entry.getPath().toString());
-            }
-        }
-        return container;
-    }
-
-    private List<String> listRecursive(final FileSystem fs, FileStatus status) throws IOException {
-        List<String> container = new ArrayList<String>();
-        if (status.isDirectory()) {
-            for (FileStatus entry : fs.listStatus(status.getPath())) {
-                container.addAll(listRecursive(fs, entry));
-            }
-        }
-        else {
-            container.add(status.getPath().toString());
-        }
-        return container;
-    }
-
-
-
-    // private List<String> listUniquify(List<String> stringList)
-    // {
-    //     Set<String> set = new HashSet<String>();
-    //     set.addAll(stringList);
-    //     List<String> uniqueStringList = new ArrayList<String>();
-    //     uniqueStringList.addAll(set);
-    //     return uniqueStringList;
-    // }
-
-    public static class HdfsFileInput extends InputStreamFileInput implements TransactionalFileInput
-    {
-        private static class HdfsFileProvider implements InputStreamFileInput.Provider
-        {
-            private final FileSystem fs;
-            private final Path hdfsPath;
-            private boolean opened = false;
-
-            public HdfsFileProvider(PluginTask task, FileSystem fs, int taskIndex)
-            {
-                this.fs = fs;
-                this.hdfsPath = new Path(task.getTargetFiles().get(taskIndex));
-            }
-
-            @Override
-            public InputStream openNext() throws IOException
-            {
-                if (opened) {
-                    return null;
-                }
-
-                opened = true;
-                return fs.open(hdfsPath);
-            }
-
-            @Override
-            public void close()
-            {
-            }
-        }
-
-        public HdfsFileInput(PluginTask task, FileSystem fs, int taskIndex)
-        {
-            super(task.getBufferAllocator(), new HdfsFileProvider(task, fs, taskIndex));
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-}