embulk-input-hdfs 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 730a82c7b3d06734b462824ca3c6036513f2cb60
- data.tar.gz: 23cb0dfbda2edfc51a24d743dfaf30557b26a86f
+ metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
+ data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
  SHA512:
- metadata.gz: 1a92443bbb2df59ab50cb68b532522c4cc73d5d5466a1a7e7aace2309885e9dc09dfaa5f51845fcfeeb34e4ed0dcfc0b8a2d2fe25893d60bfbc48e67d285718e
- data.tar.gz: 57e0aff97e4a14d45fcaecfb6e3b33dbaa001fa282bd2907e146615392b6676580193a7cf3def5e9923184c97c18cddb8bde07e4c9a87dfc905c23f1b7b639a5
+ metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
+ data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
data/.gitignore CHANGED
@@ -7,3 +7,5 @@
  build/
  .idea
  *.iml
+ .ruby-version
+
data/README.md CHANGED
@@ -14,6 +14,7 @@ Read files on Hdfs.
  - **config** overwrites configuration parameters (hash, default: `{}`)
  - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
  - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
+ - **partition** when this is true, the plugin partitions input files and increases the task count. (default: `true`)

  ## Example

@@ -24,12 +25,13 @@ in:
  - /opt/analytics/etc/hadoop/conf/core-site.xml
  - /opt/analytics/etc/hadoop/conf/hdfs-site.xml
  config:
- fs.defaultFS: 'hdfs://hdp-nn1:8020'
+ fs.defaultFS: 'hdfs://hadoop-nn1:8020'
  dfs.replication: 1
  fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
  fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
  input_path: /user/embulk/test/%Y-%m-%d/*
  rewind_seconds: 86400
+ partition: true
  decoders:
  - {type: gzip}
  parser:
@@ -50,6 +52,15 @@ in:
  - {name: c3, type: long}
  ```

+ ## Note
+ - the partition feature supports only 3 line terminators.
+ - `\n`
+ - `\r`
+ - `\r\n`
+
+ ## The Reference Implementation
+ - [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+
  ## Build

  ```
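As a rough guide to what the new `partition` option does to task counts (based on the allocateHdfsFilesToTasks logic added in HdfsFileInputPlugin.java further down this diff; the concrete numbers are only an illustration): the plugin computes a target chunk size of total input bytes / (available processors × partition_level, default 3) and splits each plain file into ceil(file size / chunk size) partitions, one Embulk task per partition. With 4 processors, the default partition_level of 3 and 1,200 MB of input, the chunk size is 100 MB, so a 250 MB file becomes 3 tasks. Files ending in .gz, .bz2 or .lzo, and all files when `partition: false`, are always read as a single task.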
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
  provided
  }

- version = "0.0.3"
+ version = "0.1.0"

  sourceCompatibility = 1.7
  targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
  provided "org.embulk:embulk-core:0.7.0"
  // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
  compile 'org.apache.hadoop:hadoop-client:2.6.0'
- compile 'com.google.guava:guava:14.0'
+ compile 'com.google.guava:guava:15.0'
  testCompile "junit:junit:4.+"
  }

@@ -1,3 +1,3 @@
  Embulk::JavaPlugin.register_input(
- "hdfs", "org.embulk.input.HdfsFileInputPlugin",
+ "hdfs", "org.embulk.input.hdfs.HdfsFileInputPlugin",
  File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,267 @@
+ package org.embulk.input.hdfs;
+
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.util.List;
+ import java.util.ArrayList;
+ import java.util.Map;
+
+ import com.google.common.base.Function;
+ import com.google.common.base.Optional;
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Lists;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileStatus;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.embulk.config.TaskReport;
+ import org.embulk.config.Config;
+ import org.embulk.config.ConfigDefault;
+ import org.embulk.config.ConfigInject;
+ import org.embulk.config.ConfigDiff;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.Task;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.*;
+ import org.embulk.spi.util.InputStreamFileInput;
+ import org.embulk.spi.util.InputStreamTransactionalFileInput;
+ import org.jruby.embed.ScriptingContainer;
+ import org.slf4j.Logger;
+
+ import javax.annotation.Nullable;
+
+ public class HdfsFileInputPlugin implements FileInputPlugin
+ {
+ private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+
+ public interface PluginTask extends Task
+ {
+ @Config("config_files")
+ @ConfigDefault("[]")
+ public List<String> getConfigFiles();
+
+ @Config("config")
+ @ConfigDefault("{}")
+ public Map<String, String> getConfig();
+
+ @Config("input_path")
+ public String getInputPath();
+
+ @Config("rewind_seconds")
+ @ConfigDefault("0")
+ public int getRewindSeconds();
+
+ @Config("partition")
+ @ConfigDefault("true")
+ public boolean getPartition();
+
+ // this parameter is experimental.
+ @Config("partition_level")
+ @ConfigDefault("3")
+ public int getPartitonLevel();
+
+ public List<HdfsPartialFile> getFiles();
+ public void setFiles(List<HdfsPartialFile> hdfsFiles);
+
+ @ConfigInject
+ public BufferAllocator getBufferAllocator();
+ }
+
+ @Override
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
+ {
+ PluginTask task = config.loadConfig(PluginTask.class);
+
+ // listing Files
+ String pathString = strftime(task.getInputPath(), task.getRewindSeconds());
+ try {
+ List<String> originalFileList = buildFileList(getFs(task), pathString);
+ task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+ logger.info("Loading target files: {}", originalFileList);
+ }
+ catch (IOException e) {
+ logger.error(e.getMessage());
+ throw new RuntimeException(e);
+ }
+
+ // log the detail of partial files.
+ for (HdfsPartialFile partialFile : task.getFiles()) {
+ logger.info("target file: {}, start: {}, end: {}",
+ partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
+ }
+
+ // number of processors is same with number of targets
+ int taskCount = task.getFiles().size();
+ logger.info("task size: {}", taskCount);
+
+ return resume(task.dump(), taskCount, control);
+ }
+
+ @Override
+ public ConfigDiff resume(TaskSource taskSource,
+ int taskCount,
+ FileInputPlugin.Control control)
+ {
+ control.run(taskSource, taskCount);
+
+ ConfigDiff configDiff = Exec.newConfigDiff();
+
+ // usually, yo use last_path
+ //if (task.getFiles().isEmpty()) {
+ // if (task.getLastPath().isPresent()) {
+ // configDiff.set("last_path", task.getLastPath().get());
+ // }
+ //} else {
+ // List<String> files = new ArrayList<String>(task.getFiles());
+ // Collections.sort(files);
+ // configDiff.set("last_path", files.get(files.size() - 1));
+ //}
+
+ return configDiff;
+ }
+
+ @Override
+ public void cleanup(TaskSource taskSource,
+ int taskCount,
+ List<TaskReport> successTaskReports)
+ {
+ }
+
+ @Override
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ {
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+ InputStream input;
+ try {
+ input = openInputStream(task, task.getFiles().get(taskIndex));
+ }
+ catch (IOException e) {
+ logger.error(e.getMessage());
+ throw new RuntimeException(e);
+ }
+
+ return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input) {
+ @Override
+ public void abort()
+ { }
+
+ @Override
+ public TaskReport commit()
+ {
+ return Exec.newTaskReport();
+ }
+ };
+ }
+
+ private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
+ throws IOException
+ {
+ FileSystem fs = getFs(task);
+ InputStream original = fs.open(new Path(partialFile.getPath()));
+ return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
+ }
+
+ private static FileSystem getFs(final PluginTask task)
+ throws IOException
+ {
+ Configuration configuration = new Configuration();
+
+ for (Object configFile : task.getConfigFiles()) {
+ configuration.addResource(configFile.toString());
+ }
+ configuration.reloadConfiguration();
+
+ for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
+ configuration.set(entry.getKey(), entry.getValue());
+ }
+
+ return FileSystem.get(configuration);
+ }
+
+ private String strftime(final String raw, final int rewind_seconds)
+ {
+ ScriptingContainer jruby = new ScriptingContainer();
+ Object resolved = jruby.runScriptlet(
+ String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+ return resolved.toString();
+ }
+
+ private List<String> buildFileList(final FileSystem fs, final String pathString)
+ throws IOException
+ {
+ List<String> fileList = new ArrayList<>();
+ for (FileStatus entry : fs.globStatus(new Path(pathString))) {
+ if (entry.isDirectory()) {
+ fileList.addAll(lsr(fs, entry));
+ } else {
+ fileList.add(entry.getPath().toString());
+ }
+ }
+ return fileList;
+ }
+
+ private List<String> lsr(final FileSystem fs, FileStatus status)
+ throws IOException
+ {
+ List<String> fileList = new ArrayList<>();
+ if (status.isDirectory()) {
+ for (FileStatus entry : fs.listStatus(status.getPath())) {
+ fileList.addAll(lsr(fs, entry));
+ }
+ }
+ else {
+ fileList.add(status.getPath().toString());
+ }
+ return fileList;
+ }
+
+ private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
+ throws IOException
+ {
+ List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
+ {
+ @Nullable
+ @Override
+ public Path apply(@Nullable String input)
+ {
+ return new Path(input);
+ }
+ });
+
+ int totalFileLength = 0;
+ for (Path path : pathList) {
+ totalFileLength += fs.getFileStatus(path).getLen();
+ }
+
+ // TODO: optimum allocation of resources
+ int partitionCountParameter = task.getPartitonLevel();
+ int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+
+ List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+ for (Path path : pathList) {
+ int partitionCount;
+
+ if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+ partitionCount = 1;
+ }
+ else if (!task.getPartition()) {
+ partitionCount = 1;
+ }
+ else {
+ int fileLength = (int) fs.getFileStatus(path).getLen();
+ partitionCount = fileLength / partitionSizeByOneTask;
+ int remainder = fileLength % partitionSizeByOneTask;
+
+ if (remainder > 0) {
+ partitionCount++;
+ }
+ }
+
+ HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+ hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+ }
+
+ return hdfsPartialFiles;
+ }
+ }
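The strftime() helper above resolves input_path through an embedded JRuby interpreter. A minimal standalone sketch of that resolution, outside the plugin (the class name StrftimeSketch and the hard-coded values are illustrative only):

```java
import org.jruby.embed.ScriptingContainer;

public class StrftimeSketch
{
    public static void main(String[] args)
    {
        // Mirrors strftime() above: rewind_seconds = 86400 means "now minus one day".
        ScriptingContainer jruby = new ScriptingContainer();
        Object resolved = jruby.runScriptlet(
                "(Time.now - 86400).strftime('/user/embulk/test/%Y-%m-%d/*')");
        System.out.println(resolved); // e.g. /user/embulk/test/2015-09-07/*
    }
}
```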
@@ -0,0 +1,39 @@
+ package org.embulk.input.hdfs;
+
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.List;
+
+ /**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+ public class HdfsFilePartitioner
+ {
+ private FileSystem fs;
+ private Path path;
+ private int partitionCount;
+
+ public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+ {
+ this.fs = fs;
+ this.path = path;
+ this.partitionCount = partitionCount;
+ }
+
+ public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
+ {
+ List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+ long size = fs.getFileStatus(path).getLen();
+ for (int i = 0; i < partitionCount; i++) {
+ long start = size * i / partitionCount;
+ long end = size * (i + 1) / partitionCount;
+ if (start < end) {
+ hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
+ }
+ }
+ return hdfsPartialFiles;
+ }
+ }
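The split points produced by getHdfsPartialFiles() are plain byte offsets. A minimal standalone sketch of the same arithmetic for a 100-byte file split three ways (the class name and numbers are illustrative only):

```java
public class PartitionBoundarySketch
{
    public static void main(String[] args)
    {
        long size = 100;          // file length in bytes
        int partitionCount = 3;   // as passed to HdfsFilePartitioner
        for (int i = 0; i < partitionCount; i++) {
            long start = size * i / partitionCount;       // 0, 33, 66
            long end = size * (i + 1) / partitionCount;   // 33, 66, 100
            System.out.println("[" + start + ", " + end + ")");
        }
    }
}
```

These ranges can land in the middle of a record; aligning reads to line terminators is handled by HdfsPartialFileInputStream below.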
@@ -0,0 +1,40 @@
+ package org.embulk.input.hdfs;
+
+ import org.apache.hadoop.fs.Path;
+
+ /**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+ // ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
+ public class HdfsPartialFile
+ {
+ private String path;
+ private long start;
+ private long end;
+
+ public HdfsPartialFile(String path, long start, long end)
+ {
+ this.path = path;
+ this.start = start;
+ this.end = end;
+ }
+
+ // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
+ public HdfsPartialFile() { }
+
+ public String getPath()
+ {
+ return path;
+ }
+
+ public long getStart()
+ {
+ return start;
+ }
+
+ public long getEnd()
+ {
+ return end;
+ }
+
+ }
@@ -0,0 +1,154 @@
+ package org.embulk.input.hdfs;
+
+ import java.io.BufferedInputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.PushbackInputStream;
+
+ // ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+ public class HdfsPartialFileInputStream extends InputStream
+ {
+ private final PushbackInputStream original;
+ private long start;
+ private long end;
+ private long current;
+ private boolean eof;
+
+ public HdfsPartialFileInputStream(InputStream original, long start, long end)
+ {
+ this.original = new PushbackInputStream(new BufferedInputStream(original));
+ this.start = start;
+ this.end = end;
+ current = -1;
+ }
+
+ @Override
+ public int read(byte[] b) throws IOException
+ {
+ return read(b, 0, b.length);
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException
+ {
+ initializeIfNeeded();
+
+ if (eof) {
+ return -1;
+ }
+
+ int read = original.read(b, off, len);
+ if (read < 0) {
+ eof = true;
+ return -1;
+ }
+
+ current += read;
+ if (current >= end) {
+ for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
+ if (b[off + i] == '\n') {
+ eof = true;
+ return i + 1;
+ }
+
+ if (b[off + i] == '\r') {
+ int next = (i < read ? b[off + i + 1] : prefetch());
+ if (next != '\n') {
+ eof = true;
+ return i + 1;
+ }
+ }
+ }
+ }
+
+ return read;
+ }
+
+ @Override
+ public int read() throws IOException
+ {
+ initializeIfNeeded();
+
+ if (eof) {
+ return -1;
+ }
+
+ int read = original.read();
+ current++;
+
+ if (read < 0) {
+ eof = true;
+ return -1;
+ }
+
+ if (current >= end) {
+ if (read == '\n' || read == '\r' && prefetch() != '\n') {
+ eof = true;
+ }
+ }
+
+ return read;
+ }
+
+ @Override
+ public long skip(long n) throws IOException
+ {
+ throw new IOException("Skip not supported.");
+ /*
+ long skip = original.skip(n);
+ current += skip;
+ return skip;
+ */
+ }
+
+ @Override
+ public int available() throws IOException
+ {
+ return 0;
+ }
+
+ @Override
+ public void close() throws IOException
+ {
+ original.close();
+ }
+
+ private void initializeIfNeeded() throws IOException
+ {
+ if (current >= start) {
+ return;
+
+ }
+ if (start == 0) {
+ current = 0;
+ } else {
+ current = original.skip(--start);
+ if (current != start) {
+ throw new IOException("Cannot skip.");
+ }
+
+ int c;
+ while ((c = original.read()) >= 0) {
+ start++;
+ current++;
+
+ if (c == '\n' || c == '\r' && prefetch() != '\n') {
+ break;
+ }
+ }
+ }
+
+ if (start >= end) {
+ eof = true;
+ }
+ }
+
+ private int prefetch() throws IOException
+ {
+ int c = original.read();
+ if (c >= 0) {
+ original.unread(c);
+ }
+ return c;
+ }
+ }
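A standalone sketch of how these byte ranges become whole lines (the class PartialReadSketch and the sample bytes are illustrative, assuming HdfsPartialFileInputStream is on the classpath): splitting "abc\ndef\ngh" at offset 5, the first reader runs past its nominal end to the next newline and the second reader skips forward past that same newline, so every line is read exactly once.

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.embulk.input.hdfs.HdfsPartialFileInputStream;

public class PartialReadSketch
{
    private static String readAll(HdfsPartialFileInputStream in) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = in.read()) >= 0) {
            sb.append((char) c);
        }
        return sb.toString();
    }

    public static void main(String[] args) throws IOException
    {
        byte[] data = "abc\ndef\ngh".getBytes(StandardCharsets.UTF_8);
        // Nominal ranges [0, 5) and [5, 10); boundaries are adjusted to line breaks.
        String first  = readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 0, 5));
        String second = readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 5, 10));
        System.out.print(first);   // abc\ndef\n
        System.out.print(second);  // gh
    }
}
```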
@@ -1,4 +1,4 @@
- package org.embulk.input;
+ package org.embulk.input.hdfs;

  public class TestHdfsFileInputPlugin
  {
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-input-hdfs
  version: !ruby/object:Gem::Version
- version: 0.0.3
+ version: 0.1.0
  platform: ruby
  authors:
  - takahiro.nakayama
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-08-19 00:00:00.000000000 Z
+ date: 2015-09-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -54,8 +54,11 @@ files:
  - gradlew
  - gradlew.bat
  - lib/embulk/input/hdfs.rb
- - src/main/java/org/embulk/input/HdfsFileInputPlugin.java
- - src/test/java/org/embulk/input/TestHdfsFileInputPlugin.java
+ - src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
+ - src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
+ - src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
+ - src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+ - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
  - classpath/activation-1.1.jar
  - classpath/apacheds-i18n-2.0.0-M15.jar
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +82,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-input-hdfs-0.0.3.jar
+ - classpath/embulk-input-hdfs-0.1.0.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.0.jar
  - classpath/hadoop-auth-2.6.0.jar
Binary file
@@ -1,231 +0,0 @@
- package org.embulk.input;
-
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileStatus;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.embulk.config.*;
- import org.embulk.spi.BufferAllocator;
- import org.embulk.spi.Exec;
- import org.embulk.spi.FileInputPlugin;
- import org.embulk.spi.TransactionalFileInput;
- import org.embulk.spi.util.InputStreamFileInput;
- import org.jruby.embed.ScriptingContainer;
- import org.slf4j.Logger;
-
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Map;
-
- public class HdfsFileInputPlugin implements FileInputPlugin
- {
- private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-
- public interface PluginTask extends Task
- {
- @Config("config_files")
- @ConfigDefault("[]")
- public List<String> getConfigFiles();
-
- @Config("config")
- @ConfigDefault("{}")
- public Map<String, String> getConfig();
-
- @Config("input_path")
- public String getInputPath();
-
- @Config("rewind_seconds")
- @ConfigDefault("0")
- public int getRewindSeconds();
-
- public List<String> getTargetFiles();
- public void setTargetFiles(List<String> targetFiles);
-
- @ConfigInject
- public BufferAllocator getBufferAllocator();
- }
-
- @Override
- public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
- {
- PluginTask task = config.loadConfig(PluginTask.class);
-
- // prepare
- Configuration configuration = getHdfsConfiguration(task);
- FileSystem fs = getFs(configuration);
- Path inputPath = new Path(strftime(task.getInputPath(), task.getRewindSeconds()));
-
- // listing
- List<String> targetFiles;
- try {
- targetFiles = globRecursive(fs, inputPath);
- } catch (IOException e) {
- logger.error(e.getMessage());
- throw new RuntimeException(e);
- }
- logger.info("Loading target files: {}", targetFiles);
- task.setTargetFiles(targetFiles);
-
- // number of processors is same with number of targets
- int taskCount = targetFiles.size();
-
- return resume(task.dump(), taskCount, control);
- }
-
- @Override
- public ConfigDiff resume(TaskSource taskSource,
- int taskCount,
- FileInputPlugin.Control control)
- {
- control.run(taskSource, taskCount);
- return Exec.newConfigDiff();
- }
-
- @Override
- public void cleanup(TaskSource taskSource,
- int taskCount,
- List<TaskReport> successTaskReports)
- {
- }
-
- @Override
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
- {
- PluginTask task = taskSource.loadTask(PluginTask.class);
-
- // prepare
- Configuration configuration = getHdfsConfiguration(task);
- FileSystem fs = getFs(configuration);
-
- return new HdfsFileInput(task, fs, taskIndex);
- }
-
- private Configuration getHdfsConfiguration(final PluginTask task)
- {
- Configuration configuration = new Configuration();
-
- for (Object configFile : task.getConfigFiles()) {
- configuration.addResource(configFile.toString());
- }
- configuration.reloadConfiguration();
-
- for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
- configuration.set(entry.getKey(), entry.getValue());
- }
-
- return configuration;
- }
-
- private FileSystem getFs(final Configuration configuration)
- {
- try {
- FileSystem fs = FileSystem.get(configuration);
- return fs;
- }
- catch (IOException e) {
- logger.error(e.getMessage());
- throw new RuntimeException(e);
- }
- }
-
- private String strftime(final String raw, final int rewind_seconds)
- {
- ScriptingContainer jruby = new ScriptingContainer();
- Object resolved = jruby.runScriptlet(
- String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
- return resolved.toString();
- }
-
- private List<String> globRecursive(final FileSystem fs, final Path hdfsPath) throws IOException
- {
- List<String> container = new ArrayList<String>();
- for (FileStatus entry : fs.globStatus(hdfsPath)) {
- if (entry.isDirectory()) {
- container.addAll(listRecursive(fs, entry));
- }
- else {
- container.add(entry.getPath().toString());
- }
- }
- return container;
- }
-
- private List<String> listRecursive(final FileSystem fs, FileStatus status) throws IOException {
- List<String> container = new ArrayList<String>();
- if (status.isDirectory()) {
- for (FileStatus entry : fs.listStatus(status.getPath())) {
- container.addAll(listRecursive(fs, entry));
- }
- }
- else {
- container.add(status.getPath().toString());
- }
- return container;
- }
-
-
-
- // private List<String> listUniquify(List<String> stringList)
- // {
- // Set<String> set = new HashSet<String>();
- // set.addAll(stringList);
- // List<String> uniqueStringList = new ArrayList<String>();
- // uniqueStringList.addAll(set);
- // return uniqueStringList;
- // }
-
- public static class HdfsFileInput extends InputStreamFileInput implements TransactionalFileInput
- {
- private static class HdfsFileProvider implements InputStreamFileInput.Provider
- {
- private final FileSystem fs;
- private final Path hdfsPath;
- private boolean opened = false;
-
- public HdfsFileProvider(PluginTask task, FileSystem fs, int taskIndex)
- {
- this.fs = fs;
- this.hdfsPath = new Path(task.getTargetFiles().get(taskIndex));
- }
-
- @Override
- public InputStream openNext() throws IOException
- {
- if (opened) {
- return null;
- }
-
- opened = true;
- return fs.open(hdfsPath);
- }
-
- @Override
- public void close()
- {
- }
- }
-
- public HdfsFileInput(PluginTask task, FileSystem fs, int taskIndex)
- {
- super(task.getBufferAllocator(), new HdfsFileProvider(task, fs, taskIndex));
- }
-
- @Override
- public void close()
- {
- }
-
- @Override
- public void abort()
- {
- }
-
- @Override
- public TaskReport commit()
- {
- return Exec.newTaskReport();
- }
- }
- }