embulk-output-hdfs 0.1.2 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 84d24b454173e755db38f3b87e8fb5808d4788b2
-  data.tar.gz: 2f031bc26b79c4483e1cea88133251c199e51f37
+  metadata.gz: e23e49bd7a7a7cb1587c929faf9979fe3ae5cf94
+  data.tar.gz: 13833b306d2d69da411177c6f4a094955b572f79
 SHA512:
-  metadata.gz: 3db39115ff402039d21530cb7251f2def2c4e4f5ac1aeb735dc69e6f444d35390be6f560d452d69318b7025de70a421d8fd62cae4dff19e78002b5c670cdfd3e
-  data.tar.gz: c4ebc62b415d33caad0713d41d5277c85b033467486a4a680cc109fd2fe2f1e6e2975034b2de046fba8136345bae112a64dcf501646d149ddd8860b0fca19091
+  metadata.gz: 6ac5ec5966dd880ed625d85afdaa8bd587dcf03aadfa4c2464f5bb78034dea247d263197ced5a1130d3d394a801dd9ea3d5a91180973cc4ced4dd43c332c1a10
+  data.tar.gz: 82f2d8d22029f356925e7412d9a891317d5b6c4fa667787ce2df46fd7195db4aaf813410c705e590ba21958fcb9242a81ca6b4e04d2a4904c4770eacf23f95eb
data/.gitignore CHANGED
@@ -6,3 +6,6 @@
 /classpath/
 build/
 .idea
+*.iml
+.ruby-version
+
data/README.md CHANGED
@@ -1,11 +1,11 @@
-# Hdfs output plugin for Embulk
+# Hdfs file output plugin for Embulk
 
 A File Output Plugin for Embulk to write HDFS.
 
 ## Overview
 
 * **Plugin type**: file output
-* **Load all or nothing**: no
+* **Load all or nothing**: yes
 * **Resume supported**: no
 * **Cleanup supported**: no
 
@@ -13,8 +13,12 @@ A File Output Plugin for Embulk to write HDFS.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **output_path** the path finally stored files. (string, default: `"/tmp/embulk.output.hdfs_output.%Y%m%d_%s"`)
-- **working_path** the path temporary stored files. (string, default: `"/tmp/embulk.working.hdfs_output.%Y%m%d_%s"`)
+- **path_prefix** prefix of target files (string, required)
+- **file_ext** suffix of target files (string, required)
+- **sequence_format** format for the sequence part of target file names (string, default: `'.%03d.%02d'`)
+- **rewind_seconds** when path_prefix contains a date format (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this many seconds (int, default: `0`)
+- **overwrite** overwrite files when files with the same names already exist (boolean, default: `false`)
+  - *caution*: even if this property is `true`, it does not guarantee idempotence; to ensure idempotence, remove the output files before or after each run.
 
 ## Example
 
@@ -24,14 +28,13 @@ out:
   config_files:
     - /etc/hadoop/conf/core-site.xml
     - /etc/hadoop/conf/hdfs-site.xml
-    - /etc/hadoop/conf/mapred-site.xml
-    - /etc/hadoop/conf/yarn-site.xml
   config:
     fs.defaultFS: 'hdfs://hdp-nn1:8020'
-    dfs.replication: 1
-    mapreduce.client.submit.file.replication: 1
    fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+  path_prefix: '/tmp/embulk/hdfs_output/%Y-%m-%d/out'
+  file_ext: 'txt'
+  overwrite: true
   formatter:
     type: csv
     encoding: UTF-8
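
How the new naming options fit together: below is a minimal standalone sketch (not from the gem; the class name and sample values are hypothetical) of how `path_prefix`, `sequence_format`, and `file_ext` combine into one HDFS path per output file, mirroring the `nextFile()` logic in the plugin source further down. `overwrite` is handed straight to Hadoop's `FileSystem.create(path, overwrite)`.

public class PathAssemblySketch
{
    public static void main(String[] args)
    {
        // path_prefix after strftime resolution ('%Y-%m-%d' already expanded
        // using Time.now minus rewind_seconds)
        String resolvedPrefix = "/tmp/embulk/hdfs_output/2015-09-20/out";
        String sequenceFormat = "%03d.%02d.";  // default declared in the plugin source
        String fileExt = "txt";

        int taskIndex = 1;  // index of the Embulk task
        int fileIndex = 0;  // per-task counter, incremented on every nextFile() call

        // pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + pathSuffix
        String path = resolvedPrefix
                + String.format(sequenceFormat, taskIndex, fileIndex)
                + fileExt;
        System.out.println(path);
        // -> /tmp/embulk/hdfs_output/2015-09-20/out001.00.txt
    }
}

Note that the README above documents the default sequence_format as `'.%03d.%02d'` while the plugin source declares `"%03d.%02d."`; the sketch uses the source's default.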
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.1.2"
+version = "0.2.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
     provided "org.embulk:embulk-core:0.7.0"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
-    compile 'com.google.guava:guava:14.0'
+    compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
 }
 
@@ -57,9 +57,9 @@ task gemspec {
         Gem::Specification.new do |spec|
             spec.name = "${project.name}"
             spec.version = "${project.version}"
-            spec.authors = ["takahiro.nakayama"]
-            spec.summary = %[Hdfs output plugin for Embulk]
-            spec.description = %[Dumps records to Hdfs.]
+            spec.authors = ["Civitaspo"]
+            spec.summary = %[Hdfs file output plugin for Embulk]
+            spec.description = %[Stores files on Hdfs.]
             spec.email = ["civitaspo@gmail.com"]
             spec.licenses = ["MIT"]
             spec.homepage = "https://github.com/civitaspo/embulk-output-hdfs"
data/lib/embulk/output/hdfs.rb CHANGED
@@ -1,3 +1,3 @@
 Embulk::JavaPlugin.register_output(
-  "hdfs", "org.embulk.output.HdfsOutputPlugin",
+  "hdfs", "org.embulk.output.hdfs.HdfsFileOutputPlugin",
   File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java ADDED
@@ -0,0 +1,198 @@
+package org.embulk.output.hdfs;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.Buffer;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileOutputPlugin;
+import org.embulk.spi.TransactionalFileOutput;
+import org.jruby.embed.ScriptingContainer;
+import org.slf4j.Logger;
+
+public class HdfsFileOutputPlugin
+        implements FileOutputPlugin
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileOutputPlugin.class);
+
+    public interface PluginTask
+            extends Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        public List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        public Map<String, String> getConfig();
+
+        @Config("path_prefix")
+        public String getPathPrefix();
+
+        @Config("file_ext")
+        public String getFileNameExtension();
+
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        public String getSequenceFormat();
+
+        @Config("rewind_seconds")
+        @ConfigDefault("0")
+        public int getRewindSeconds();
+
+        @Config("overwrite")
+        @ConfigDefault("false")
+        public boolean getOverwrite();
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config, int taskCount,
+                                  FileOutputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+
+        control.run(task.dump());
+        return Exec.newConfigDiff();
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileOutputPlugin.Control control)
+    {
+        throw new UnsupportedOperationException("hdfs output plugin does not support resuming");
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileOutput open(TaskSource taskSource, final int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        final String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
+        final String pathSuffix = task.getFileNameExtension();
+        final String sequenceFormat = task.getSequenceFormat();
+
+        return new TransactionalFileOutput()
+        {
+            private final List<String> hdfsFileNames = new ArrayList<>();
+            private int fileIndex = 0;
+            private OutputStream output = null;
+
+            @Override
+            public void nextFile()
+            {
+                closeCurrentStream();
+                Path path = new Path(pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + pathSuffix);
+                try {
+                    FileSystem fs = getFs(task);
+                    output = fs.create(path, task.getOverwrite());
+                    logger.info("Uploading '{}'", path);
+                }
+                catch (IOException e) {
+                    logger.error(e.getMessage());
+                    throw new RuntimeException(e);
+                }
+                hdfsFileNames.add(path.toString());
+                fileIndex++;
+            }
+
+            @Override
+            public void add(Buffer buffer)
+            {
+                try {
+                    output.write(buffer.array(), buffer.offset(), buffer.limit());
+                }
+                catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+                finally {
+                    buffer.release();
+                }
+            }
+
+            @Override
+            public void finish()
+            {
+                closeCurrentStream();
+            }
+
+            @Override
+            public void close()
+            {
+                closeCurrentStream();
+            }
+
+            @Override
+            public void abort()
+            {
+            }
+
+            @Override
+            public TaskReport commit()
+            {
+                TaskReport report = Exec.newTaskReport();
+                report.set("hdfs_file_names", hdfsFileNames);
+                return report;
+            }
+
+            private void closeCurrentStream()
+            {
+                if (output != null) {
+                    try {
+                        output.close();
+                        output = null;
+                    }
+                    catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+            }
+        };
+    }
+
+    private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        Configuration configuration = new Configuration();
+
+        for (Object configFile : task.getConfigFiles()) {
+            configuration.addResource(configFile.toString());
+        }
+        configuration.reloadConfiguration();
+
+        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+
+        return FileSystem.get(configuration);
+    }
+
+    private String strftime(final String raw, final int rewind_seconds)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object resolved = jruby.runScriptlet(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+        return resolved.toString();
+    }
+}
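
The `strftime` helper at the end of the file above is what applies `rewind_seconds` to the date tokens in `path_prefix`: it evaluates Ruby's `(Time.now - rewind_seconds).strftime(...)` in an embedded JRuby interpreter. Below is a minimal standalone sketch of that same call (hypothetical class name and values; assumes JRuby, which Embulk embeds, is on the classpath):

import org.jruby.embed.ScriptingContainer;

public class StrftimeSketch
{
    public static void main(String[] args)
    {
        // Hypothetical inputs: shift "now" back by one day before formatting.
        int rewindSeconds = 86400;
        String raw = "/tmp/embulk/hdfs_output/%Y-%m-%d/out";

        // Run Ruby's (Time.now - rewindSeconds).strftime(raw) via embedded JRuby,
        // as the plugin's strftime() helper does; the Ruby interpreter, not Java,
        // expands the %Y/%m/%d tokens.
        ScriptingContainer jruby = new ScriptingContainer();
        Object resolved = jruby.runScriptlet(
                String.format("(Time.now - %d).strftime('%s')", rewindSeconds, raw));

        System.out.println(resolved);  // e.g. /tmp/embulk/hdfs_output/2015-09-19/out
    }
}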
data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java ADDED
@@ -0,0 +1,5 @@
+package org.embulk.output.hdfs;
+
+public class TestHdfsFileOutputPlugin
+{
+}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
-- takahiro.nakayama
+- Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-19 00:00:00.000000000 Z
+date: 2015-09-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -38,7 +38,7 @@ dependencies:
       version: '10.0'
   prerelease: false
   type: :development
-description: Dumps records to Hdfs.
+description: Stores files on Hdfs.
 email:
 - civitaspo@gmail.com
 executables: []
@@ -54,8 +54,8 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/output/hdfs.rb
-- src/main/java/org/embulk/output/HdfsOutputPlugin.java
-- src/test/java/org/embulk/output/TestHdfsOutputPlugin.java
+- src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java
+- src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +79,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-output-hdfs-0.1.2.jar
+- classpath/embulk-output-hdfs-0.2.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
@@ -151,5 +151,5 @@ rubyforge_project:
 rubygems_version: 2.1.9
 signing_key:
 specification_version: 4
-summary: Hdfs output plugin for Embulk
+summary: Hdfs file output plugin for Embulk
 test_files: []
Binary file (classpath/embulk-output-hdfs-0.1.2.jar → classpath/embulk-output-hdfs-0.2.0.jar); diff not shown
data/src/main/java/org/embulk/output/HdfsOutputPlugin.java DELETED
@@ -1,219 +0,0 @@
-package org.embulk.output;
-
-import com.google.common.base.Throwables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.embulk.config.*;
-import org.embulk.spi.Buffer;
-import org.embulk.spi.Exec;
-import org.embulk.spi.FileOutputPlugin;
-import org.embulk.spi.TransactionalFileOutput;
-import org.jruby.embed.ScriptingContainer;
-import org.slf4j.Logger;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.List;
-import java.util.Map;
-
-public class HdfsOutputPlugin implements FileOutputPlugin
-{
-    private static final Logger logger = Exec.getLogger(HdfsOutputPlugin.class);
-
-    public interface PluginTask extends Task
-    {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        public List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        public Map<String, String> getConfig();
-
-        @Config("sequence_format")
-        @ConfigDefault("\"%03d.%02d\"")
-        public String getSequenceFormat();
-
-        @Config("output_path")
-        @ConfigDefault("\"/tmp/embulk.output.hdfs_output.%Y%m%d_%s\"")
-        public String getOutputPath();
-
-        @Config("working_path")
-        @ConfigDefault("\"/tmp/embulk.working.hdfs_output.%Y%m%d_%s\"")
-        public String getWorkingPath();
-    }
-
-    @Override
-    public ConfigDiff transaction(ConfigSource config,
-                                  int taskCount,
-                                  FileOutputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-        return resume(task.dump(), taskCount, control);
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileOutputPlugin.Control control)
-    {
-        control.run(taskSource);
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-                        int taskCount,
-                        List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalFileOutput open(TaskSource taskSource, final int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-        String workingPath = strftime(task.getWorkingPath());
-        String outputPath = strftime(task.getOutputPath());
-        return new TransactionalHdfsFileOutput(task, fs, workingPath, outputPath, taskIndex);
-    }
-
-    private Configuration getHdfsConfiguration(final PluginTask task)
-    {
-        Configuration configuration = new Configuration();
-
-        List configFiles = task.getConfigFiles();
-        for (Object configFile : configFiles) {
-            configuration.addResource(configFile.toString());
-        }
-
-        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-
-        return configuration;
-    }
-
-    private FileSystem getFs(final Configuration configuration) {
-        try {
-            FileSystem fs = FileSystem.get(configuration);
-            return fs;
-        }
-        catch (IOException e) {
-            logger.error(e.getMessage());
-            throw Throwables.propagate(e);
-        }
-    }
-
-    private String strftime(final String path)
-    {
-        // strftime
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object result = jruby.runScriptlet("Time.now.strftime('" + path + "')");
-        return result.toString();
-    }
-
-    static class TransactionalHdfsFileOutput implements TransactionalFileOutput
-    {
-        private final int taskIndex;
-        private final FileSystem fs;
-        private final String workingPath;
-        private final String outputPath;
-        private final String sequenceFormat;
-
-        private int fileIndex = 0;
-        private int callCount = 0;
-        private Path currentPath = null;
-        private OutputStream currentStream = null;
-
-        public TransactionalHdfsFileOutput(PluginTask task, FileSystem fs, String workingPath, String outputPath, int taskIndex)
-        {
-            this.taskIndex = taskIndex;
-            this.fs = fs;
-            this.workingPath = workingPath;
-            this.outputPath = outputPath;
-            this.sequenceFormat = task.getSequenceFormat();
-        }
-
-        public void nextFile() {
-            closeCurrentStream();
-            currentPath = new Path(workingPath + '/' + String.format(sequenceFormat, taskIndex, fileIndex));
-            try {
-                if (fs.exists(currentPath)) {
-                    throw new IllegalAccessException(currentPath.toString() + "already exists.");
-                }
-                currentStream = fs.create(currentPath);
-                logger.info("Uploading '{}'", currentPath.toString());
-            }
-            catch (IOException | IllegalAccessException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-            fileIndex++;
-        }
-
-        @Override
-        public void add(Buffer buffer) {
-            if (currentStream == null) {
-                throw new IllegalStateException("nextFile() must be called before poll()");
-            }
-            try {
-                logger.debug("#add called {} times for taskIndex {}", callCount, taskIndex);
-                currentStream.write(buffer.array(), buffer.offset(), buffer.limit());
-                callCount++;
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            } finally {
-                buffer.release();
-            }
-        }
-
-        @Override
-        public void finish() {
-            closeCurrentStream();
-        }
-
-        @Override
-        public void close() {
-            closeCurrentStream();
-        }
-
-        @Override
-        public void abort() {
-        }
-
-        @Override
-        public TaskReport commit() {
-            try {
-                fs.rename(new Path(workingPath), new Path(outputPath));
-                logger.info("rename {} => {}", workingPath, outputPath);
-            } catch (IOException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-
-            TaskReport report = Exec.newTaskReport();
-            report.set("files", currentPath);
-            return report;
-        }
-
-        private void closeCurrentStream() {
-            try {
-                if (currentStream != null) {
-                    currentStream.close();
-                    currentStream = null;
-                }
-
-                callCount = 0;
-            } catch (IOException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-        }
-    }
-}
data/src/test/java/org/embulk/output/TestHdfsOutputPlugin.java DELETED
@@ -1,5 +0,0 @@
-package org.embulk.output;
-
-public class TestHdfsOutputPlugin
-{
-}