embulk-output-hdfs 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 84d24b454173e755db38f3b87e8fb5808d4788b2
-  data.tar.gz: 2f031bc26b79c4483e1cea88133251c199e51f37
+  metadata.gz: e23e49bd7a7a7cb1587c929faf9979fe3ae5cf94
+  data.tar.gz: 13833b306d2d69da411177c6f4a094955b572f79
 SHA512:
-  metadata.gz: 3db39115ff402039d21530cb7251f2def2c4e4f5ac1aeb735dc69e6f444d35390be6f560d452d69318b7025de70a421d8fd62cae4dff19e78002b5c670cdfd3e
-  data.tar.gz: c4ebc62b415d33caad0713d41d5277c85b033467486a4a680cc109fd2fe2f1e6e2975034b2de046fba8136345bae112a64dcf501646d149ddd8860b0fca19091
+  metadata.gz: 6ac5ec5966dd880ed625d85afdaa8bd587dcf03aadfa4c2464f5bb78034dea247d263197ced5a1130d3d394a801dd9ea3d5a91180973cc4ced4dd43c332c1a10
+  data.tar.gz: 82f2d8d22029f356925e7412d9a891317d5b6c4fa667787ce2df46fd7195db4aaf813410c705e590ba21958fcb9242a81ca6b4e04d2a4904c4770eacf23f95eb
data/.gitignore CHANGED
@@ -6,3 +6,6 @@
 /classpath/
 build/
 .idea
+*.iml
+.ruby-version
+
data/README.md CHANGED
@@ -1,11 +1,11 @@
-# Hdfs output plugin for Embulk
+# Hdfs file output plugin for Embulk
 
 A File Output Plugin for Embulk to write to HDFS.
 
 ## Overview
 
 * **Plugin type**: file output
-* **Load all or nothing**: no
+* **Load all or nothing**: yes
 * **Resume supported**: no
 * **Cleanup supported**: no
 
@@ -13,8 +13,12 @@ A File Output Plugin for Embulk to write to HDFS.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **output_path** the path where files are finally stored (string, default: `"/tmp/embulk.output.hdfs_output.%Y%m%d_%s"`)
-- **working_path** the path where files are temporarily stored (string, default: `"/tmp/embulk.working.hdfs_output.%Y%m%d_%s"`)
+- **path_prefix** prefix of the target files (string, required)
+- **file_ext** suffix of the target files (string, required)
+- **sequence_format** format for the sequence part of the target file names (string, default: `'.%03d.%02d'`)
+- **rewind_seconds** when you use a date format in the path_prefix property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this many seconds (int, default: `0`; see the worked sketch after the Example section below)
+- **overwrite** overwrite files when the same file names already exist (boolean, default: `false`)
+  - *caution*: even if this property is `true`, it does not guarantee idempotence; if you need idempotent runs, remove the output files before or after each run.
 
 ## Example
 
@@ -24,14 +28,13 @@ out:
   config_files:
     - /etc/hadoop/conf/core-site.xml
     - /etc/hadoop/conf/hdfs-site.xml
-    - /etc/hadoop/conf/mapred-site.xml
-    - /etc/hadoop/conf/yarn-site.xml
   config:
     fs.defaultFS: 'hdfs://hdp-nn1:8020'
-    dfs.replication: 1
-    mapreduce.client.submit.file.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+  path_prefix: '/tmp/embulk/hdfs_output/%Y-%m-%d/out'
+  file_ext: 'txt'
+  overwrite: true
   formatter:
     type: csv
     encoding: UTF-8
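
The `path_prefix` in this example contains strftime placeholders, and `rewind_seconds` shifts the time used to resolve them. The sketch below mirrors the `strftime` helper of `HdfsFileOutputPlugin` later in this diff; it is illustrative only (the class name and the 86400-second value are made up), and assumes `org.jruby:jruby-complete` is on the classpath:

```java
import org.jruby.embed.ScriptingContainer;

// Minimal sketch of how path_prefix is resolved: the plugin delegates to
// JRuby's Time#strftime, using "now" minus rewind_seconds.
public class RewindSecondsExample
{
    public static void main(String[] args)
    {
        int rewindSeconds = 86400; // hypothetical: rewind one day
        String pathPrefix = "/tmp/embulk/hdfs_output/%Y-%m-%d/out";

        ScriptingContainer jruby = new ScriptingContainer();
        Object resolved = jruby.runScriptlet(
                String.format("(Time.now - %d).strftime('%s')", rewindSeconds, pathPrefix));

        // A run on 2015-09-20 would print /tmp/embulk/hdfs_output/2015-09-19/out
        System.out.println(resolved);
    }
}
```

Rewinding by a day like this lets a job that runs shortly after midnight keep writing into the previous day's directory.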
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.1.2"
+version = "0.2.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
     provided "org.embulk:embulk-core:0.7.0"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
-    compile 'com.google.guava:guava:14.0'
+    compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
 }
 
@@ -57,9 +57,9 @@ task gemspec {
     Gem::Specification.new do |spec|
         spec.name          = "${project.name}"
         spec.version       = "${project.version}"
-        spec.authors       = ["takahiro.nakayama"]
-        spec.summary       = %[Hdfs output plugin for Embulk]
-        spec.description   = %[Dumps records to Hdfs.]
+        spec.authors       = ["Civitaspo"]
+        spec.summary       = %[Hdfs file output plugin for Embulk]
+        spec.description   = %[Stores files on Hdfs.]
         spec.email         = ["civitaspo@gmail.com"]
         spec.licenses      = ["MIT"]
         spec.homepage      = "https://github.com/civitaspo/embulk-output-hdfs"
data/lib/embulk/output/hdfs.rb CHANGED
@@ -1,3 +1,3 @@
 Embulk::JavaPlugin.register_output(
-  "hdfs", "org.embulk.output.HdfsOutputPlugin",
+  "hdfs", "org.embulk.output.hdfs.HdfsFileOutputPlugin",
   File.expand_path('../../../../classpath', __FILE__))
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java ADDED
@@ -0,0 +1,198 @@
+package org.embulk.output.hdfs;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.Buffer;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileOutputPlugin;
+import org.embulk.spi.TransactionalFileOutput;
+import org.jruby.embed.ScriptingContainer;
+import org.slf4j.Logger;
+
+public class HdfsFileOutputPlugin
+        implements FileOutputPlugin
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileOutputPlugin.class);
+
+    public interface PluginTask
+            extends Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        public List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        public Map<String, String> getConfig();
+
+        @Config("path_prefix")
+        public String getPathPrefix();
+
+        @Config("file_ext")
+        public String getFileNameExtension();
+
+        @Config("sequence_format")
+        @ConfigDefault("\"%03d.%02d.\"")
+        public String getSequenceFormat();
+
+        @Config("rewind_seconds")
+        @ConfigDefault("0")
+        public int getRewindSeconds();
+
+        @Config("overwrite")
+        @ConfigDefault("false")
+        public boolean getOverwrite();
+
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config, int taskCount,
+                                  FileOutputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+
+        control.run(task.dump());
+        return Exec.newConfigDiff();
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileOutputPlugin.Control control)
+    {
+        throw new UnsupportedOperationException("hdfs output plugin does not support resuming");
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileOutput open(TaskSource taskSource, final int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        final String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
+        final String pathSuffix = task.getFileNameExtension();
+        final String sequenceFormat = task.getSequenceFormat();
+
+        return new TransactionalFileOutput()
+        {
+            private final List<String> hdfsFileNames = new ArrayList<>();
+            private int fileIndex = 0;
+            private OutputStream output = null;
+
+            @Override
+            public void nextFile()
+            {
+                closeCurrentStream();
+                Path path = new Path(pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + pathSuffix);
+                try {
+                    FileSystem fs = getFs(task);
+                    output = fs.create(path, task.getOverwrite());
+                    logger.info("Uploading '{}'", path);
+                }
+                catch (IOException e) {
+                    logger.error(e.getMessage());
+                    throw new RuntimeException(e);
+                }
+                hdfsFileNames.add(path.toString());
+                fileIndex++;
+            }
+
+            @Override
+            public void add(Buffer buffer)
+            {
+                try {
+                    output.write(buffer.array(), buffer.offset(), buffer.limit());
+                }
+                catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+                finally {
+                    buffer.release();
+                }
+            }
+
+            @Override
+            public void finish()
+            {
+                closeCurrentStream();
+            }
+
+            @Override
+            public void close()
+            {
+                closeCurrentStream();
+            }
+
+            @Override
+            public void abort()
+            {
+            }
+
+            @Override
+            public TaskReport commit()
+            {
+                TaskReport report = Exec.newTaskReport();
+                report.set("hdfs_file_names", hdfsFileNames);
+                return report;
+            }
+
+            private void closeCurrentStream()
+            {
+                if (output != null) {
+                    try {
+                        output.close();
+                        output = null;
+                    }
+                    catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+            }
+        };
+    }
+
+    private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        Configuration configuration = new Configuration();
+
+        for (Object configFile : task.getConfigFiles()) {
+            configuration.addResource(configFile.toString());
+        }
+        configuration.reloadConfiguration();
+
+        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+
+        return FileSystem.get(configuration);
+    }
+
+    private String strftime(final String raw, final int rewind_seconds)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object resolved = jruby.runScriptlet(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+        return resolved.toString();
+    }
+}
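
As `nextFile()` above shows, each output path is composed as `pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + pathSuffix`, where `pathPrefix` has already been strftime-resolved. A minimal sketch of that composition (class name and sample values are illustrative):

```java
// Minimal sketch of the path composition done in nextFile() above.
public class PathCompositionExample
{
    public static void main(String[] args)
    {
        String pathPrefix = "/tmp/embulk/hdfs_output/2015-09-20/out"; // path_prefix after strftime resolution
        String sequenceFormat = "%03d.%02d."; // in-code default: taskIndex, then fileIndex
        String fileExt = "txt";               // file_ext
        int taskIndex = 1;
        int fileIndex = 0;

        String path = pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + fileExt;
        // Prints: /tmp/embulk/hdfs_output/2015-09-20/out001.00.txt
        System.out.println(path);
    }
}
```

Note that the README above documents a `sequence_format` default of `'.%03d.%02d'`, while the code's `@ConfigDefault` is `"%03d.%02d."`; the sketch uses the latter.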
data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java ADDED
@@ -0,0 +1,5 @@
+package org.embulk.output.hdfs;
+
+public class TestHdfsFileOutputPlugin
+{
+}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
-- takahiro.nakayama
+- Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-19 00:00:00.000000000 Z
+date: 2015-09-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -38,7 +38,7 @@ dependencies:
         version: '10.0'
   prerelease: false
   type: :development
-description: Dumps records to Hdfs.
+description: Stores files on Hdfs.
 email:
 - civitaspo@gmail.com
 executables: []
@@ -54,8 +54,8 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/output/hdfs.rb
-- src/main/java/org/embulk/output/HdfsOutputPlugin.java
-- src/test/java/org/embulk/output/TestHdfsOutputPlugin.java
+- src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java
+- src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +79,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-output-hdfs-0.1.2.jar
+- classpath/embulk-output-hdfs-0.2.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
@@ -151,5 +151,5 @@ rubyforge_project:
 rubygems_version: 2.1.9
 signing_key:
 specification_version: 4
-summary: Hdfs output plugin for Embulk
+summary: Hdfs file output plugin for Embulk
 test_files: []
data/classpath/embulk-output-hdfs-0.2.0.jar (replaces embulk-output-hdfs-0.1.2.jar)
Binary file; no diff shown
data/src/main/java/org/embulk/output/HdfsOutputPlugin.java DELETED
@@ -1,219 +0,0 @@
-package org.embulk.output;
-
-import com.google.common.base.Throwables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.embulk.config.*;
-import org.embulk.spi.Buffer;
-import org.embulk.spi.Exec;
-import org.embulk.spi.FileOutputPlugin;
-import org.embulk.spi.TransactionalFileOutput;
-import org.jruby.embed.ScriptingContainer;
-import org.slf4j.Logger;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.List;
-import java.util.Map;
-
-public class HdfsOutputPlugin implements FileOutputPlugin
-{
-    private static final Logger logger = Exec.getLogger(HdfsOutputPlugin.class);
-
-    public interface PluginTask extends Task
-    {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        public List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        public Map<String, String> getConfig();
-
-        @Config("sequence_format")
-        @ConfigDefault("\"%03d.%02d\"")
-        public String getSequenceFormat();
-
-        @Config("output_path")
-        @ConfigDefault("\"/tmp/embulk.output.hdfs_output.%Y%m%d_%s\"")
-        public String getOutputPath();
-
-        @Config("working_path")
-        @ConfigDefault("\"/tmp/embulk.working.hdfs_output.%Y%m%d_%s\"")
-        public String getWorkingPath();
-
-    }
-
-    @Override
-    public ConfigDiff transaction(ConfigSource config,
-                                  int taskCount,
-                                  FileOutputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-        return resume(task.dump(), taskCount, control);
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileOutputPlugin.Control control)
-    {
-        control.run(taskSource);
-        return Exec.newConfigDiff();
-    }
-
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-                        int taskCount,
-                        List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalFileOutput open(TaskSource taskSource, final int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-        String workingPath = strftime(task.getWorkingPath());
-        String outputPath = strftime(task.getOutputPath());
-        return new TransactionalHdfsFileOutput(task, fs, workingPath, outputPath, taskIndex);
-    }
-
-    private Configuration getHdfsConfiguration(final PluginTask task)
-    {
-        Configuration configuration = new Configuration();
-
-        List configFiles = task.getConfigFiles();
-        for (Object configFile : configFiles) {
-            configuration.addResource(configFile.toString());
-        }
-
-        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-
-        return configuration;
-    }
-
-    private FileSystem getFs(final Configuration configuration) {
-        try {
-            FileSystem fs = FileSystem.get(configuration);
-            return fs;
-        }
-        catch (IOException e) {
-            logger.error(e.getMessage());
-            throw Throwables.propagate(e);
-        }
-    }
-
-    private String strftime(final String path)
-    {
-        // strftime
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object result = jruby.runScriptlet("Time.now.strftime('" + path + "')");
-        return result.toString();
-    }
-
-    static class TransactionalHdfsFileOutput implements TransactionalFileOutput
-    {
-        private final int taskIndex;
-        private final FileSystem fs;
-        private final String workingPath;
-        private final String outputPath;
-        private final String sequenceFormat;
-
-        private int fileIndex = 0;
-        private int callCount = 0;
-        private Path currentPath = null;
-        private OutputStream currentStream = null;
-
-        public TransactionalHdfsFileOutput(PluginTask task, FileSystem fs, String workingPath, String outputPath, int taskIndex)
-        {
-            this.taskIndex = taskIndex;
-            this.fs = fs;
-            this.workingPath = workingPath;
-            this.outputPath = outputPath;
-            this.sequenceFormat = task.getSequenceFormat();
-        }
-
-        public void nextFile() {
-            closeCurrentStream();
-            currentPath = new Path(workingPath + '/' + String.format(sequenceFormat, taskIndex, fileIndex));
-            try {
-                if (fs.exists(currentPath)) {
-                    throw new IllegalAccessException(currentPath.toString() + " already exists.");
-                }
-                currentStream = fs.create(currentPath);
-                logger.info("Uploading '{}'", currentPath.toString());
-            }
-            catch (IOException | IllegalAccessException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-            fileIndex++;
-        }
-
-        @Override
-        public void add(Buffer buffer) {
-            if (currentStream == null) {
-                throw new IllegalStateException("nextFile() must be called before poll()");
-            }
-            try {
-                logger.debug("#add called {} times for taskIndex {}", callCount, taskIndex);
-                currentStream.write(buffer.array(), buffer.offset(), buffer.limit());
-                callCount++;
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            } finally {
-                buffer.release();
-            }
-        }
-
-        @Override
-        public void finish() {
-            closeCurrentStream();
-        }
-
-        @Override
-        public void close() {
-            closeCurrentStream();
-        }
-
-        @Override
-        public void abort() {
-        }
-
-        @Override
-        public TaskReport commit() {
-            try {
-                fs.rename(new Path(workingPath), new Path(outputPath));
-                logger.info("rename {} => {}", workingPath, outputPath);
-            } catch (IOException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-
-            TaskReport report = Exec.newTaskReport();
-            report.set("files", currentPath);
-            return report;
-        }
-
-        private void closeCurrentStream() {
-            try {
-                if (currentStream != null) {
-                    currentStream.close();
-                    currentStream = null;
-                }
-
-                callCount = 0;
-            } catch (IOException e) {
-                logger.error(e.getMessage());
-                throw Throwables.propagate(e);
-            }
-        }
-    }
-}
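
For contrast with the new plugin: the deleted implementation staged files under `working_path` and published them in `commit()` with a single `fs.rename(workingPath, outputPath)`, whereas the new one writes directly to the final paths and relies on the `overwrite` flag. A minimal sketch of that stage-then-rename pattern (paths and class name are hypothetical, assuming `hadoop-client` is on the classpath):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch of the stage-then-rename pattern the removed plugin used:
// write everything under a working directory, then publish it with a single
// rename at commit time.
public class StageAndRenameExample
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path working = new Path("/tmp/embulk.working.example");
        Path output = new Path("/tmp/embulk.output.example");

        fs.mkdirs(working);
        // ... tasks write their files under `working` here ...
        fs.rename(working, output); // publish: a cheap metadata operation on HDFS
        fs.close();
    }
}
```

On HDFS a directory rename is a metadata-only operation, which is what made this publish step close to all-or-nothing.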
data/src/test/java/org/embulk/output/TestHdfsOutputPlugin.java DELETED
@@ -1,5 +0,0 @@
-package org.embulk.output;
-
-public class TestHdfsOutputPlugin
-{
-}