embulk-output-hdfs 0.2.4 → 0.3.0

Files changed (30)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -1
  3. data/CHANGELOG.md +9 -0
  4. data/README.md +38 -9
  5. data/build.gradle +10 -8
  6. data/example/config.yml +3 -1
  7. data/example/config_deprecated_option.yml +52 -0
  8. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  9. data/gradle/wrapper/gradle-wrapper.properties +1 -2
  10. data/gradlew +43 -35
  11. data/gradlew.bat +4 -10
  12. data/settings.gradle +1 -0
  13. data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java +160 -0
  14. data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java +55 -175
  15. data/src/main/java/org/embulk/output/hdfs/ModeTask.java +111 -0
  16. data/src/main/java/org/embulk/output/hdfs/client/HdfsClient.java +269 -0
  17. data/src/main/java/org/embulk/output/hdfs/compat/ModeCompat.java +76 -0
  18. data/src/main/java/org/embulk/output/hdfs/transaction/AbortIfExistTx.java +6 -0
  19. data/src/main/java/org/embulk/output/hdfs/transaction/AbstractTx.java +53 -0
  20. data/src/main/java/org/embulk/output/hdfs/transaction/ControlRun.java +10 -0
  21. data/src/main/java/org/embulk/output/hdfs/transaction/DeleteFilesInAdvanceTx.java +22 -0
  22. data/src/main/java/org/embulk/output/hdfs/transaction/DeleteRecursiveInAdvanceTx.java +22 -0
  23. data/src/main/java/org/embulk/output/hdfs/transaction/OverwriteTx.java +11 -0
  24. data/src/main/java/org/embulk/output/hdfs/transaction/ReplaceTx.java +62 -0
  25. data/src/main/java/org/embulk/output/hdfs/transaction/Tx.java +13 -0
  26. data/src/main/java/org/embulk/output/hdfs/util/SafeWorkspaceName.java +21 -0
  27. data/src/main/java/org/embulk/output/hdfs/util/SamplePath.java +21 -0
  28. data/src/main/java/org/embulk/output/hdfs/util/StrftimeUtil.java +23 -0
  29. data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java +153 -22
  30. metadata +87 -70
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 90bae20ab751bea6d3807b44b252062d137a710d
-   data.tar.gz: 482d46a137ba2fad65988bd98ec88613a884e590
+   metadata.gz: dd16ecf369d77f2a3dab20ebbb305dfbfca1a2a9
+   data.tar.gz: e89f1b6e282e461abc6116c42dfe393abff4032e
  SHA512:
-   metadata.gz: b23e3d09a38d4dd493e965bd3229e87ff89d43828ce48cd71c2b0ae996575d6a8a6fd9e404b6b075c3bb7719961d45af856dc6a21e778516d3783e06b4c92cd9
-   data.tar.gz: c5ad1e1f16d5c632a5352c25dc3e065945a40fcdac135aeb8cc8150332d4812cc262c2869fe9308337b246ffdf0bf08448d1e8f47f27c08eb542be529491515b
+   metadata.gz: e8e335c24da1f14cd05492baaddb1c2de0c57d0cfa9a15eef17b85c288e1bb0f57ba91cf95f1006a612b16b624a57ac7d422a7853fe6ccf44d66caabc9984523
+   data.tar.gz: 0049d9041ee1796b7e6e1cbc8dd1f7e0e738509f67abceaf0e7220ada5c95d9b63820cd38a5d37c720ee1e303e6504d3d38ba2524e9f9412143c5329cdba3186
data/.travis.yml CHANGED
@@ -1,3 +1,4 @@
+ dist: precise
  language: java
  jdk:
    - openjdk7
@@ -6,4 +7,8 @@ jdk:
  script:
    - ./gradlew test
  after_success:
-   - ./gradlew jacocoTestReport coveralls
+   - ./gradlew jacocoTestReport coveralls
+ addons:
+   hosts:
+     - example.com
+   hostname: example.com
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ 0.3.0 (2017-12-03)
+ ==================
+ * Add: `mode` option.
+ * Add: `replace` behaviour.
+ * Deprecated: `delete_in_advance` option. Please use `mode` instead.
+ * Deprecated: `overwrite` option. Please use `mode` instead.
+ * Enhancement: Delete behaviour is now safer.
+ * Enhancement: Update to Embulk 0.8.38.
+
  0.2.4 (2016-04-27)
  ==================
  - Enhancement: Avoid to create 0 byte files
data/README.md CHANGED
@@ -14,19 +14,21 @@ A File Output Plugin for Embulk to write HDFS.
 
  ## Configuration
 
- - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
- - **config** overwrites configuration parameters (hash, default: `{}`)
- - **path_prefix** prefix of target files (string, required)
- - **file_ext** suffix of target files (string, required)
- - **sequence_format** format for sequence part of target files (string, default: `'.%03d.%02d'`)
- - **rewind_seconds** When you use Date format in path_prefix property(like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted by using the time which is Now minus this property. (int, default: `0`)
- - **overwrite** overwrite files when the same filenames already exists (boolean, default: `false`)
+ - **config_files**: list of paths to Hadoop's configuration files (array of strings, default: `[]`)
+ - **config**: overwrites configuration parameters (hash, default: `{}`)
+ - **path_prefix**: prefix of the target files (string, required)
+ - **file_ext**: suffix of the target files (string, required)
+ - **sequence_format**: format for the sequence part of the target file names (string, default: `'%03d.%02d.'`; see the naming sketch after this list)
+ - **rewind_seconds**: when you use a date format in `path_prefix` (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this many seconds (int, default: `0`)
+ - **doas**: username used to access HDFS (string, default: the executing user)
+ - **overwrite** *(deprecated: please use the `mode` option instead)*: overwrite files when the same filenames already exist (boolean, default: `false`)
    - *caution*: even if this property is `true`, idempotence is not guaranteed; to ensure idempotence, you still need to remove the output files before or after running.
- - **doas** username which access to Hdfs (string, default: executed user)
- - **delete_in_advance** delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
+ - **delete_in_advance** *(deprecated: please use the `mode` option instead)*: delete files and directories matching `path_prefix` in advance (enum, default: `NONE`)
    - `NONE`: do nothing
    - `FILE_ONLY`: delete files
    - `RECURSIVE`: delete files and directories
+ - **mode**: `"abort_if_exist"`, `"overwrite"`, `"delete_files_in_advance"`, `"delete_recursive_in_advance"`, or `"replace"`. See the Modes section below. (string, optional, default: `"abort_if_exist"`)
+   * In the future, the default mode will become `"replace"`.
 
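For illustration, the plugin composes each target file name as `path_prefix` + the formatted sequence part + `file_ext`, the same logic as `newPath()` in `HdfsFileOutput.java` later in this diff. A minimal sketch, assuming the 0.3.0 default `sequence_format`; all other values are illustrative:

```java
public class SequenceFormatExample
{
    public static void main(String[] args)
    {
        // Illustrative values; only the default sequence_format comes from the README above.
        String pathPrefix = "/tmp/embulk-output-hdfs_example/file_";
        String sequenceFormat = "%03d.%02d.";  // formatted with (taskIndex, fileIndex)
        String fileExt = "csv";
        int taskIndex = 1;  // Embulk task index
        int fileIndex = 0;  // per-task file counter, incremented on each nextFile()

        // Mirrors HdfsFileOutput#newPath(): pathPrefix + sequence + fileExt
        String path = pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + fileExt;
        System.out.println(path);  // -> /tmp/embulk-output-hdfs_example/file_001.00.csv
    }
}
```

Here `%03d` is the zero-padded task index and `%02d` is the per-task file index; the trailing dot in the new default separates the sequence from `file_ext`.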
  ## CAUTION
  If you use the `hadoop` user (the HDFS admin user) as `doas`, and `delete_in_advance` is `RECURSIVE`,
@@ -34,6 +36,33 @@ If you use the `hadoop` user (the HDFS admin user) as `doas`, and `delete_in_advance`
  this means `embulk-output-hdfs` can destroy your HDFS.
  So, please be careful when you use the `delete_in_advance` and `doas` options ...
 
+ ## About DELETE
+
+ When this plugin deletes files or directories, it uses the [Hadoop Trash API](https://hadoop.apache.org/docs/r2.8.0/api/org/apache/hadoop/fs/Trash.html), so you can find them in the trash during `fs.trash.interval` (see the sketch after the Modes list below).
+
+ ## Modes
+
+ * **abort_if_exist**:
+   * Behavior: writes rows to the target files in order; if a target file already exists, the transaction is aborted.
+   * Transactional: no. On failure, the target files may contain partially written rows.
+   * Resumable: no.
+ * **overwrite**:
+   * Behavior: writes rows to the target files in order; if a target file already exists, it is rewritten from the beginning.
+   * Transactional: no. On failure, the target files may contain partially written rows.
+   * Resumable: no.
+ * **delete_files_in_advance**:
+   * Behavior: deletes matching files first, then writes rows to the target files in order.
+   * Transactional: no. On failure, the target files may have been removed.
+   * Resumable: no.
+ * **delete_recursive_in_advance**:
+   * Behavior: deletes matching directories recursively first, then writes rows to the target files in order.
+   * Transactional: no. On failure, the target files may have been removed.
+   * Resumable: no.
+ * **replace**:
+   * Behavior: writes rows to workspace files in order, then moves them into the target directory. This replace is not atomic, because the HDFS API does not provide an atomic replace.
+   * Transactional: no. On failure, the target files may have been removed.
+   * Resumable: no.
+
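For reference, trash-backed deletion through this API looks roughly as follows. This is a minimal, self-contained sketch against a local filesystem; the path and settings are illustrative, not the plugin's actual code:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

public class TrashDeleteExample
{
    public static void main(String[] args) throws Exception
    {
        // Illustrative settings: a local filesystem stands in for HDFS.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("fs.trash.interval", "3600");  // trash retention, in minutes

        FileSystem fs = FileSystem.get(conf);
        Path target = new Path("/tmp/embulk-output-hdfs_example/file_001.00.csv");
        fs.createNewFile(target);  // ensure there is something to delete

        // Moves the path into the user's trash directory instead of deleting it
        // outright; it stays recoverable until fs.trash.interval expires.
        boolean moved = Trash.moveToAppropriateTrash(fs, target, conf);
        System.out.println("moved to trash: " + moved);
    }
}
```

The plugin's delete modes rely on the same API, which is why the example configs in this diff now set `fs.trash.interval`.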
 
  ## Example
 
  ```yaml
data/build.gradle CHANGED
@@ -15,20 +15,20 @@ configurations {
      provided
  }
 
- version = "0.2.4"
+ version = "0.3.0"
 
  sourceCompatibility = 1.7
  targetCompatibility = 1.7
 
  dependencies {
-     compile  "org.embulk:embulk-core:0.8.8"
-     provided "org.embulk:embulk-core:0.8.8"
+     compile  "org.embulk:embulk-core:0.8.38"
+     provided "org.embulk:embulk-core:0.8.38"
      // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
      compile 'org.apache.hadoop:hadoop-client:2.6.0'
      compile 'com.google.guava:guava:15.0'
      testCompile "junit:junit:4.+"
-     testCompile "org.embulk:embulk-core:0.8.8:tests"
-     testCompile "org.embulk:embulk-standards:0.8.8"
+     testCompile "org.embulk:embulk-core:0.8.38:tests"
+     testCompile "org.embulk:embulk-standards:0.8.38"
  }
 
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -72,9 +72,11 @@ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
      script "pkg/${project.name}-${project.version}.gem"
  }
 
- task "package"(dependsOn: ["gemspec", "classpath"]) << {
-     println "> Build succeeded."
-     println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+ task "package"(dependsOn: ["gemspec", "classpath"]) {
+     doLast {
+         println "> Build succeeded."
+         println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+     }
  }
 
  task gemspec {
data/example/config.yml CHANGED
@@ -6,12 +6,14 @@ hdfs_example: &hdfs_example
      fs.defaultFS: 'hdfs://hadoop-nn1:8020'
      fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
      fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+     fs.trash.interval: 3600
 
  local_fs_example: &local_fs_example
    config:
      fs.defaultFS: 'file:///'
      fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
      fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     fs.trash.interval: 3600
      io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
  in:
@@ -38,7 +40,7 @@ out:
    <<: *local_fs_example
    path_prefix: /tmp/embulk-output-hdfs_example/file_
    file_ext: csv
-   delete_in_advance: FILE_ONLY
+   mode: replace
    formatter:
      type: csv
      newline: CRLF
data/example/config_deprecated_option.yml ADDED
@@ -0,0 +1,52 @@
+ hdfs_example: &hdfs_example
+   config_files:
+     - /etc/hadoop/conf/core-site.xml
+     - /etc/hadoop/conf/hdfs-site.xml
+   config:
+     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+ local_fs_example: &local_fs_example
+   config:
+     fs.defaultFS: 'file:///'
+     fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+ in:
+   type: file
+   path_prefix: example/data
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     header_line: true
+     stop_on_invalid_record: true
+     columns:
+       - {name: id, type: long}
+       - {name: account, type: long}
+       - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+       - {name: purchase, type: timestamp, format: '%Y%m%d'}
+       - {name: comment, type: string}
+
+
+ out:
+   type: hdfs
+   <<: *local_fs_example
+   path_prefix: /tmp/embulk-output-hdfs_example/file_
+   file_ext: csv
+   delete_in_advance: FILE_ONLY
+   formatter:
+     type: csv
+     newline: CRLF
+     newline_in_field: LF
+     header_line: true
+     charset: UTF-8
+     quote_policy: NONE
+     quote: '"'
+     escape: '\'
+     null_string: ''
+     default_timezone: UTC
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,5 @@
- #Wed Jan 13 12:41:02 JST 2016
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
+ distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew CHANGED
@@ -1,4 +1,4 @@
- #!/usr/bin/env bash
+ #!/usr/bin/env sh
 
  ##############################################################################
  ##
@@ -6,20 +6,38 @@
  ##
  ##############################################################################
 
- # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- DEFAULT_JVM_OPTS=""
+ # Attempt to set APP_HOME
+ # Resolve links: $0 may be a link
+ PRG="$0"
+ # Need this for relative symlinks.
+ while [ -h "$PRG" ] ; do
+     ls=`ls -ld "$PRG"`
+     link=`expr "$ls" : '.*-> \(.*\)$'`
+     if expr "$link" : '/.*' > /dev/null; then
+         PRG="$link"
+     else
+         PRG=`dirname "$PRG"`"/$link"
+     fi
+ done
+ SAVED="`pwd`"
+ cd "`dirname \"$PRG\"`/" >/dev/null
+ APP_HOME="`pwd -P`"
+ cd "$SAVED" >/dev/null
 
  APP_NAME="Gradle"
  APP_BASE_NAME=`basename "$0"`
 
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+ DEFAULT_JVM_OPTS=""
+
  # Use the maximum available, or set MAX_FD != -1 to use that value.
  MAX_FD="maximum"
 
- warn ( ) {
+ warn () {
      echo "$*"
  }
 
- die ( ) {
+ die () {
      echo
      echo "$*"
      echo
@@ -30,6 +48,7 @@ die ( ) {
  cygwin=false
  msys=false
  darwin=false
+ nonstop=false
  case "`uname`" in
    CYGWIN* )
      cygwin=true
@@ -40,31 +59,11 @@ case "`uname`" in
    MINGW* )
      msys=true
      ;;
+   NONSTOP* )
+     nonstop=true
+     ;;
  esac
 
- # For Cygwin, ensure paths are in UNIX format before anything is touched.
- if $cygwin ; then
-     [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
- fi
-
- # Attempt to set APP_HOME
- # Resolve links: $0 may be a link
- PRG="$0"
- # Need this for relative symlinks.
- while [ -h "$PRG" ] ; do
-     ls=`ls -ld "$PRG"`
-     link=`expr "$ls" : '.*-> \(.*\)$'`
-     if expr "$link" : '/.*' > /dev/null; then
-         PRG="$link"
-     else
-         PRG=`dirname "$PRG"`"/$link"
-     fi
- done
- SAVED="`pwd`"
- cd "`dirname \"$PRG\"`/" >&-
- APP_HOME="`pwd -P`"
- cd "$SAVED" >&-
-
  CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 
  # Determine the Java command to use to start the JVM.
@@ -90,7 +89,7 @@ location of your Java installation."
  fi
 
  # Increase the maximum file descriptors if we can.
- if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+ if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
      MAX_FD_LIMIT=`ulimit -H -n`
      if [ $? -eq 0 ] ; then
          if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
@@ -114,6 +113,7 @@ fi
  if $cygwin ; then
      APP_HOME=`cygpath --path --mixed "$APP_HOME"`
      CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+     JAVACMD=`cygpath --unix "$JAVACMD"`
 
      # We build the pattern for arguments to be converted via cygpath
      ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -154,11 +154,19 @@ if $cygwin ; then
      esac
  fi
 
- # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
- function splitJvmOpts() {
-     JVM_OPTS=("$@")
+ # Escape application args
+ save () {
+     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+     echo " "
  }
- eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
- JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+ APP_ARGS=$(save "$@")
+
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+ # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
+     cd "$(dirname "$0")"
+ fi
 
- exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
+ exec "$JAVACMD" "$@"
data/gradlew.bat CHANGED
@@ -8,14 +8,14 @@
  @rem Set local scope for the variables with windows NT shell
  if "%OS%"=="Windows_NT" setlocal
 
- @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- set DEFAULT_JVM_OPTS=
-
  set DIRNAME=%~dp0
  if "%DIRNAME%" == "" set DIRNAME=.
  set APP_BASE_NAME=%~n0
  set APP_HOME=%DIRNAME%
 
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+ set DEFAULT_JVM_OPTS=
+
  @rem Find java.exe
  if defined JAVA_HOME goto findJavaFromJavaHome
 
@@ -46,10 +46,9 @@ echo location of your Java installation.
  goto fail
 
  :init
- @rem Get command-line arguments, handling Windowz variants
+ @rem Get command-line arguments, handling Windows variants
 
  if not "%OS%" == "Windows_NT" goto win9xME_args
- if "%@eval[2+2]" == "4" goto 4NT_args
 
  :win9xME_args
  @rem Slurp the command line arguments.
@@ -60,11 +59,6 @@ set _SKIP=2
  if "x%~1" == "x" goto execute
 
  set CMD_LINE_ARGS=%*
- goto execute
-
- :4NT_args
- @rem Get arguments from the 4NT Shell from JP Software
- set CMD_LINE_ARGS=%$
 
  :execute
  @rem Setup the command line
data/settings.gradle ADDED
@@ -0,0 +1 @@
+ rootProject.name = 'embulk-output-hdfs'
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java ADDED
@@ -0,0 +1,160 @@
+ package org.embulk.output.hdfs;
+
+ import org.apache.hadoop.fs.Path;
+ import org.embulk.config.TaskReport;
+ import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
+ import org.embulk.output.hdfs.client.HdfsClient;
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.FileOutput;
+ import org.embulk.spi.TransactionalFileOutput;
+ import org.embulk.spi.util.RetryExecutor;
+ import org.slf4j.Logger;
+
+ import java.io.IOException;
+ import java.io.OutputStream;
+
+ public class HdfsFileOutput
+         implements FileOutput, TransactionalFileOutput
+ {
+     private static final Logger logger = Exec.getLogger(HdfsFileOutput.class);
+     private final RetryExecutor re = RetryExecutor.retryExecutor()
+             .withRetryLimit(3)
+             .withInitialRetryWait(500) // ms
+             .withMaxRetryWait(10 * 60 * 1000); // ms
+
+     private final HdfsClient hdfsClient;
+     private final int taskIdx;
+     private final String pathPrefix;
+     private final String sequenceFormat;
+     private final String fileExt;
+     private final boolean overwrite;
+
+     private int fileIdx = 0;
+     private Path currentPath = null;
+     private OutputStream o = null;
+
+     public HdfsFileOutput(PluginTask task, String pathPrefix, boolean overwrite, int taskIdx)
+     {
+         this.hdfsClient = HdfsClient.build(task);
+         this.pathPrefix = pathPrefix;
+         this.taskIdx = taskIdx;
+         this.sequenceFormat = task.getSequenceFormat();
+         this.fileExt = task.getFileExt();
+         this.overwrite = overwrite;
+     }
+
+     @Override
+     public void abort()
+     {
+     }
+
+     @Override
+     public TaskReport commit()
+     {
+         return Exec.newTaskReport();
+     }
+
+     @Override
+     public void nextFile()
+     {
+         closeCurrentStream();
+         currentPath = newPath();
+         fileIdx++;
+     }
+
+     @Override
+     public void add(Buffer buffer)
+     {
+         try {
+             // this implementation is for creating file when there is data.
+             if (o == null) {
+                 o = hdfsClient.create(currentPath, overwrite);
+                 logger.info("Uploading '{}'", currentPath);
+             }
+             write(buffer);
+         }
+         catch (RetryExecutor.RetryGiveupException e) {
+             throw new RuntimeException(e);
+         }
+         finally {
+             buffer.release();
+         }
+     }
+
+     @Override
+     public void finish()
+     {
+         closeCurrentStream();
+     }
+
+     @Override
+     public void close()
+     {
+         closeCurrentStream();
+         hdfsClient.close();
+     }
+
+     private void write(final Buffer buffer)
+             throws RetryExecutor.RetryGiveupException
+     {
+         re.run(new RetryExecutor.Retryable<Void>()
+         {
+             @Override
+             public Void call()
+                     throws Exception
+             {
+                 o.write(buffer.array(), buffer.offset(), buffer.limit());
+                 return null;
+             }
+
+             @Override
+             public boolean isRetryableException(Exception exception)
+             {
+                 return true; // TODO: which Exception is retryable?
+             }
+
+             @Override
+             public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
+                     throws RetryExecutor.RetryGiveupException
+             {
+                 String m = String.format(
+                         "%s. (Retry: Count: %d, Limit: %d, Wait: %d ms)",
+                         exception.getMessage(),
+                         retryCount,
+                         retryLimit,
+                         retryWait);
+                 logger.warn(m, exception);
+             }
+
+             @Override
+             public void onGiveup(Exception firstException, Exception lastException)
+                     throws RetryExecutor.RetryGiveupException
+             {
+             }
+         });
+     }
+
+     private Path newPath()
+     {
+         return new Path(pathPrefix + getSequence() + fileExt);
+     }
+
+     private String getSequence()
+     {
+         return String.format(sequenceFormat, taskIdx, fileIdx);
+     }
+
+     private void closeCurrentStream()
+     {
+         if (o != null) {
+             try {
+                 o.close();
+                 o = null;
+             }
+             catch (IOException e) {
+                 throw new RuntimeException(e);
+             }
+         }
+     }
+ }
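As a standalone illustration of the retry wrapper above, the same `RetryExecutor` builder can be exercised outside the plugin. A minimal sketch; the flaky operation and all its values are invented for demonstration:

```java
import org.embulk.spi.util.RetryExecutor;

public class RetryExample
{
    public static void main(String[] args)
            throws RetryExecutor.RetryGiveupException
    {
        // Same builder calls as HdfsFileOutput above: up to 3 retries,
        // starting at 500 ms and capped at 10 minutes between attempts.
        RetryExecutor re = RetryExecutor.retryExecutor()
                .withRetryLimit(3)
                .withInitialRetryWait(500)
                .withMaxRetryWait(10 * 60 * 1000);

        String result = re.run(new RetryExecutor.Retryable<String>()
        {
            private int attempts = 0;

            @Override
            public String call() throws Exception
            {
                // Illustrative flaky operation: fails twice, then succeeds.
                if (++attempts < 3) {
                    throw new java.io.IOException("transient failure " + attempts);
                }
                return "succeeded on attempt " + attempts;
            }

            @Override
            public boolean isRetryableException(Exception exception)
            {
                return exception instanceof java.io.IOException;
            }

            @Override
            public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
            {
                System.out.println("retrying after: " + exception.getMessage());
            }

            @Override
            public void onGiveup(Exception firstException, Exception lastException)
            {
            }
        });

        System.out.println(result);
    }
}
```

Note that the writer above treats every exception as retryable (see its TODO); this sketch narrows to `IOException` purely for demonstration.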