embulk-output-hdfs 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -1
  3. data/CHANGELOG.md +9 -0
  4. data/README.md +38 -9
  5. data/build.gradle +10 -8
  6. data/example/config.yml +3 -1
  7. data/example/config_deprecated_option.yml +52 -0
  8. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  9. data/gradle/wrapper/gradle-wrapper.properties +1 -2
  10. data/gradlew +43 -35
  11. data/gradlew.bat +4 -10
  12. data/settings.gradle +1 -0
  13. data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java +160 -0
  14. data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java +55 -175
  15. data/src/main/java/org/embulk/output/hdfs/ModeTask.java +111 -0
  16. data/src/main/java/org/embulk/output/hdfs/client/HdfsClient.java +269 -0
  17. data/src/main/java/org/embulk/output/hdfs/compat/ModeCompat.java +76 -0
  18. data/src/main/java/org/embulk/output/hdfs/transaction/AbortIfExistTx.java +6 -0
  19. data/src/main/java/org/embulk/output/hdfs/transaction/AbstractTx.java +53 -0
  20. data/src/main/java/org/embulk/output/hdfs/transaction/ControlRun.java +10 -0
  21. data/src/main/java/org/embulk/output/hdfs/transaction/DeleteFilesInAdvanceTx.java +22 -0
  22. data/src/main/java/org/embulk/output/hdfs/transaction/DeleteRecursiveInAdvanceTx.java +22 -0
  23. data/src/main/java/org/embulk/output/hdfs/transaction/OverwriteTx.java +11 -0
  24. data/src/main/java/org/embulk/output/hdfs/transaction/ReplaceTx.java +62 -0
  25. data/src/main/java/org/embulk/output/hdfs/transaction/Tx.java +13 -0
  26. data/src/main/java/org/embulk/output/hdfs/util/SafeWorkspaceName.java +21 -0
  27. data/src/main/java/org/embulk/output/hdfs/util/SamplePath.java +21 -0
  28. data/src/main/java/org/embulk/output/hdfs/util/StrftimeUtil.java +23 -0
  29. data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java +153 -22
  30. metadata +87 -70
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 90bae20ab751bea6d3807b44b252062d137a710d
- data.tar.gz: 482d46a137ba2fad65988bd98ec88613a884e590
+ metadata.gz: dd16ecf369d77f2a3dab20ebbb305dfbfca1a2a9
+ data.tar.gz: e89f1b6e282e461abc6116c42dfe393abff4032e
  SHA512:
- metadata.gz: b23e3d09a38d4dd493e965bd3229e87ff89d43828ce48cd71c2b0ae996575d6a8a6fd9e404b6b075c3bb7719961d45af856dc6a21e778516d3783e06b4c92cd9
- data.tar.gz: c5ad1e1f16d5c632a5352c25dc3e065945a40fcdac135aeb8cc8150332d4812cc262c2869fe9308337b246ffdf0bf08448d1e8f47f27c08eb542be529491515b
+ metadata.gz: e8e335c24da1f14cd05492baaddb1c2de0c57d0cfa9a15eef17b85c288e1bb0f57ba91cf95f1006a612b16b624a57ac7d422a7853fe6ccf44d66caabc9984523
+ data.tar.gz: 0049d9041ee1796b7e6e1cbc8dd1f7e0e738509f67abceaf0e7220ada5c95d9b63820cd38a5d37c720ee1e303e6504d3d38ba2524e9f9412143c5329cdba3186
data/.travis.yml CHANGED
@@ -1,3 +1,4 @@
+ dist: precise
  language: java
  jdk:
  - openjdk7
@@ -6,4 +7,8 @@ jdk:
  script:
  - ./gradlew test
  after_success:
- - ./gradlew jacocoTestReport coveralls
+ - ./gradlew jacocoTestReport coveralls
+ addons:
+   hosts:
+     - example.com
+   hostname: example.com
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ 0.3.0 (2017-12-03)
+ ==================
+ * Add: `mode` option.
+ * Add: `replace` behaviour.
+ * Deprecated: `delete_in_advance` option. Please use `mode` instead.
+ * Deprecated: `overwrite` option. Please use `mode` instead.
+ * Enhancement: Deletion behaviour becomes safer.
+ * Enhancement: Update to Embulk 0.8.38.
+
  0.2.4 (2016-04-27)
  ==================
  - Enhancement: Avoid to create 0 byte files
data/README.md CHANGED
@@ -14,19 +14,21 @@ A File Output Plugin for Embulk to write HDFS.
 
  ## Configuration
 
- - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
- - **config** overwrites configuration parameters (hash, default: `{}`)
- - **path_prefix** prefix of target files (string, required)
- - **file_ext** suffix of target files (string, required)
- - **sequence_format** format for sequence part of target files (string, default: `'.%03d.%02d'`)
- - **rewind_seconds** When you use Date format in path_prefix property(like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted by using the time which is Now minus this property. (int, default: `0`)
- - **overwrite** overwrite files when the same filenames already exists (boolean, default: `false`)
+ - **config_files**: list of paths to Hadoop's configuration files (array of strings, default: `[]`)
+ - **config**: overwrites configuration parameters (hash, default: `{}`)
+ - **path_prefix**: prefix of target files (string, required)
+ - **file_ext**: suffix of target files (string, required)
+ - **sequence_format**: format for the sequence part of target files (string, default: `'%03d.%02d.'`)
+ - **rewind_seconds**: when you use a date format in the path_prefix property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this value. (int, default: `0`)
+ - **doas**: username which accesses HDFS (string, default: the executing user)
+ - **overwrite** *(Deprecated: please use the `mode` option instead)*: overwrite files when the same filenames already exist (boolean, default: `false`)
    - *caution*: even if this property is `true`, this does not mean ensuring the idempotence. if you want to ensure the idempotence, you need the procedures to remove output files after or before running.
- - **doas** username which access to Hdfs (string, default: executed user)
- - **delete_in_advance** delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
+ - **delete_in_advance** *(Deprecated: please use the `mode` option instead)*: delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
    - `NONE`: do nothing
    - `FILE_ONLY`: delete files
    - `RECURSIVE`: delete files and directories
+ - **mode**: "abort_if_exist", "overwrite", "delete_files_in_advance", "delete_recursive_in_advance", or "replace". See below. (string, optional, default: `"abort_if_exist"`)
+   * In the future, the default mode will become `"replace"`.
 
  ## CAUTION
  If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance` is `RECURSIVE`,
@@ -34,6 +36,33 @@ If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance`
  this means `embulk-output-hdfs` can destroy your hdfs.
  So, please be careful when you use `delete_in_advance` option and `doas` option ...
 
+ ## About DELETE
+
+ When this plugin deletes files or directories, it uses the [Hadoop Trash API](https://hadoop.apache.org/docs/r2.8.0/api/org/apache/hadoop/fs/Trash.html), so you can find them in the trash until `fs.trash.interval` expires.
+
+ ## Modes
+
+ * **abort_if_exist**:
+   * Behavior: This mode writes rows to the target files in order. If the target files already exist, the transaction is aborted.
+   * Transactional: No. If it fails, the target files may already contain some rows.
+   * Resumable: No.
+ * **overwrite**:
+   * Behavior: This mode writes rows to the target files in order. If the target files already exist, it re-writes them from the beginning.
+   * Transactional: No. If it fails, the target files may already contain some rows.
+   * Resumable: No.
+ * **delete_files_in_advance**:
+   * Behavior: This mode deletes files first, then writes rows to the target files in order.
+   * Transactional: No. If it fails, the target files may have been removed.
+   * Resumable: No.
+ * **delete_recursive_in_advance**:
+   * Behavior: This mode deletes directories recursively first, then writes rows to the target files in order.
+   * Transactional: No. If it fails, the target files may have been removed.
+   * Resumable: No.
+ * **replace**:
+   * Behavior: This mode writes rows to workspace files in order, then moves them to the target directories. This **replace** is not **atomic**, because the HDFS API does not provide an atomic replace.
+   * Transactional: No. If it fails, the target files may have been removed.
+   * Resumable: No.
+
  ## Example
 
  ```yaml
data/build.gradle CHANGED
@@ -15,20 +15,20 @@ configurations {
      provided
  }
 
- version = "0.2.4"
+ version = "0.3.0"
 
  sourceCompatibility = 1.7
  targetCompatibility = 1.7
 
  dependencies {
-     compile "org.embulk:embulk-core:0.8.8"
-     provided "org.embulk:embulk-core:0.8.8"
+     compile "org.embulk:embulk-core:0.8.38"
+     provided "org.embulk:embulk-core:0.8.38"
      // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
      compile 'org.apache.hadoop:hadoop-client:2.6.0'
      compile 'com.google.guava:guava:15.0'
      testCompile "junit:junit:4.+"
-     testCompile "org.embulk:embulk-core:0.8.8:tests"
-     testCompile "org.embulk:embulk-standards:0.8.8"
+     testCompile "org.embulk:embulk-core:0.8.38:tests"
+     testCompile "org.embulk:embulk-standards:0.8.38"
  }
 
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -72,9 +72,11 @@ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
      script "pkg/${project.name}-${project.version}.gem"
  }
 
- task "package"(dependsOn: ["gemspec", "classpath"]) << {
-     println "> Build succeeded."
-     println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+ task "package"(dependsOn: ["gemspec", "classpath"]) {
+     doLast {
+         println "> Build succeeded."
+         println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+     }
  }
 
  task gemspec {
data/example/config.yml CHANGED
@@ -6,12 +6,14 @@ hdfs_example: &hdfs_example
      fs.defaultFS: 'hdfs://hadoop-nn1:8020'
      fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
      fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+     fs.trash.interval: 3600
 
  local_fs_example: &local_fs_example
    config:
      fs.defaultFS: 'file:///'
      fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
      fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     fs.trash.interval: 3600
      io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
  in:
@@ -38,7 +40,7 @@ out:
    <<: *local_fs_example
    path_prefix: /tmp/embulk-output-hdfs_example/file_
    file_ext: csv
-   delete_in_advance: FILE_ONLY
+   mode: replace
    formatter:
      type: csv
      newline: CRLF
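
The README's "About DELETE" section above says deletions go through the Hadoop Trash API, which is why the example config now sets `fs.trash.interval`. As a rough, editorial illustration of that API (not code from this package; the plugin's own delete logic lives in `HdfsClient.java` and the transaction classes listed above, which are not shown in this diff), moving a path into the trash looks roughly like the sketch below. The namenode URI and interval are taken from the example config, and the target path is made up:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

public class TrashSketch
{
    public static void main(String[] args) throws Exception
    {
        // Assumptions for illustration only: namenode URI and trash interval
        // come from the example config in this diff.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop-nn1:8020");
        conf.set("fs.trash.interval", "3600"); // minutes before the trash checkpoint is deleted

        FileSystem fs = FileSystem.get(conf);
        Path target = new Path("/tmp/embulk-output-hdfs_example/file_001.00.csv"); // hypothetical path

        // Moves the path into the current user's .Trash instead of deleting it outright,
        // so it stays recoverable while fs.trash.interval has not elapsed.
        boolean moved = Trash.moveToAppropriateTrash(fs, target, conf);
        System.out.println("moved to trash: " + moved);
    }
}
```

Anything moved this way sits under the owning user's `.Trash` directory and remains recoverable until the trash checkpoint expires.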
data/example/config_deprecated_option.yml ADDED
@@ -0,0 +1,52 @@
+ hdfs_example: &hdfs_example
+   config_files:
+     - /etc/hadoop/conf/core-site.xml
+     - /etc/hadoop/conf/hdfs-site.xml
+   config:
+     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+ local_fs_example: &local_fs_example
+   config:
+     fs.defaultFS: 'file:///'
+     fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+     io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+ in:
+   type: file
+   path_prefix: example/data
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     header_line: true
+     stop_on_invalid_record: true
+     columns:
+       - {name: id, type: long}
+       - {name: account, type: long}
+       - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+       - {name: purchase, type: timestamp, format: '%Y%m%d'}
+       - {name: comment, type: string}
+
+
+ out:
+   type: hdfs
+   <<: *local_fs_example
+   path_prefix: /tmp/embulk-output-hdfs_example/file_
+   file_ext: csv
+   delete_in_advance: FILE_ONLY
+   formatter:
+     type: csv
+     newline: CRLF
+     newline_in_field: LF
+     header_line: true
+     charset: UTF-8
+     quote_policy: NONE
+     quote: '"'
+     escape: '\'
+     null_string: ''
+     default_timezone: UTC
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,5 @@
- #Wed Jan 13 12:41:02 JST 2016
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
+ distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew CHANGED
@@ -1,4 +1,4 @@
- #!/usr/bin/env bash
+ #!/usr/bin/env sh
 
  ##############################################################################
  ##
@@ -6,20 +6,38 @@
  ##
  ##############################################################################
 
- # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- DEFAULT_JVM_OPTS=""
+ # Attempt to set APP_HOME
+ # Resolve links: $0 may be a link
+ PRG="$0"
+ # Need this for relative symlinks.
+ while [ -h "$PRG" ] ; do
+     ls=`ls -ld "$PRG"`
+     link=`expr "$ls" : '.*-> \(.*\)$'`
+     if expr "$link" : '/.*' > /dev/null; then
+         PRG="$link"
+     else
+         PRG=`dirname "$PRG"`"/$link"
+     fi
+ done
+ SAVED="`pwd`"
+ cd "`dirname \"$PRG\"`/" >/dev/null
+ APP_HOME="`pwd -P`"
+ cd "$SAVED" >/dev/null
 
  APP_NAME="Gradle"
  APP_BASE_NAME=`basename "$0"`
 
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+ DEFAULT_JVM_OPTS=""
+
  # Use the maximum available, or set MAX_FD != -1 to use that value.
  MAX_FD="maximum"
 
- warn ( ) {
+ warn () {
      echo "$*"
  }
 
- die ( ) {
+ die () {
      echo
      echo "$*"
      echo
@@ -30,6 +48,7 @@ die ( ) {
  cygwin=false
  msys=false
  darwin=false
+ nonstop=false
  case "`uname`" in
      CYGWIN* )
          cygwin=true
@@ -40,31 +59,11 @@ case "`uname`" in
      MINGW* )
          msys=true
          ;;
+     NONSTOP* )
+         nonstop=true
+         ;;
  esac
 
- # For Cygwin, ensure paths are in UNIX format before anything is touched.
- if $cygwin ; then
-     [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
- fi
-
- # Attempt to set APP_HOME
- # Resolve links: $0 may be a link
- PRG="$0"
- # Need this for relative symlinks.
- while [ -h "$PRG" ] ; do
-     ls=`ls -ld "$PRG"`
-     link=`expr "$ls" : '.*-> \(.*\)$'`
-     if expr "$link" : '/.*' > /dev/null; then
-         PRG="$link"
-     else
-         PRG=`dirname "$PRG"`"/$link"
-     fi
- done
- SAVED="`pwd`"
- cd "`dirname \"$PRG\"`/" >&-
- APP_HOME="`pwd -P`"
- cd "$SAVED" >&-
-
  CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 
  # Determine the Java command to use to start the JVM.
@@ -90,7 +89,7 @@ location of your Java installation."
  fi
 
  # Increase the maximum file descriptors if we can.
- if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+ if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
      MAX_FD_LIMIT=`ulimit -H -n`
      if [ $? -eq 0 ] ; then
          if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
@@ -114,6 +113,7 @@ fi
  if $cygwin ; then
      APP_HOME=`cygpath --path --mixed "$APP_HOME"`
      CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+     JAVACMD=`cygpath --unix "$JAVACMD"`
 
      # We build the pattern for arguments to be converted via cygpath
      ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -154,11 +154,19 @@ if $cygwin ; then
      esac
  fi
 
- # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
- function splitJvmOpts() {
-     JVM_OPTS=("$@")
+ # Escape application args
+ save () {
+     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+     echo " "
  }
- eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
- JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+ APP_ARGS=$(save "$@")
+
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+ # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
+     cd "$(dirname "$0")"
+ fi
 
- exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
+ exec "$JAVACMD" "$@"
data/gradlew.bat CHANGED
@@ -8,14 +8,14 @@
  @rem Set local scope for the variables with windows NT shell
  if "%OS%"=="Windows_NT" setlocal
 
- @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
- set DEFAULT_JVM_OPTS=
-
  set DIRNAME=%~dp0
  if "%DIRNAME%" == "" set DIRNAME=.
  set APP_BASE_NAME=%~n0
  set APP_HOME=%DIRNAME%
 
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+ set DEFAULT_JVM_OPTS=
+
  @rem Find java.exe
  if defined JAVA_HOME goto findJavaFromJavaHome
 
@@ -46,10 +46,9 @@ echo location of your Java installation.
  goto fail
 
  :init
- @rem Get command-line arguments, handling Windowz variants
+ @rem Get command-line arguments, handling Windows variants
 
  if not "%OS%" == "Windows_NT" goto win9xME_args
- if "%@eval[2+2]" == "4" goto 4NT_args
 
  :win9xME_args
  @rem Slurp the command line arguments.
@@ -60,11 +59,6 @@ set _SKIP=2
  if "x%~1" == "x" goto execute
 
  set CMD_LINE_ARGS=%*
- goto execute
-
- :4NT_args
- @rem Get arguments from the 4NT Shell from JP Software
- set CMD_LINE_ARGS=%$
 
  :execute
  @rem Setup the command line
data/settings.gradle ADDED
@@ -0,0 +1 @@
+ rootProject.name = 'embulk-output-hdfs'
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java ADDED
@@ -0,0 +1,160 @@
+ package org.embulk.output.hdfs;
+
+ import org.apache.hadoop.fs.Path;
+ import org.embulk.config.TaskReport;
+ import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
+ import org.embulk.output.hdfs.client.HdfsClient;
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.FileOutput;
+ import org.embulk.spi.TransactionalFileOutput;
+ import org.embulk.spi.util.RetryExecutor;
+ import org.slf4j.Logger;
+
+ import java.io.IOException;
+ import java.io.OutputStream;
+
+ public class HdfsFileOutput
+         implements FileOutput, TransactionalFileOutput
+ {
+     private static final Logger logger = Exec.getLogger(HdfsFileOutput.class);
+     private final RetryExecutor re = RetryExecutor.retryExecutor()
+             .withRetryLimit(3)
+             .withMaxRetryWait(500) // ms
+             .withMaxRetryWait(10 * 60 * 1000); // ms
+
+     private final HdfsClient hdfsClient;
+     private final int taskIdx;
+     private final String pathPrefix;
+     private final String sequenceFormat;
+     private final String fileExt;
+     private final boolean overwrite;
+
+     private int fileIdx = 0;
+     private Path currentPath = null;
+     private OutputStream o = null;
+
+     public HdfsFileOutput(PluginTask task, String pathPrefix, boolean overwrite, int taskIdx)
+     {
+         this.hdfsClient = HdfsClient.build(task);
+         this.pathPrefix = pathPrefix;
+         this.taskIdx = taskIdx;
+         this.sequenceFormat = task.getSequenceFormat();
+         this.fileExt = task.getFileExt();
+         this.overwrite = overwrite;
+     }
+
+     @Override
+     public void abort()
+     {
+     }
+
+     @Override
+     public TaskReport commit()
+     {
+         return Exec.newTaskReport();
+     }
+
+     @Override
+     public void nextFile()
+     {
+         closeCurrentStream();
+         currentPath = newPath();
+         fileIdx++;
+     }
+
+     @Override
+     public void add(Buffer buffer)
+     {
+         try {
+             // this implementation is for creating file when there is data.
+             if (o == null) {
+                 o = hdfsClient.create(currentPath, overwrite);
+                 logger.info("Uploading '{}'", currentPath);
+             }
+             write(buffer);
+         }
+         catch (RetryExecutor.RetryGiveupException e) {
+             throw new RuntimeException(e);
+         }
+         finally {
+             buffer.release();
+         }
+     }
+
+     @Override
+     public void finish()
+     {
+         closeCurrentStream();
+     }
+
+     @Override
+     public void close()
+     {
+         closeCurrentStream();
+         hdfsClient.close();
+     }
+
+     private void write(final Buffer buffer)
+             throws RetryExecutor.RetryGiveupException
+     {
+         re.run(new RetryExecutor.Retryable<Void>()
+         {
+             @Override
+             public Void call()
+                     throws Exception
+             {
+                 o.write(buffer.array(), buffer.offset(), buffer.limit());
+                 return null;
+             }
+
+             @Override
+             public boolean isRetryableException(Exception exception)
+             {
+                 return true; // TODO: which Exception is retryable?
+             }
+
+             @Override
+             public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
+                     throws RetryExecutor.RetryGiveupException
+             {
+                 String m = String.format(
+                         "%s. (Retry: Count: %d, Limit: %d, Wait: %d ms)",
+                         exception.getMessage(),
+                         retryCount,
+                         retryLimit,
+                         retryWait);
+                 logger.warn(m, exception);
+             }
+
+             @Override
+             public void onGiveup(Exception firstException, Exception lastException)
+                     throws RetryExecutor.RetryGiveupException
+             {
+             }
+         });
+     }
+
+     private Path newPath()
+     {
+         return new Path(pathPrefix + getSequence() + fileExt);
+     }
+
+     private String getSequence()
+     {
+         return String.format(sequenceFormat, taskIdx, fileIdx);
+     }
+
+     private void closeCurrentStream()
+     {
+         if (o != null) {
+             try {
+                 o.close();
+                 o = null;
+             }
+             catch (IOException e) {
+                 throw new RuntimeException(e);
+             }
+         }
+     }
+ }
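
As a side note on the `sequence_format` option documented in the README above, the following is a minimal editorial sketch (not part of the package) of what `newPath()` and `getSequence()` in the listing above produce with the README's default format; the prefix, indexes, and extension are made-up values:

```java
public class SequenceFormatSketch
{
    public static void main(String[] args)
    {
        // All concrete values here are hypothetical; sequenceFormat is the README default.
        String pathPrefix = "/tmp/embulk-output-hdfs_example/file_"; // path_prefix (after strftime/rewind_seconds resolution)
        String sequenceFormat = "%03d.%02d.";                        // default sequence_format
        String fileExt = "csv";                                      // file_ext
        int taskIdx = 1;                                             // Embulk task index
        int fileIdx = 0;                                             // per-task file counter, incremented by nextFile()

        // Mirrors HdfsFileOutput.newPath(): pathPrefix + getSequence() + fileExt
        String path = pathPrefix + String.format(sequenceFormat, taskIdx, fileIdx) + fileExt;
        System.out.println(path); // prints /tmp/embulk-output-hdfs_example/file_001.00.csv
    }
}
```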