embulk-output-hdfs 0.2.4 → 0.3.0
- checksums.yaml +4 -4
- data/.travis.yml +6 -1
- data/CHANGELOG.md +9 -0
- data/README.md +38 -9
- data/build.gradle +10 -8
- data/example/config.yml +3 -1
- data/example/config_deprecated_option.yml +52 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -2
- data/gradlew +43 -35
- data/gradlew.bat +4 -10
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java +160 -0
- data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java +55 -175
- data/src/main/java/org/embulk/output/hdfs/ModeTask.java +111 -0
- data/src/main/java/org/embulk/output/hdfs/client/HdfsClient.java +269 -0
- data/src/main/java/org/embulk/output/hdfs/compat/ModeCompat.java +76 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/AbortIfExistTx.java +6 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/AbstractTx.java +53 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/ControlRun.java +10 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/DeleteFilesInAdvanceTx.java +22 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/DeleteRecursiveInAdvanceTx.java +22 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/OverwriteTx.java +11 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/ReplaceTx.java +62 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/Tx.java +13 -0
- data/src/main/java/org/embulk/output/hdfs/util/SafeWorkspaceName.java +21 -0
- data/src/main/java/org/embulk/output/hdfs/util/SamplePath.java +21 -0
- data/src/main/java/org/embulk/output/hdfs/util/StrftimeUtil.java +23 -0
- data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java +153 -22
- metadata +87 -70
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dd16ecf369d77f2a3dab20ebbb305dfbfca1a2a9
+  data.tar.gz: e89f1b6e282e461abc6116c42dfe393abff4032e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e8e335c24da1f14cd05492baaddb1c2de0c57d0cfa9a15eef17b85c288e1bb0f57ba91cf95f1006a612b16b624a57ac7d422a7853fe6ccf44d66caabc9984523
+  data.tar.gz: 0049d9041ee1796b7e6e1cbc8dd1f7e0e738509f67abceaf0e7220ada5c95d9b63820cd38a5d37c720ee1e303e6504d3d38ba2524e9f9412143c5329cdba3186
data/.travis.yml CHANGED
@@ -1,3 +1,4 @@
+dist: precise
 language: java
 jdk:
 - openjdk7
@@ -6,4 +7,8 @@ jdk:
 script:
 - ./gradlew test
 after_success:
-- ./gradlew jacocoTestReport coveralls
+- ./gradlew jacocoTestReport coveralls
+addons:
+  hosts:
+    - example.com
+  hostname: example.com
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+0.3.0 (2017-12-03)
+==================
+* Add: `mode` option.
+* Add: `replace` behaviour.
+* Deprecated: `delete_in_advance` option. Please use `mode` instead.
+* Deprecated: `overwrite` option. Please use `mode` instead.
+* Enhancement: Make the delete behaviour safer.
+* Enhancement: Update to Embulk 0.8.38.
+
 0.2.4 (2016-04-27)
 ==================
 - Enhancement: Avoid creating 0 byte files
data/README.md CHANGED
@@ -14,19 +14,21 @@ A File Output Plugin for Embulk to write HDFS.
 
 ## Configuration
 
-- **config_files
-- **config
-- **path_prefix
-- **file_ext
-- **sequence_format
-- **rewind_seconds
-- **
+- **config_files**: list of paths to Hadoop's configuration files (array of strings, default: `[]`)
+- **config**: overwrites configuration parameters (hash, default: `{}`)
+- **path_prefix**: prefix of target files (string, required)
+- **file_ext**: suffix of target files (string, required)
+- **sequence_format**: format of the sequence part of target file names (string, default: `'%03d.%02d.'`)
+- **rewind_seconds**: when you use a date format in the `path_prefix` property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this number of seconds (int, default: `0`)
+- **doas**: username used to access HDFS (string, default: the executing user)
+- **overwrite** *(deprecated: please use the `mode` option instead)*: overwrite files when the same file names already exist (boolean, default: `false`)
   - *caution*: even if this property is `true`, it does not guarantee idempotence. If you need idempotent runs, you must remove the output files before or after each run.
-- **
-- **delete_in_advance** delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
+- **delete_in_advance** *(deprecated: please use the `mode` option instead)*: delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
   - `NONE`: do nothing
   - `FILE_ONLY`: delete files
   - `RECURSIVE`: delete files and directories
+- **mode**: `"abort_if_exist"`, `"overwrite"`, `"delete_files_in_advance"`, `"delete_recursive_in_advance"`, or `"replace"`; see the Modes section below (string, optional, default: `"abort_if_exist"`)
+  - In the future, the default mode will become `"replace"`.
 
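The `rewind_seconds` option above is easiest to see with a small sketch. The class below hand-translates only the `%Y-%m-%d` token and is illustrative, not the plugin's code; the release itself ships `util/StrftimeUtil.java` for full strftime expansion:

```java
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;

public class RewindExample
{
    // Expand a date-formatted path_prefix using "now minus rewind_seconds".
    static String resolve(String pathPrefix, long rewindSeconds)
    {
        Instant t = Instant.now().minusSeconds(rewindSeconds);
        DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
        return pathPrefix.replace("%Y-%m-%d", f.format(t));
    }

    public static void main(String[] args)
    {
        // With rewind_seconds: 86400, a run just after midnight still targets
        // the previous day's directory.
        System.out.println(resolve("/tmp/embulk/%Y-%m-%d/out", 86400));
    }
}
```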
 ## CAUTION
 If you use the `hadoop` user (the hdfs admin user) as `doas`, and `delete_in_advance` is `RECURSIVE`,
@@ -34,6 +36,33 @@ If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance`
 this means `embulk-output-hdfs` can destroy your hdfs.
 So, please be careful when you use the `delete_in_advance` and `doas` options ...
 
+## About DELETE
+
+When this plugin deletes files or directories, it uses the [Hadoop Trash API](https://hadoop.apache.org/docs/r2.8.0/api/org/apache/hadoop/fs/Trash.html), so you can find them in the trash during `fs.trash.interval`.
+
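A trash-backed delete can be sketched in a few lines. This is not the plugin's actual `HdfsClient` code; the path is illustrative and a configured trash interval is assumed:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

public class TrashDeleteExample
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.trash.interval", "60"); // trash retention in minutes; 0 disables the trash
        FileSystem fs = FileSystem.get(conf);

        // Instead of fs.delete(), move the path into the user's .Trash directory
        // so it can still be recovered during the retention interval.
        Path target = new Path("/tmp/embulk-output-hdfs_example/file_001.00.csv");
        boolean moved = Trash.moveToAppropriateTrash(fs, target, conf);
        System.out.println(moved ? "moved to trash" : "trash disabled; nothing moved");
    }
}
```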
+## Modes
+
+* **abort_if_exist**:
+  * Behavior: writes rows to the target files in order; if a target file already exists, the transaction is aborted.
+  * Transactional: No. If the run fails, the target files may have some rows already written.
+  * Resumable: No.
+* **overwrite**:
+  * Behavior: writes rows to the target files in order; if a target file already exists, it is rewritten from the beginning.
+  * Transactional: No. If the run fails, the target files may have some rows already written.
+  * Resumable: No.
+* **delete_files_in_advance**:
+  * Behavior: deletes matching files first, then writes rows to the target files in order.
+  * Transactional: No. If the run fails, the target files may already have been removed.
+  * Resumable: No.
+* **delete_recursive_in_advance**:
+  * Behavior: deletes matching directories recursively first, then writes rows to the target files in order.
+  * Transactional: No. If the run fails, the target files may already have been removed.
+  * Resumable: No.
+* **replace**:
+  * Behavior: writes rows to workspace files in order, then moves them into the target directories. This **replace** is not **atomic**, because the HDFS API has no atomic replace.
+  * Transactional: No. If the run fails, the target files may already have been removed.
+  * Resumable: No.
+
 
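The non-atomicity of `replace` is easiest to see as a workspace-then-rename flow. The sketch below is illustrative, not the actual `transaction/ReplaceTx.java` logic; the point is that the swap spans several filesystem calls, so a crash in between can leave the target missing:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReplaceSketch
{
    public static void main(String[] args) throws Exception
    {
        FileSystem fs = FileSystem.get(new Configuration());
        Path workspace = new Path("/tmp/embulk-output-hdfs.workspace"); // hypothetical temp dir
        Path target = new Path("/tmp/embulk-output-hdfs_example");

        // 1. all output files are first written under the workspace ...
        fs.mkdirs(workspace);

        // 2. ... then the finished workspace is swapped into place. A crash
        // between these calls loses the target: there is no atomic replace.
        Path previous = new Path(target + ".old");
        if (fs.exists(target)) {
            fs.rename(target, previous);
        }
        fs.rename(workspace, target);
        fs.delete(previous, true); // drop the previous generation
    }
}
```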
 ## Example
 
 ```yaml
data/build.gradle CHANGED
@@ -15,20 +15,20 @@ configurations {
     provided
 }
 
-version = "0.
+version = "0.3.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 
 dependencies {
-    compile "org.embulk:embulk-core:0.8.
-    provided "org.embulk:embulk-core:0.8.
+    compile "org.embulk:embulk-core:0.8.38"
+    provided "org.embulk:embulk-core:0.8.38"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
     compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
-    testCompile "org.embulk:embulk-core:0.8.
-    testCompile "org.embulk:embulk-standards:0.8.
+    testCompile "org.embulk:embulk-core:0.8.38:tests"
+    testCompile "org.embulk:embulk-standards:0.8.38"
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -72,9 +72,11 @@ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
     script "pkg/${project.name}-${project.version}.gem"
 }
 
-task "package"(dependsOn: ["gemspec", "classpath"])
-
-
+task "package"(dependsOn: ["gemspec", "classpath"]) {
+    doLast {
+        println "> Build succeeded."
+        println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+    }
 }
 
 task gemspec {
data/example/config.yml CHANGED
@@ -6,12 +6,14 @@ hdfs_example: &hdfs_example
     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    fs.trash.interval: 3600
 
 local_fs_example: &local_fs_example
   config:
     fs.defaultFS: 'file:///'
     fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    fs.trash.interval: 3600
     io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
@@ -38,7 +40,7 @@ out:
   <<: *local_fs_example
   path_prefix: /tmp/embulk-output-hdfs_example/file_
   file_ext: csv
-
+  mode: replace
   formatter:
     type: csv
     newline: CRLF
data/example/config_deprecated_option.yml ADDED
@@ -0,0 +1,52 @@
+hdfs_example: &hdfs_example
+  config_files:
+    - /etc/hadoop/conf/core-site.xml
+    - /etc/hadoop/conf/hdfs-site.xml
+  config:
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+    fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+local_fs_example: &local_fs_example
+  config:
+    fs.defaultFS: 'file:///'
+    fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+in:
+  type: file
+  path_prefix: example/data
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    header_line: true
+    stop_on_invalid_record: true
+    columns:
+      - {name: id, type: long}
+      - {name: account, type: long}
+      - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+      - {name: purchase, type: timestamp, format: '%Y%m%d'}
+      - {name: comment, type: string}
+
+
+out:
+  type: hdfs
+  <<: *local_fs_example
+  path_prefix: /tmp/embulk-output-hdfs_example/file_
+  file_ext: csv
+  delete_in_advance: FILE_ONLY
+  formatter:
+    type: csv
+    newline: CRLF
+    newline_in_field: LF
+    header_line: true
+    charset: UTF-8
+    quote_policy: NONE
+    quote: '"'
+    escape: '\'
+    null_string: ''
+    default_timezone: UTC
data/gradle/wrapper/gradle-wrapper.jar CHANGED
Binary file

data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,5 @@
-#Wed Jan 13 12:41:02 JST 2016
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew CHANGED
@@ -1,4 +1,4 @@
-#!/usr/bin/env
+#!/usr/bin/env sh
 
 ##############################################################################
 ##
@@ -6,20 +6,38 @@
 ##
 ##############################################################################
 
-#
-
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
 
 APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
 
-warn (
+warn () {
     echo "$*"
 }
 
-die (
+die () {
     echo
     echo "$*"
     echo
@@ -30,6 +48,7 @@ die ( ) {
 cygwin=false
 msys=false
 darwin=false
+nonstop=false
 case "`uname`" in
   CYGWIN* )
     cygwin=true
@@ -40,31 +59,11 @@ case "`uname`" in
   MINGW* )
     msys=true
     ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
 esac
 
-# For Cygwin, ensure paths are in UNIX format before anything is touched.
-if $cygwin ; then
-    [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
-fi
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
-    ls=`ls -ld "$PRG"`
-    link=`expr "$ls" : '.*-> \(.*\)$'`
-    if expr "$link" : '/.*' > /dev/null; then
-        PRG="$link"
-    else
-        PRG=`dirname "$PRG"`"/$link"
-    fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >&-
-APP_HOME="`pwd -P`"
-cd "$SAVED" >&-
-
 CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 
 # Determine the Java command to use to start the JVM.
@@ -90,7 +89,7 @@ location of your Java installation."
 fi
 
 # Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
@@ -114,6 +113,7 @@ fi
 if $cygwin ; then
     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
 
     # We build the pattern for arguments to be converted via cygpath
     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -154,11 +154,19 @@ if $cygwin ; then
     esac
 fi
 
-#
-
-
+# Escape application args
+save () {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
 }
-
-
+APP_ARGS=$(save "$@")
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
+if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
+  cd "$(dirname "$0")"
+fi
 
-exec "$JAVACMD" "
+exec "$JAVACMD" "$@"
data/gradlew.bat CHANGED
@@ -8,14 +8,14 @@
 @rem Set local scope for the variables with windows NT shell
 if "%OS%"=="Windows_NT" setlocal
 
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
 set DIRNAME=%~dp0
 if "%DIRNAME%" == "" set DIRNAME=.
 set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
 
@@ -46,10 +46,9 @@ echo location of your Java installation.
 goto fail
 
 :init
-@rem Get command-line arguments, handling
+@rem Get command-line arguments, handling Windows variants
 
 if not "%OS%" == "Windows_NT" goto win9xME_args
-if "%@eval[2+2]" == "4" goto 4NT_args
 
 :win9xME_args
 @rem Slurp the command line arguments.
@@ -60,11 +59,6 @@ set _SKIP=2
 if "x%~1" == "x" goto execute
 
 set CMD_LINE_ARGS=%*
-goto execute
-
-:4NT_args
-@rem Get arguments from the 4NT Shell from JP Software
-set CMD_LINE_ARGS=%$
 
 :execute
 @rem Setup the command line
data/settings.gradle ADDED
@@ -0,0 +1 @@
+rootProject.name = 'embulk-output-hdfs'
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java ADDED
@@ -0,0 +1,160 @@
+package org.embulk.output.hdfs;
+
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
+import org.embulk.output.hdfs.client.HdfsClient;
+import org.embulk.spi.Buffer;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileOutput;
+import org.embulk.spi.TransactionalFileOutput;
+import org.embulk.spi.util.RetryExecutor;
+import org.slf4j.Logger;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class HdfsFileOutput
+        implements FileOutput, TransactionalFileOutput
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileOutput.class);
+    private final RetryExecutor re = RetryExecutor.retryExecutor()
+            .withRetryLimit(3)
+            .withInitialRetryWait(500) // ms
+            .withMaxRetryWait(10 * 60 * 1000); // ms
+
+    private final HdfsClient hdfsClient;
+    private final int taskIdx;
+    private final String pathPrefix;
+    private final String sequenceFormat;
+    private final String fileExt;
+    private final boolean overwrite;
+
+    private int fileIdx = 0;
+    private Path currentPath = null;
+    private OutputStream o = null;
+
+    public HdfsFileOutput(PluginTask task, String pathPrefix, boolean overwrite, int taskIdx)
+    {
+        this.hdfsClient = HdfsClient.build(task);
+        this.pathPrefix = pathPrefix;
+        this.taskIdx = taskIdx;
+        this.sequenceFormat = task.getSequenceFormat();
+        this.fileExt = task.getFileExt();
+        this.overwrite = overwrite;
+    }
+
+    @Override
+    public void abort()
+    {
+    }
+
+    @Override
+    public TaskReport commit()
+    {
+        return Exec.newTaskReport();
+    }
+
+    @Override
+    public void nextFile()
+    {
+        closeCurrentStream();
+        currentPath = newPath();
+        fileIdx++;
+    }
+
+    @Override
+    public void add(Buffer buffer)
+    {
+        try {
+            // create the file lazily, so that no file appears unless there is data.
+            if (o == null) {
+                o = hdfsClient.create(currentPath, overwrite);
+                logger.info("Uploading '{}'", currentPath);
+            }
+            write(buffer);
+        }
+        catch (RetryExecutor.RetryGiveupException e) {
+            throw new RuntimeException(e);
+        }
+        finally {
+            buffer.release();
+        }
+    }
+
+    @Override
+    public void finish()
+    {
+        closeCurrentStream();
+    }
+
+    @Override
+    public void close()
+    {
+        closeCurrentStream();
+        hdfsClient.close();
+    }
+
+    private void write(final Buffer buffer)
+            throws RetryExecutor.RetryGiveupException
+    {
+        re.run(new RetryExecutor.Retryable<Void>()
+        {
+            @Override
+            public Void call()
+                    throws Exception
+            {
+                o.write(buffer.array(), buffer.offset(), buffer.limit());
+                return null;
+            }
+
+            @Override
+            public boolean isRetryableException(Exception exception)
+            {
+                return true; // TODO: which Exception is retryable?
+            }
+
+            @Override
+            public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
+                    throws RetryExecutor.RetryGiveupException
+            {
+                String m = String.format(
+                        "%s. (Retry: Count: %d, Limit: %d, Wait: %d ms)",
+                        exception.getMessage(),
+                        retryCount,
+                        retryLimit,
+                        retryWait);
+                logger.warn(m, exception);
+            }
+
+            @Override
+            public void onGiveup(Exception firstException, Exception lastException)
+                    throws RetryExecutor.RetryGiveupException
+            {
+            }
+        });
+    }
+
+    private Path newPath()
+    {
+        return new Path(pathPrefix + getSequence() + fileExt);
+    }
+
+    private String getSequence()
+    {
+        return String.format(sequenceFormat, taskIdx, fileIdx);
+    }
+
+    private void closeCurrentStream()
+    {
+        if (o != null) {
+            try {
+                o.close();
+                o = null;
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}