embulk-output-hdfs 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +6 -1
- data/CHANGELOG.md +9 -0
- data/README.md +38 -9
- data/build.gradle +10 -8
- data/example/config.yml +3 -1
- data/example/config_deprecated_option.yml +52 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +1 -2
- data/gradlew +43 -35
- data/gradlew.bat +4 -10
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java +160 -0
- data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java +55 -175
- data/src/main/java/org/embulk/output/hdfs/ModeTask.java +111 -0
- data/src/main/java/org/embulk/output/hdfs/client/HdfsClient.java +269 -0
- data/src/main/java/org/embulk/output/hdfs/compat/ModeCompat.java +76 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/AbortIfExistTx.java +6 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/AbstractTx.java +53 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/ControlRun.java +10 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/DeleteFilesInAdvanceTx.java +22 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/DeleteRecursiveInAdvanceTx.java +22 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/OverwriteTx.java +11 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/ReplaceTx.java +62 -0
- data/src/main/java/org/embulk/output/hdfs/transaction/Tx.java +13 -0
- data/src/main/java/org/embulk/output/hdfs/util/SafeWorkspaceName.java +21 -0
- data/src/main/java/org/embulk/output/hdfs/util/SamplePath.java +21 -0
- data/src/main/java/org/embulk/output/hdfs/util/StrftimeUtil.java +23 -0
- data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java +153 -22
- metadata +87 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dd16ecf369d77f2a3dab20ebbb305dfbfca1a2a9
+  data.tar.gz: e89f1b6e282e461abc6116c42dfe393abff4032e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e8e335c24da1f14cd05492baaddb1c2de0c57d0cfa9a15eef17b85c288e1bb0f57ba91cf95f1006a612b16b624a57ac7d422a7853fe6ccf44d66caabc9984523
+  data.tar.gz: 0049d9041ee1796b7e6e1cbc8dd1f7e0e738509f67abceaf0e7220ada5c95d9b63820cd38a5d37c720ee1e303e6504d3d38ba2524e9f9412143c5329cdba3186
data/.travis.yml
CHANGED
@@ -1,3 +1,4 @@
+dist: precise
 language: java
 jdk:
   - openjdk7
@@ -6,4 +7,8 @@ jdk:
 script:
   - ./gradlew test
 after_success:
-  - ./gradlew jacocoTestReport coveralls
+  - ./gradlew jacocoTestReport coveralls
+addons:
+  hosts:
+    - example.com
+  hostname: example.com
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+0.3.0 (2017-12-03)
+==================
+* Add: `mode` option.
+* Add: `replace` behaviour.
+* Deprecated: `delete_in_advance` option. Please use `mode` instead.
+* Deprecated: `overwrite` option. Please use `mode` instead.
+* Enhancement: Make the delete behaviour safer.
+* Enhancement: Update to Embulk 0.8.38.
+
 0.2.4 (2016-04-27)
 ==================
 - Enhancement: Avoid creating 0 byte files
data/README.md
CHANGED
@@ -14,19 +14,21 @@ A File Output Plugin for Embulk to write HDFS.
 
 ## Configuration
 
-- **config_files
-- **config
-- **path_prefix
-- **file_ext
-- **sequence_format
-- **rewind_seconds
-- **
+- **config_files**: list of paths to Hadoop's configuration files (array of strings, default: `[]`)
+- **config**: overwrites configuration parameters (hash, default: `{}`)
+- **path_prefix**: prefix of target files (string, required)
+- **file_ext**: suffix of target files (string, required)
+- **sequence_format**: format for the sequence part of target file names (string, default: `'%03d.%02d.'`)
+- **rewind_seconds**: when you use a date format in the `path_prefix` property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this number of seconds (int, default: `0`)
+- **doas**: user name used to access HDFS (string, default: the executing user)
+- **overwrite** *(deprecated: please use the `mode` option instead)*: overwrite files when the same file names already exist (boolean, default: `false`)
   - *caution*: even if this property is `true`, idempotence is not guaranteed; if you need idempotence, remove the output files before or after each run.
-- **
-- **delete_in_advance** delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
+- **delete_in_advance** *(deprecated: please use the `mode` option instead)*: delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
   - `NONE`: do nothing
   - `FILE_ONLY`: delete files
   - `RECURSIVE`: delete files and directories
+- **mode**: `"abort_if_exist"`, `"overwrite"`, `"delete_files_in_advance"`, `"delete_recursive_in_advance"`, or `"replace"`; see below (string, optional, default: `"abort_if_exist"`)
+  * In the future, the default mode will become `"replace"`.
 
 ## CAUTION
 If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance` is `RECURSIVE`,
@@ -34,6 +36,33 @@ If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance` is `RECURSIVE`,
 this means `embulk-output-hdfs` can destroy your hdfs.
 So, please be careful when you use the `delete_in_advance` and `doas` options.
 
+## About DELETE
+
+When this plugin deletes files or directories, it uses the [Hadoop Trash API](https://hadoop.apache.org/docs/r2.8.0/api/org/apache/hadoop/fs/Trash.html), so you can find them in the trash during `fs.trash.interval`.
+
+## Modes
+
+* **abort_if_exist**:
+  * Behavior: writes rows to the target files in order; if a target file already exists, aborts the transaction.
+  * Transactional: no. On failure, the target files may contain some written rows.
+  * Resumable: no.
+* **overwrite**:
+  * Behavior: writes rows to the target files in order; if a target file already exists, it is rewritten from the beginning.
+  * Transactional: no. On failure, the target files may contain some written rows.
+  * Resumable: no.
+* **delete_files_in_advance**:
+  * Behavior: deletes files first, then writes rows to the target files in order.
+  * Transactional: no. On failure, the target files may have been removed.
+  * Resumable: no.
+* **delete_recursive_in_advance**:
+  * Behavior: deletes directories recursively first, then writes rows to the target files in order.
+  * Transactional: no. On failure, the target files may have been removed.
+  * Resumable: no.
+* **replace**:
+  * Behavior: writes rows to workspace files in order, then moves them into the target directories. This replacement is **not atomic**, because the HDFS API has no atomic replace.
+  * Transactional: no. On failure, the target files may have been removed.
+  * Resumable: no.
+
 ## Example
 
 ```yaml
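For reference, here is a minimal `out` section sketch using the new `mode` option, assembled from the options documented in the README above and the bundled example configs (cluster paths are illustrative):

```yaml
out:
  type: hdfs
  config_files:                # Hadoop configuration files, per the README above
    - /etc/hadoop/conf/core-site.xml
    - /etc/hadoop/conf/hdfs-site.xml
  config:
    fs.trash.interval: 3600    # deletes go through the trash and stay recoverable
  path_prefix: /tmp/embulk-output-hdfs_example/file_
  file_ext: csv
  mode: replace                # instead of the deprecated overwrite / delete_in_advance
```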
data/build.gradle
CHANGED
@@ -15,20 +15,20 @@ configurations {
     provided
 }
 
-version = "0.
+version = "0.3.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 
 dependencies {
-    compile "org.embulk:embulk-core:0.8.
-    provided "org.embulk:embulk-core:0.8.
+    compile  "org.embulk:embulk-core:0.8.38"
+    provided "org.embulk:embulk-core:0.8.38"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
     compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
-    testCompile "org.embulk:embulk-core:0.8.
-    testCompile "org.embulk:embulk-standards:0.8.
+    testCompile "org.embulk:embulk-core:0.8.38:tests"
+    testCompile "org.embulk:embulk-standards:0.8.38"
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -72,9 +72,11 @@ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
     script "pkg/${project.name}-${project.version}.gem"
 }
 
-task "package"(dependsOn: ["gemspec", "classpath"])
-
-
+task "package"(dependsOn: ["gemspec", "classpath"]) {
+    doLast {
+        println "> Build succeeded."
+        println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+    }
 }
 
 task gemspec {
data/example/config.yml
CHANGED
@@ -6,12 +6,14 @@ hdfs_example: &hdfs_example
     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    fs.trash.interval: 3600
 
 local_fs_example: &local_fs_example
   config:
     fs.defaultFS: 'file:///'
     fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    fs.trash.interval: 3600
     io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
@@ -38,7 +40,7 @@ out:
   <<: *local_fs_example
   path_prefix: /tmp/embulk-output-hdfs_example/file_
   file_ext: csv
-
+  mode: replace
   formatter:
     type: csv
     newline: CRLF
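As background for the `fs.trash.interval` values added above: deletes in this plugin go through the Hadoop Trash API (see the README's "About DELETE" section). The following is a minimal, hypothetical Java sketch of that API, not the plugin's own code; the class name and target path are made up for illustration:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

// Hypothetical demo class; assumes a Hadoop 2.x client on the classpath
// and that the target path already exists.
public class TrashDeleteSketch
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");   // local filesystem, for demonstration
        conf.set("fs.trash.interval", "3600");  // keep trashed files for 3600 minutes

        FileSystem fs = FileSystem.get(conf);
        Path target = new Path("/tmp/embulk-output-hdfs_example/file_000.00.csv");

        // Moves the path into the user's trash directory instead of deleting it
        // outright; returns false when trash is disabled (interval 0).
        boolean moved = Trash.moveToAppropriateTrash(fs, target, conf);
        System.out.println(moved ? "Moved to trash: " + target : "Trash disabled, not moved");
    }
}
```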
data/example/config_deprecated_option.yml
ADDED
@@ -0,0 +1,52 @@
+hdfs_example: &hdfs_example
+  config_files:
+    - /etc/hadoop/conf/core-site.xml
+    - /etc/hadoop/conf/hdfs-site.xml
+  config:
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+    fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+local_fs_example: &local_fs_example
+  config:
+    fs.defaultFS: 'file:///'
+    fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+in:
+  type: file
+  path_prefix: example/data
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    header_line: true
+    stop_on_invalid_record: true
+    columns:
+      - {name: id, type: long}
+      - {name: account, type: long}
+      - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+      - {name: purchase, type: timestamp, format: '%Y%m%d'}
+      - {name: comment, type: string}
+
+
+out:
+  type: hdfs
+  <<: *local_fs_example
+  path_prefix: /tmp/embulk-output-hdfs_example/file_
+  file_ext: csv
+  delete_in_advance: FILE_ONLY
+  formatter:
+    type: csv
+    newline: CRLF
+    newline_in_field: LF
+    header_line: true
+    charset: UTF-8
+    quote_policy: NONE
+    quote: '"'
+    escape: '\'
+    null_string: ''
+    default_timezone: UTC
data/gradle/wrapper/gradle-wrapper.jar
CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,6 +1,5 @@
-#Wed Jan 13 12:41:02 JST 2016
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-bin.zip
data/gradlew
CHANGED
@@ -1,4 +1,4 @@
-#!/usr/bin/env
+#!/usr/bin/env sh
 
 ##############################################################################
 ##
@@ -6,20 +6,38 @@
 ##
 ##############################################################################
 
-#
-
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
 
 APP_NAME="Gradle"
 APP_BASE_NAME=`basename "$0"`
 
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD="maximum"
 
-warn (
+warn () {
     echo "$*"
 }
 
-die (
+die () {
     echo
     echo "$*"
     echo
@@ -30,6 +48,7 @@ die ( ) {
 cygwin=false
 msys=false
 darwin=false
+nonstop=false
 case "`uname`" in
   CYGWIN* )
     cygwin=true
@@ -40,31 +59,11 @@ case "`uname`" in
   MINGW* )
     msys=true
     ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
 esac
 
-# For Cygwin, ensure paths are in UNIX format before anything is touched.
-if $cygwin ; then
-    [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
-fi
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
-    ls=`ls -ld "$PRG"`
-    link=`expr "$ls" : '.*-> \(.*\)$'`
-    if expr "$link" : '/.*' > /dev/null; then
-        PRG="$link"
-    else
-        PRG=`dirname "$PRG"`"/$link"
-    fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >&-
-APP_HOME="`pwd -P`"
-cd "$SAVED" >&-
-
 CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 
 # Determine the Java command to use to start the JVM.
@@ -90,7 +89,7 @@ location of your Java installation."
 fi
 
 # Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
     MAX_FD_LIMIT=`ulimit -H -n`
     if [ $? -eq 0 ] ; then
         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
@@ -114,6 +113,7 @@ fi
 if $cygwin ; then
     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
 
     # We build the pattern for arguments to be converted via cygpath
     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -154,11 +154,19 @@ if $cygwin ; then
     esac
 fi
 
-#
-
-
+# Escape application args
+save () {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
 }
-
-
+APP_ARGS=$(save "$@")
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
+if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
+  cd "$(dirname "$0")"
+fi
 
-exec "$JAVACMD" "
+exec "$JAVACMD" "$@"
data/gradlew.bat
CHANGED
@@ -8,14 +8,14 @@
 @rem Set local scope for the variables with windows NT shell
 if "%OS%"=="Windows_NT" setlocal
 
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
 set DIRNAME=%~dp0
 if "%DIRNAME%" == "" set DIRNAME=.
 set APP_BASE_NAME=%~n0
 set APP_HOME=%DIRNAME%
 
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
 @rem Find java.exe
 if defined JAVA_HOME goto findJavaFromJavaHome
 
@@ -46,10 +46,9 @@ echo location of your Java installation.
 goto fail
 
 :init
-@rem Get command-line arguments, handling
+@rem Get command-line arguments, handling Windows variants
 
 if not "%OS%" == "Windows_NT" goto win9xME_args
-if "%@eval[2+2]" == "4" goto 4NT_args
 
 :win9xME_args
 @rem Slurp the command line arguments.
@@ -60,11 +59,6 @@ set _SKIP=2
 if "x%~1" == "x" goto execute
 
 set CMD_LINE_ARGS=%*
-goto execute
-
-:4NT_args
-@rem Get arguments from the 4NT Shell from JP Software
-set CMD_LINE_ARGS=%$
 
 :execute
 @rem Setup the command line
data/settings.gradle
ADDED
@@ -0,0 +1 @@
+rootProject.name = 'embulk-output-hdfs'
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java
ADDED
@@ -0,0 +1,160 @@
+package org.embulk.output.hdfs;
+
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
+import org.embulk.output.hdfs.client.HdfsClient;
+import org.embulk.spi.Buffer;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileOutput;
+import org.embulk.spi.TransactionalFileOutput;
+import org.embulk.spi.util.RetryExecutor;
+import org.slf4j.Logger;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class HdfsFileOutput
+        implements FileOutput, TransactionalFileOutput
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileOutput.class);
+    private final RetryExecutor re = RetryExecutor.retryExecutor()
+            .withRetryLimit(3)
+            .withMaxRetryWait(500) // ms
+            .withMaxRetryWait(10 * 60 * 1000); // ms
+
+    private final HdfsClient hdfsClient;
+    private final int taskIdx;
+    private final String pathPrefix;
+    private final String sequenceFormat;
+    private final String fileExt;
+    private final boolean overwrite;
+
+    private int fileIdx = 0;
+    private Path currentPath = null;
+    private OutputStream o = null;
+
+    public HdfsFileOutput(PluginTask task, String pathPrefix, boolean overwrite, int taskIdx)
+    {
+        this.hdfsClient = HdfsClient.build(task);
+        this.pathPrefix = pathPrefix;
+        this.taskIdx = taskIdx;
+        this.sequenceFormat = task.getSequenceFormat();
+        this.fileExt = task.getFileExt();
+        this.overwrite = overwrite;
+    }
+
+    @Override
+    public void abort()
+    {
+    }
+
+    @Override
+    public TaskReport commit()
+    {
+        return Exec.newTaskReport();
+    }
+
+    @Override
+    public void nextFile()
+    {
+        closeCurrentStream();
+        currentPath = newPath();
+        fileIdx++;
+    }
+
+    @Override
+    public void add(Buffer buffer)
+    {
+        try {
+            // this implementation is for creating file when there is data.
+            if (o == null) {
+                o = hdfsClient.create(currentPath, overwrite);
+                logger.info("Uploading '{}'", currentPath);
+            }
+            write(buffer);
+        }
+        catch (RetryExecutor.RetryGiveupException e) {
+            throw new RuntimeException(e);
+        }
+        finally {
+            buffer.release();
+        }
+    }
+
+    @Override
+    public void finish()
+    {
+        closeCurrentStream();
+    }
+
+    @Override
+    public void close()
+    {
+        closeCurrentStream();
+        hdfsClient.close();
+    }
+
+    private void write(final Buffer buffer)
+            throws RetryExecutor.RetryGiveupException
+    {
+        re.run(new RetryExecutor.Retryable<Void>()
+        {
+            @Override
+            public Void call()
+                    throws Exception
+            {
+                o.write(buffer.array(), buffer.offset(), buffer.limit());
+                return null;
+            }
+
+            @Override
+            public boolean isRetryableException(Exception exception)
+            {
+                return true; // TODO: which Exception is retryable?
+            }
+
+            @Override
+            public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
+                    throws RetryExecutor.RetryGiveupException
+            {
+                String m = String.format(
+                        "%s. (Retry: Count: %d, Limit: %d, Wait: %d ms)",
+                        exception.getMessage(),
+                        retryCount,
+                        retryLimit,
+                        retryWait);
+                logger.warn(m, exception);
+            }
+
+            @Override
+            public void onGiveup(Exception firstException, Exception lastException)
+                    throws RetryExecutor.RetryGiveupException
+            {
+            }
+        });
+    }
+
+    private Path newPath()
+    {
+        return new Path(pathPrefix + getSequence() + fileExt);
+    }
+
+    private String getSequence()
+    {
+        return String.format(sequenceFormat, taskIdx, fileIdx);
+    }
+
+    private void closeCurrentStream()
+    {
+        if (o != null) {
+            try {
+                o.close();
+                o = null;
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}
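To make the naming scheme above concrete, here is a small standalone sketch (not part of the plugin) of how `newPath()` and `getSequence()` compose an output path from `path_prefix`, `sequence_format`, and `file_ext`:

```java
// Standalone illustration of the path composition in HdfsFileOutput above.
public class SequenceFormatSketch
{
    public static void main(String[] args)
    {
        String pathPrefix = "/tmp/embulk-output-hdfs_example/file_";  // path_prefix option
        String sequenceFormat = "%03d.%02d.";                         // sequence_format default
        String fileExt = "csv";                                       // file_ext option

        int taskIdx = 1;  // Embulk task index, fixed per HdfsFileOutput instance
        int fileIdx = 0;  // per-task file counter, incremented by nextFile()

        // Mirrors getSequence() + newPath():
        String path = pathPrefix + String.format(sequenceFormat, taskIdx, fileIdx) + fileExt;
        System.out.println(path);  // -> /tmp/embulk-output-hdfs_example/file_001.00.csv
    }
}
```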