embulk-output-hdfs 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: bc90ac0579a94f207537ad5106b3c733158a007b
- data.tar.gz: 6cf1c80f825e7fdb6609c8a54d4db6546ec4414b
+ metadata.gz: f9fa40ed9c13dbc656239643f3153f160f66ef08
+ data.tar.gz: 61fb3a7a55c94873e58f7edc4ac5e6ae1cf337ae
  SHA512:
- metadata.gz: 27a3f83d4299c8c8ea92a1710730987bbc444911872ba8b78d6e3d863fc6d8ae94c27ff3b70f580006e74cb5c3c0bb59739aaa342665caa9175a433885144a54
- data.tar.gz: 6b5fe6fba56c2843072f8f6ede29d40964f4ec6b4c86a4b6318fad6ed1643e8d752ed46a107330a3ab798594e88cdbaa621e7f54acc74c439f7a8225c85ea321
+ metadata.gz: e131f8221baaa36c20fcd8ee77b88cac8dde80bc20e1663f1d5dae3d54ed88aae08862a5113f928916a2f5ddd19321bbf7270d8cc43109336ffede61e2adc99f
+ data.tar.gz: 0afec8392aeb2d109ebe9beb738d6b5e09f029b5cc04ab16b5b9bc83246a13f8c0a2fd86393da2d924cb46791d44a5f9d78d84ad5f06c60d061f67e61b260c64
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
+ language: java
+ jdk:
+ - openjdk7
+ - oraclejdk7
+ - oraclejdk8
+ script:
+ - ./gradlew test
+ after_success:
+ - ./gradlew jacocoTestReport coveralls
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ 0.2.3 (2016-04-20)
+ ==================
+ - Add: `delete_in_advance` option
+
  0.2.2 (2016-02-02)
  ==================
  - Add: doas option
data/README.md CHANGED
@@ -1,5 +1,8 @@
  # Hdfs file output plugin for Embulk

+ [![Build Status](https://travis-ci.org/civitaspo/embulk-output-hdfs.svg)](https://travis-ci.org/civitaspo/embulk-output-hdfs)
+ [![Coverage Status](https://coveralls.io/repos/github/civitaspo/embulk-output-hdfs/badge.svg?branch=master)](https://coveralls.io/github/civitaspo/embulk-output-hdfs?branch=master)
+
  A File Output Plugin for Embulk to write HDFS.

  ## Overview
@@ -19,6 +22,17 @@ A File Output Plugin for Embulk to write HDFS.
  - **rewind_seconds** When you use Date format in path_prefix property(like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted by using the time which is Now minus this property. (int, default: `0`)
  - **overwrite** overwrite files when the same filenames already exists (boolean, default: `false`)
  - *caution*: even if this property is `true`, this does not mean ensuring the idempotence. if you want to ensure the idempotence, you need the procedures to remove output files after or before running.
+ - **doas** username used to access HDFS (string, default: the executing user)
+ - **delete_in_advance** delete files and directories matching `path_prefix` in advance (enum, default: `NONE`)
+ - `NONE`: do nothing
+ - `FILE_ONLY`: delete matching files
+ - `RECURSIVE`: delete matching files and directories
+
+ ## CAUTION
+ If you use the `hadoop` user (the HDFS admin user) as `doas` and set `delete_in_advance` to `RECURSIVE`,
+ `embulk-output-hdfs` can delete any files and directories that `path_prefix` matches,
+ which means it can destroy the contents of your HDFS.
+ So please be careful when combining the `delete_in_advance` and `doas` options; a minimal configuration sketch follows.
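For reference, a minimal `out:` section that exercises the new 0.2.3 options might look like the sketch below. It is assembled from the bundled example/config.yml; `etl_user` is a hypothetical username and the namenode address is the one used in that example, so substitute your own values:

out:
  type: hdfs
  config:
    fs.defaultFS: 'hdfs://hadoop-nn1:8020'   # namenode address from the bundled example
  path_prefix: /tmp/embulk-output-hdfs_example/file_
  file_ext: csv
  doas: etl_user                 # hypothetical user; defaults to the user running Embulk
  delete_in_advance: FILE_ONLY   # NONE (default), FILE_ONLY, or RECURSIVE
  formatter:
    type: csv

`FILE_ONLY` removes only the files matching `path_prefix*` before the run, while `RECURSIVE` also removes matching directories, so `RECURSIVE` combined with an admin `doas` deserves extra care.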

  ## Example

data/build.gradle CHANGED
@@ -2,6 +2,9 @@ plugins {
  id "com.jfrog.bintray" version "1.1"
  id "com.github.jruby-gradle.base" version "0.1.5"
  id "java"
+ id "checkstyle"
+ id "com.github.kt3k.coveralls" version "2.4.0"
+ id "jacoco"
  }
  import com.github.jrubygradle.JRubyExec
  repositories {
@@ -12,18 +15,20 @@ configurations {
  provided
  }

- version = "0.2.2"
+ version = "0.2.3"

  sourceCompatibility = 1.7
  targetCompatibility = 1.7

  dependencies {
- compile "org.embulk:embulk-core:0.7.0"
- provided "org.embulk:embulk-core:0.7.0"
+ compile "org.embulk:embulk-core:0.8.8"
+ provided "org.embulk:embulk-core:0.8.8"
  // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
  compile 'org.apache.hadoop:hadoop-client:2.6.0'
  compile 'com.google.guava:guava:15.0'
  testCompile "junit:junit:4.+"
+ testCompile "org.embulk:embulk-core:0.8.8:tests"
+ testCompile "org.embulk:embulk-standards:0.8.8"
  }

  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -33,6 +38,29 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
  }
  clean { delete "classpath" }

+ jacocoTestReport {
+ reports {
+ xml.enabled = true // coveralls plugin depends on xml format report
+ html.enabled = true
+ }
+ }
+ checkstyle {
+ configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
+ toolVersion = '6.14.1'
+ }
+ checkstyleMain {
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+ ignoreFailures = true
+ }
+ checkstyleTest {
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+ ignoreFailures = true
+ }
+ task checkstyle(type: Checkstyle) {
+ classpath = sourceSets.main.output + sourceSets.test.output
+ source = sourceSets.main.allJava + sourceSets.test.allJava
+ }
+
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
  jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
  script "${project.name}.gemspec"
data/config/checkstyle/checkstyle.xml ADDED
@@ -0,0 +1,128 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE module PUBLIC
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+ <module name="Checker">
+ <!-- https://github.com/facebook/presto/blob/master/src/checkstyle/checks.xml -->
+ <module name="FileTabCharacter"/>
+ <module name="NewlineAtEndOfFile">
+ <property name="lineSeparator" value="lf"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\r"/>
+ <property name="message" value="Line contains carriage return"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value=" \n"/>
+ <property name="message" value="Line has trailing whitespace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\{\n\n"/>
+ <property name="message" value="Blank line after opening brace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\s*\}"/>
+ <property name="message" value="Blank line before closing brace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\n"/>
+ <property name="message" value="Multiple consecutive blank lines"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\Z"/>
+ <property name="message" value="Blank line before end of file"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="Preconditions\.checkNotNull"/>
+ <property name="message" value="Use of checkNotNull"/>
+ </module>
+
+ <module name="TreeWalker">
+ <module name="EmptyBlock">
+ <property name="option" value="text"/>
+ <property name="tokens" value="
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+ </module>
+ <module name="EmptyStatement"/>
+ <module name="EmptyForInitializerPad"/>
+ <module name="EmptyForIteratorPad">
+ <property name="option" value="space"/>
+ </module>
+ <module name="MethodParamPad">
+ <property name="allowLineBreaks" value="true"/>
+ <property name="option" value="nospace"/>
+ </module>
+ <module name="ParenPad"/>
+ <module name="TypecastParenPad"/>
+ <module name="NeedBraces"/>
+ <module name="LeftCurly">
+ <property name="option" value="nl"/>
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+ </module>
+ <module name="LeftCurly">
+ <property name="option" value="eol"/>
+ <property name="tokens" value="
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+ </module>
+ <module name="RightCurly">
+ <property name="option" value="alone"/>
+ </module>
+ <module name="GenericWhitespace"/>
+ <module name="WhitespaceAfter"/>
+ <module name="NoWhitespaceBefore"/>
+
+ <module name="UpperEll"/>
+ <module name="DefaultComesLast"/>
+ <module name="ArrayTypeStyle"/>
+ <module name="MultipleVariableDeclarations"/>
+ <module name="ModifierOrder"/>
+ <module name="OneStatementPerLine"/>
+ <module name="StringLiteralEquality"/>
+ <module name="MutableException"/>
+ <module name="EqualsHashCode"/>
+ <module name="InnerAssignment"/>
+ <module name="InterfaceIsType"/>
+ <module name="HideUtilityClassConstructor"/>
+
+ <module name="MemberName"/>
+ <module name="LocalVariableName"/>
+ <module name="LocalFinalVariableName"/>
+ <module name="TypeName"/>
+ <module name="PackageName"/>
+ <module name="ParameterName"/>
+ <module name="StaticVariableName"/>
+ <module name="ClassTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+ <module name="MethodTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+
+ <module name="AvoidStarImport"/>
+ <module name="RedundantImport"/>
+ <module name="UnusedImports"/>
+ <module name="ImportOrder">
+ <property name="groups" value="*,javax,java"/>
+ <property name="separated" value="true"/>
+ <property name="option" value="bottom"/>
+ <property name="sortStaticImportsAlphabetically" value="true"/>
+ </module>
+
+ <module name="WhitespaceAround">
+ <property name="allowEmptyConstructors" value="true"/>
+ <property name="allowEmptyMethods" value="true"/>
+ <property name="ignoreEnhancedForColon" value="false"/>
+ <property name="tokens" value="
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+ </module>
+ </module>
+ </module>
data/config/checkstyle/default.xml ADDED
@@ -0,0 +1,108 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE module PUBLIC
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+ <!--
+ This is a subset of ./checkstyle.xml which allows some loose styles
+ -->
+ <module name="Checker">
+ <module name="FileTabCharacter"/>
+ <module name="NewlineAtEndOfFile">
+ <property name="lineSeparator" value="lf"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\r"/>
+ <property name="message" value="Line contains carriage return"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value=" \n"/>
+ <property name="message" value="Line has trailing whitespace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\n"/>
+ <property name="message" value="Multiple consecutive blank lines"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\Z"/>
+ <property name="message" value="Blank line before end of file"/>
+ </module>
+
+ <module name="TreeWalker">
+ <module name="EmptyBlock">
+ <property name="option" value="text"/>
+ <property name="tokens" value="
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+ </module>
+ <module name="EmptyStatement"/>
+ <module name="EmptyForInitializerPad"/>
+ <module name="EmptyForIteratorPad">
+ <property name="option" value="space"/>
+ </module>
+ <module name="MethodParamPad">
+ <property name="allowLineBreaks" value="true"/>
+ <property name="option" value="nospace"/>
+ </module>
+ <module name="ParenPad"/>
+ <module name="TypecastParenPad"/>
+ <module name="NeedBraces"/>
+ <module name="LeftCurly">
+ <property name="option" value="nl"/>
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+ </module>
+ <module name="LeftCurly">
+ <property name="option" value="eol"/>
+ <property name="tokens" value="
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+ </module>
+ <module name="RightCurly">
+ <property name="option" value="alone"/>
+ </module>
+ <module name="GenericWhitespace"/>
+ <module name="WhitespaceAfter"/>
+ <module name="NoWhitespaceBefore"/>
+
+ <module name="UpperEll"/>
+ <module name="DefaultComesLast"/>
+ <module name="ArrayTypeStyle"/>
+ <module name="MultipleVariableDeclarations"/>
+ <module name="ModifierOrder"/>
+ <module name="OneStatementPerLine"/>
+ <module name="StringLiteralEquality"/>
+ <module name="MutableException"/>
+ <module name="EqualsHashCode"/>
+ <module name="InnerAssignment"/>
+ <module name="InterfaceIsType"/>
+ <module name="HideUtilityClassConstructor"/>
+
+ <module name="MemberName"/>
+ <module name="LocalVariableName"/>
+ <module name="LocalFinalVariableName"/>
+ <module name="TypeName"/>
+ <module name="PackageName"/>
+ <module name="ParameterName"/>
+ <module name="StaticVariableName"/>
+ <module name="ClassTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+ <module name="MethodTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+
+ <module name="WhitespaceAround">
+ <property name="allowEmptyConstructors" value="true"/>
+ <property name="allowEmptyMethods" value="true"/>
+ <property name="ignoreEnhancedForColon" value="false"/>
+ <property name="tokens" value="
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+ </module>
+ </module>
+ </module>
data/example/config.yml ADDED
@@ -0,0 +1,52 @@
+ hdfs_example: &hdfs_example
+ config_files:
+ - /etc/hadoop/conf/core-site.xml
+ - /etc/hadoop/conf/hdfs-site.xml
+ config:
+ fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+ fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+ fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+ local_fs_example: &local_fs_example
+ config:
+ fs.defaultFS: 'file:///'
+ fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+ fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+ io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+ in:
+ type: file
+ path_prefix: example/data
+ parser:
+ charset: UTF-8
+ newline: CRLF
+ type: csv
+ delimiter: ','
+ quote: '"'
+ header_line: true
+ stop_on_invalid_record: true
+ columns:
+ - {name: id, type: long}
+ - {name: account, type: long}
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+ - {name: purchase, type: timestamp, format: '%Y%m%d'}
+ - {name: comment, type: string}
+
+
+ out:
+ type: hdfs
+ <<: *local_fs_example
+ path_prefix: /tmp/embulk-output-hdfs_example/file_
+ file_ext: csv
+ delete_in_advance: FILE_ONLY
+ formatter:
+ type: csv
+ newline: CRLF
+ newline_in_field: LF
+ header_line: true
+ charset: UTF-8
+ quote_policy: NONE
+ quote: '"'
+ escape: '\'
+ null_string: ''
+ default_timezone: UTC
data/example/data.csv ADDED
@@ -0,0 +1,5 @@
+ id,account,time,purchase,comment
+ 1,32864,2015-01-27 19:23:49,20150127,embulk
+ 2,14824,2015-01-27 19:01:23,20150127,embulk jruby
+ 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin"
+ 4,11270,2015-01-29 11:54:36,20150129,NULL
data/gradle/wrapper/gradle-wrapper.jar CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,6 @@
- #Tue Aug 11 00:26:20 PDT 2015
+ #Wed Jan 13 12:41:02 JST 2016
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
data/src/main/java/org/embulk/output/hdfs/HdfsFileOutputPlugin.java CHANGED
@@ -3,11 +3,13 @@ package org.embulk.output.hdfs;
  import com.google.common.base.Optional;
  import com.google.common.base.Throwables;
  import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigDiff;
+ import org.embulk.config.ConfigException;
  import org.embulk.config.ConfigSource;
  import org.embulk.config.Task;
  import org.embulk.config.TaskReport;
@@ -27,6 +29,8 @@ import java.util.ArrayList;
  import java.util.List;
  import java.util.Map;

+ import static org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask.*;
+
  public class HdfsFileOutputPlugin
  implements FileOutputPlugin
  {
@@ -37,33 +41,38 @@ public class HdfsFileOutputPlugin
  {
  @Config("config_files")
  @ConfigDefault("[]")
- public List<String> getConfigFiles();
+ List<String> getConfigFiles();

  @Config("config")
  @ConfigDefault("{}")
- public Map<String, String> getConfig();
+ Map<String, String> getConfig();

  @Config("path_prefix")
- public String getPathPrefix();
+ String getPathPrefix();

  @Config("file_ext")
- public String getFileNameExtension();
+ String getFileExt();

  @Config("sequence_format")
  @ConfigDefault("\"%03d.%02d.\"")
- public String getSequenceFormat();
+ String getSequenceFormat();

  @Config("rewind_seconds")
  @ConfigDefault("0")
- public int getRewindSeconds();
+ int getRewindSeconds();

  @Config("overwrite")
  @ConfigDefault("false")
- public boolean getOverwrite();
+ boolean getOverwrite();

  @Config("doas")
  @ConfigDefault("null")
- public Optional<String> getDoas();
+ Optional<String> getDoas();
+
+ enum DeleteInAdvancePolicy{ NONE, FILE_ONLY, RECURSIVE}
+ @Config("delete_in_advance")
+ @ConfigDefault("\"NONE\"")
+ DeleteInAdvancePolicy getDeleteInAdvance();
  }

  @Override
@@ -72,6 +81,15 @@ public class HdfsFileOutputPlugin
  {
  PluginTask task = config.loadConfig(PluginTask.class);

+ try {
+ String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
+ FileSystem fs = getFs(task);
+ deleteInAdvance(fs, pathPrefix, task.getDeleteInAdvance());
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
  control.run(task.dump());
  return Exec.newConfigDiff();
  }
@@ -97,7 +115,7 @@ public class HdfsFileOutputPlugin
  final PluginTask task = taskSource.loadTask(PluginTask.class);

  final String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
- final String pathSuffix = task.getFileNameExtension();
+ final String pathSuffix = task.getFileExt();
  final String sequenceFormat = task.getSequenceFormat();

  return new TransactionalFileOutput()
@@ -211,4 +229,31 @@ public class HdfsFileOutputPlugin
  String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
  return resolved.toString();
  }
+
+ private void deleteInAdvance(FileSystem fs, String pathPrefix, DeleteInAdvancePolicy deleteInAdvancePolicy)
+ throws IOException
+ {
+ final Path globPath = new Path(pathPrefix + "*");
+ switch (deleteInAdvancePolicy) {
+ case NONE:
+ // do nothing
+ break;
+ case FILE_ONLY:
+ for (FileStatus status : fs.globStatus(globPath)) {
+ if (status.isFile()) {
+ logger.debug("delete in advance: {}", status.getPath());
+ fs.delete(status.getPath(), false);
+ }
+ }
+ break;
+ case RECURSIVE:
+ for (FileStatus status : fs.globStatus(globPath)) {
+ logger.debug("delete in advance: {}", status.getPath());
+ fs.delete(status.getPath(), true);
+ }
+ break;
+ default:
+ throw new ConfigException("`delete_in_advance` must not null.");
+ }
+ }
  }
data/src/test/java/org/embulk/output/hdfs/TestHdfsFileOutputPlugin.java CHANGED
@@ -1,5 +1,280 @@
  package org.embulk.output.hdfs;

+ import com.google.common.base.Charsets;
+ import com.google.common.base.Optional;
+ import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
+ import org.embulk.EmbulkTestRuntime;
+ import org.embulk.config.ConfigException;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.TaskReport;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.FileOutputRunner;
+ import org.embulk.spi.OutputPlugin.Control;
+ import org.embulk.spi.Page;
+ import org.embulk.spi.PageTestUtils;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.TransactionalPageOutput;
+ import org.embulk.spi.time.Timestamp;
+ import org.junit.Before;
+ import org.junit.Rule;
+ import org.junit.Test;
+ import org.junit.rules.ExpectedException;
+ import org.junit.rules.TemporaryFolder;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.nio.file.DirectoryStream;
+ import java.nio.file.Files;
+ import java.nio.file.Paths;
+ import java.util.List;
+
+ import static com.google.common.io.Files.readLines;
+ import static org.embulk.output.hdfs.HdfsFileOutputPlugin.*;
+ import static org.embulk.spi.type.Types.*;
+ import static org.hamcrest.CoreMatchers.containsString;
+ import static org.hamcrest.CoreMatchers.hasItem;
+ import static org.hamcrest.CoreMatchers.not;
+ import static org.junit.Assert.assertEquals;
+ import static org.junit.Assert.assertNotEquals;
+ import static org.junit.Assert.assertThat;
+ import static org.msgpack.value.ValueFactory.newMap;
+ import static org.msgpack.value.ValueFactory.newString;
+
  public class TestHdfsFileOutputPlugin
  {
+ @Rule
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
+
+ @Rule
+ public ExpectedException exception = ExpectedException.none();
+
+ @Rule
+ public TemporaryFolder tmpFolder = new TemporaryFolder();
+
+ private Logger logger = runtime.getExec().getLogger(TestHdfsFileOutputPlugin.class);
+ private HdfsFileOutputPlugin plugin;
+ private FileOutputRunner runner;
+ private String pathPrefix;
+
+ private static final Schema SCHEMA = new Schema.Builder()
+ .add("_c0", BOOLEAN)
+ .add("_c1", LONG)
+ .add("_c2", DOUBLE)
+ .add("_c3", STRING)
+ .add("_c4", TIMESTAMP)
+ .add("_c5", JSON)
+ .build();
+
+ @Before
+ public void createResources()
+ throws IOException
+ {
+ plugin = new HdfsFileOutputPlugin();
+ runner = new FileOutputRunner(runtime.getInstance(HdfsFileOutputPlugin.class));
+ pathPrefix = tmpFolder.getRoot().getAbsolutePath() + "/embulk-output-hdfs_";
+ }
+
+ private ConfigSource getBaseConfigSource()
+ {
+ return Exec.newConfigSource()
+ .set("type", "hdfs")
+ .set("path_prefix", pathPrefix)
+ .set("file_ext", "csv")
+ .setNested("formatter", Exec.newConfigSource()
+ .set("type", "csv")
+ .set("newline", "CRLF")
+ .set("newline_in_field", "LF")
+ .set("header_line", true)
+ .set("charset", "UTF-8")
+ .set("quote_policy", "NONE")
+ .set("quote", "\"")
+ .set("escape", "\\")
+ .set("null_string", "")
+ .set("default_timezone", "UTC"));
+ }
+
+ @Test
+ public void testDefaultValues()
+ {
+ ConfigSource config = getBaseConfigSource();
+ PluginTask task = config.loadConfig(PluginTask.class);
+ assertEquals(pathPrefix, task.getPathPrefix());
+ assertEquals("csv", task.getFileExt());
+ assertEquals("%03d.%02d.", task.getSequenceFormat());
+ assertEquals(Lists.newArrayList(), task.getConfigFiles());
+ assertEquals(Maps.newHashMap(), task.getConfig());
+ assertEquals(0, task.getRewindSeconds());
+ assertEquals(false, task.getOverwrite());
+ assertEquals(Optional.absent(), task.getDoas());
+ assertEquals(PluginTask.DeleteInAdvancePolicy.NONE, task.getDeleteInAdvance());
+ }
+
+ @Test(expected = ConfigException.class)
+ public void testRequiredValues()
+ {
+ ConfigSource config = Exec.newConfigSource();
+ PluginTask task = config.loadConfig(PluginTask.class);
+ }
+
+ private List<String> lsR(List<String> names, java.nio.file.Path dir)
+ {
+ try (DirectoryStream<java.nio.file.Path> stream = Files.newDirectoryStream(dir)) {
+ for (java.nio.file.Path path : stream) {
+ if (path.toFile().isDirectory()) {
+ logger.debug("[lsR] find a directory: {}", path.toAbsolutePath().toString());
+ names.add(path.toAbsolutePath().toString());
+ lsR(names, path);
+ }
+ else {
+ logger.debug("[lsR] find a file: {}", path.toAbsolutePath().toString());
+ names.add(path.toAbsolutePath().toString());
+ }
+ }
+ }
+ catch (IOException e) {
+ logger.debug(e.getMessage(), e);
+ }
+ return names;
+ }
+
+ private void run(ConfigSource config)
+ {
+ runner.transaction(config, SCHEMA, 1, new Control()
+ {
+ @Override
+ public List<TaskReport> run(TaskSource taskSource)
+ {
+ TransactionalPageOutput pageOutput = runner.open(taskSource, SCHEMA, 1);
+ boolean committed = false;
+ try {
+ // Result:
+ // _c0,_c1,_c2,_c3,_c4,_c5
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000,{\"k\":\"v\"}
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000,{\"k\":\"v\"}
+ for (Page page : PageTestUtils.buildPage(runtime.getBufferAllocator(), SCHEMA,
+ true, 2L, 3.0D, "45", Timestamp.ofEpochMilli(678L), newMap(newString("k"), newString("v")),
+ true, 2L, 3.0D, "45", Timestamp.ofEpochMilli(678L), newMap(newString("k"), newString("v")))) {
+ pageOutput.add(page);
+ }
+ pageOutput.commit();
+ committed = true;
+ }
+ finally {
+ if (!committed) {
+ pageOutput.abort();
+ }
+ pageOutput.close();
+ }
+ return Lists.newArrayList();
+ }
+ });
+ }
+
+ private void assertRecordsInFile(String filePath)
+ {
+ try {
+ List<String> lines = readLines(new File(filePath),
+ Charsets.UTF_8);
+ for (int i = 0; i < lines.size(); i++) {
+ String[] record = lines.get(i).split(",");
+ if (i == 0) {
+ for (int j = 0; j <= 4; j++) {
+ assertEquals("_c" + j, record[j]);
+ }
+ }
+ else {
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000
+ assertEquals("true", record[0]);
+ assertEquals("2", record[1]);
+ assertEquals("3.0", record[2]);
+ assertEquals("45", record[3]);
+ assertEquals("1970-01-01 00:00:00.678000 +0000", record[4]);
+ assertEquals("{\"k\":\"v\"}", record[5]);
+ }
+ }
+ }
+ catch (IOException e) {
+ logger.debug(e.getMessage(), e);
+ }
+ }
+
+ @Test
+ public void testBulkLoad()
+ {
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"));
+
+ run(config);
+ List<String> fileList = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertThat(fileList, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
+
+ @Test
+ public void testDeleteRECURSIVEInAdvance()
+ throws IOException
+ {
+ for (int n = 0; n <= 10; n++) {
+ tmpFolder.newFile("embulk-output-hdfs_file_" + n + ".txt");
+ tmpFolder.newFolder("embulk-output-hdfs_directory_" + n);
+ }
+
+ List<String> fileListBeforeRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"))
+ .set("delete_in_advance", "RECURSIVE");
+
+ run(config);
+
+ List<String> fileListAfterRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertNotEquals(fileListBeforeRun, fileListAfterRun);
+ assertThat(fileListAfterRun, not(hasItem(containsString("embulk-output-hdfs_directory"))));
+ assertThat(fileListAfterRun, not(hasItem(containsString("txt"))));
+ assertThat(fileListAfterRun, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
+
+ @Test
+ public void testDeleteFILE_ONLYInAdvance()
+ throws IOException
+ {
+ for (int n = 0; n <= 10; n++) {
+ tmpFolder.newFile("embulk-output-hdfs_file_" + n + ".txt");
+ tmpFolder.newFolder("embulk-output-hdfs_directory_" + n);
+ }
+
+ List<String> fileListBeforeRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"))
+ .set("delete_in_advance", "FILE_ONLY");
+
+ run(config);
+
+ List<String> fileListAfterRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertNotEquals(fileListBeforeRun, fileListAfterRun);
+ assertThat(fileListAfterRun, not(hasItem(containsString("txt"))));
+ assertThat(fileListAfterRun, hasItem(containsString("embulk-output-hdfs_directory")));
+ assertThat(fileListAfterRun, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
  }
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-hdfs
  version: !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.2.3
  platform: ruby
  authors:
  - Civitaspo
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-08 00:00:00.000000000 Z
+ date: 2016-04-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -46,10 +46,15 @@ extensions: []
  extra_rdoc_files: []
  files:
  - .gitignore
+ - .travis.yml
  - CHANGELOG.md
  - LICENSE.txt
  - README.md
  - build.gradle
+ - config/checkstyle/checkstyle.xml
+ - config/checkstyle/default.xml
+ - example/config.yml
+ - example/data.csv
  - gradle/wrapper/gradle-wrapper.jar
  - gradle/wrapper/gradle-wrapper.properties
  - gradlew
@@ -80,7 +85,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-output-hdfs-0.2.2.jar
+ - classpath/embulk-output-hdfs-0.2.3.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.0.jar
  - classpath/hadoop-auth-2.6.0.jar