embulk-output-hdfs 0.2.2 → 0.2.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: bc90ac0579a94f207537ad5106b3c733158a007b
- data.tar.gz: 6cf1c80f825e7fdb6609c8a54d4db6546ec4414b
+ metadata.gz: f9fa40ed9c13dbc656239643f3153f160f66ef08
+ data.tar.gz: 61fb3a7a55c94873e58f7edc4ac5e6ae1cf337ae
  SHA512:
- metadata.gz: 27a3f83d4299c8c8ea92a1710730987bbc444911872ba8b78d6e3d863fc6d8ae94c27ff3b70f580006e74cb5c3c0bb59739aaa342665caa9175a433885144a54
- data.tar.gz: 6b5fe6fba56c2843072f8f6ede29d40964f4ec6b4c86a4b6318fad6ed1643e8d752ed46a107330a3ab798594e88cdbaa621e7f54acc74c439f7a8225c85ea321
+ metadata.gz: e131f8221baaa36c20fcd8ee77b88cac8dde80bc20e1663f1d5dae3d54ed88aae08862a5113f928916a2f5ddd19321bbf7270d8cc43109336ffede61e2adc99f
+ data.tar.gz: 0afec8392aeb2d109ebe9beb738d6b5e09f029b5cc04ab16b5b9bc83246a13f8c0a2fd86393da2d924cb46791d44a5f9d78d84ad5f06c60d061f67e61b260c64
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
+ language: java
+ jdk:
+ - openjdk7
+ - oraclejdk7
+ - oraclejdk8
+ script:
+ - ./gradlew test
+ after_success:
+ - ./gradlew jacocoTestReport coveralls
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ 0.2.3 (2016-04-20)
+ ==================
+ - Add: `delete_in_advance` option
+
  0.2.2 (2016-02-02)
  ==================
  - Add: doas option
data/README.md CHANGED
@@ -1,5 +1,8 @@
  # Hdfs file output plugin for Embulk

+ [![Build Status](https://travis-ci.org/civitaspo/embulk-output-hdfs.svg)](https://travis-ci.org/civitaspo/embulk-output-hdfs)
+ [![Coverage Status](https://coveralls.io/repos/github/civitaspo/embulk-output-hdfs/badge.svg?branch=master)](https://coveralls.io/github/civitaspo/embulk-output-hdfs?branch=master)
+
  A File Output Plugin for Embulk to write HDFS.

  ## Overview
@@ -19,6 +22,17 @@ A File Output Plugin for Embulk to write HDFS.
  - **rewind_seconds** When you use a date format in the path_prefix property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this many seconds. (int, default: `0`)
  - **overwrite** overwrite files when the same filenames already exist (boolean, default: `false`)
  - *caution*: even if this property is `true`, it does not guarantee idempotence. If you need idempotent runs, remove the output files before or after running.
+ - **doas** username used to access HDFS (string, default: the executing user)
+ - **delete_in_advance** delete files and directories matching `path_prefix` in advance (enum, default: `NONE`)
+   - `NONE`: do nothing
+   - `FILE_ONLY`: delete matching files only
+   - `RECURSIVE`: delete matching files and directories
+
+ ## CAUTION
+ If you use the `hadoop` user (the HDFS admin user) as `doas` and `delete_in_advance` is `RECURSIVE`,
+ `embulk-output-hdfs` can delete any files and directories that `path_prefix` points at;
+ in other words, it can destroy your HDFS.
+ Please be careful when combining the `delete_in_advance` and `doas` options.

  ## Example

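The three `delete_in_advance` policies documented in the README above boil down to a glob over `path_prefix + "*"` on the configured Hadoop `FileSystem`, followed by conditional deletes. The sketch below is a simplified, standalone rendering of the `deleteInAdvance` method added to `HdfsFileOutputPlugin.java` later in this diff; the class name, the `main` method, the String-typed policy argument, and the example path are illustrative only.

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative class; not part of the plugin.
public class DeleteInAdvanceSketch
{
    // NONE: do nothing. FILE_ONLY: delete plain files matching the glob.
    // RECURSIVE: delete matching files and directories, including their contents.
    static void deleteInAdvance(FileSystem fs, String pathPrefix, String policy)
            throws IOException
    {
        if (policy.equals("NONE")) {
            return;
        }
        Path glob = new Path(pathPrefix + "*");
        for (FileStatus status : fs.globStatus(glob)) {
            if (policy.equals("RECURSIVE") || status.isFile()) {
                fs.delete(status.getPath(), policy.equals("RECURSIVE"));
            }
        }
    }

    public static void main(String[] args) throws IOException
    {
        // The local filesystem stands in for HDFS here, as in the plugin's tests.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        deleteInAdvance(fs, "/tmp/embulk-output-hdfs_example/file_", "FILE_ONLY");
    }
}
```

As the README caution notes, `RECURSIVE` combined with an admin-level `doas` user will happily remove whole directory trees under the glob, so double-check `path_prefix` before enabling it.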
data/build.gradle CHANGED
@@ -2,6 +2,9 @@ plugins {
  id "com.jfrog.bintray" version "1.1"
  id "com.github.jruby-gradle.base" version "0.1.5"
  id "java"
+ id "checkstyle"
+ id "com.github.kt3k.coveralls" version "2.4.0"
+ id "jacoco"
  }
  import com.github.jrubygradle.JRubyExec
  repositories {
@@ -12,18 +15,20 @@ configurations {
  provided
  }

- version = "0.2.2"
+ version = "0.2.3"

  sourceCompatibility = 1.7
  targetCompatibility = 1.7

  dependencies {
- compile "org.embulk:embulk-core:0.7.0"
- provided "org.embulk:embulk-core:0.7.0"
+ compile "org.embulk:embulk-core:0.8.8"
+ provided "org.embulk:embulk-core:0.8.8"
  // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
  compile 'org.apache.hadoop:hadoop-client:2.6.0'
  compile 'com.google.guava:guava:15.0'
  testCompile "junit:junit:4.+"
+ testCompile "org.embulk:embulk-core:0.8.8:tests"
+ testCompile "org.embulk:embulk-standards:0.8.8"
  }

  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -33,6 +38,29 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
  }
  clean { delete "classpath" }

+ jacocoTestReport {
+ reports {
+ xml.enabled = true // coveralls plugin depends on xml format report
+ html.enabled = true
+ }
+ }
+ checkstyle {
+ configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
+ toolVersion = '6.14.1'
+ }
+ checkstyleMain {
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+ ignoreFailures = true
+ }
+ checkstyleTest {
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+ ignoreFailures = true
+ }
+ task checkstyle(type: Checkstyle) {
+ classpath = sourceSets.main.output + sourceSets.test.output
+ source = sourceSets.main.allJava + sourceSets.test.allJava
+ }
+
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
  jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
  script "${project.name}.gemspec"
data/config/checkstyle/checkstyle.xml ADDED
@@ -0,0 +1,128 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE module PUBLIC
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+ <module name="Checker">
+ <!-- https://github.com/facebook/presto/blob/master/src/checkstyle/checks.xml -->
+ <module name="FileTabCharacter"/>
+ <module name="NewlineAtEndOfFile">
+ <property name="lineSeparator" value="lf"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\r"/>
+ <property name="message" value="Line contains carriage return"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value=" \n"/>
+ <property name="message" value="Line has trailing whitespace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\{\n\n"/>
+ <property name="message" value="Blank line after opening brace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\s*\}"/>
+ <property name="message" value="Blank line before closing brace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\n"/>
+ <property name="message" value="Multiple consecutive blank lines"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\Z"/>
+ <property name="message" value="Blank line before end of file"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="Preconditions\.checkNotNull"/>
+ <property name="message" value="Use of checkNotNull"/>
+ </module>
+
+ <module name="TreeWalker">
+ <module name="EmptyBlock">
+ <property name="option" value="text"/>
+ <property name="tokens" value="
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+ </module>
+ <module name="EmptyStatement"/>
+ <module name="EmptyForInitializerPad"/>
+ <module name="EmptyForIteratorPad">
+ <property name="option" value="space"/>
+ </module>
+ <module name="MethodParamPad">
+ <property name="allowLineBreaks" value="true"/>
+ <property name="option" value="nospace"/>
+ </module>
+ <module name="ParenPad"/>
+ <module name="TypecastParenPad"/>
+ <module name="NeedBraces"/>
+ <module name="LeftCurly">
+ <property name="option" value="nl"/>
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+ </module>
+ <module name="LeftCurly">
+ <property name="option" value="eol"/>
+ <property name="tokens" value="
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+ </module>
+ <module name="RightCurly">
+ <property name="option" value="alone"/>
+ </module>
+ <module name="GenericWhitespace"/>
+ <module name="WhitespaceAfter"/>
+ <module name="NoWhitespaceBefore"/>
+
+ <module name="UpperEll"/>
+ <module name="DefaultComesLast"/>
+ <module name="ArrayTypeStyle"/>
+ <module name="MultipleVariableDeclarations"/>
+ <module name="ModifierOrder"/>
+ <module name="OneStatementPerLine"/>
+ <module name="StringLiteralEquality"/>
+ <module name="MutableException"/>
+ <module name="EqualsHashCode"/>
+ <module name="InnerAssignment"/>
+ <module name="InterfaceIsType"/>
+ <module name="HideUtilityClassConstructor"/>
+
+ <module name="MemberName"/>
+ <module name="LocalVariableName"/>
+ <module name="LocalFinalVariableName"/>
+ <module name="TypeName"/>
+ <module name="PackageName"/>
+ <module name="ParameterName"/>
+ <module name="StaticVariableName"/>
+ <module name="ClassTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+ <module name="MethodTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+
+ <module name="AvoidStarImport"/>
+ <module name="RedundantImport"/>
+ <module name="UnusedImports"/>
+ <module name="ImportOrder">
+ <property name="groups" value="*,javax,java"/>
+ <property name="separated" value="true"/>
+ <property name="option" value="bottom"/>
+ <property name="sortStaticImportsAlphabetically" value="true"/>
+ </module>
+
+ <module name="WhitespaceAround">
+ <property name="allowEmptyConstructors" value="true"/>
+ <property name="allowEmptyMethods" value="true"/>
+ <property name="ignoreEnhancedForColon" value="false"/>
+ <property name="tokens" value="
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+ </module>
+ </module>
+ </module>
data/config/checkstyle/default.xml ADDED
@@ -0,0 +1,108 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE module PUBLIC
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+ <!--
+ This is a subset of ./checkstyle.xml which allows some loose styles
+ -->
+ <module name="Checker">
+ <module name="FileTabCharacter"/>
+ <module name="NewlineAtEndOfFile">
+ <property name="lineSeparator" value="lf"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\r"/>
+ <property name="message" value="Line contains carriage return"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value=" \n"/>
+ <property name="message" value="Line has trailing whitespace"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\n"/>
+ <property name="message" value="Multiple consecutive blank lines"/>
+ </module>
+ <module name="RegexpMultiline">
+ <property name="format" value="\n\n\Z"/>
+ <property name="message" value="Blank line before end of file"/>
+ </module>
+
+ <module name="TreeWalker">
+ <module name="EmptyBlock">
+ <property name="option" value="text"/>
+ <property name="tokens" value="
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+ </module>
+ <module name="EmptyStatement"/>
+ <module name="EmptyForInitializerPad"/>
+ <module name="EmptyForIteratorPad">
+ <property name="option" value="space"/>
+ </module>
+ <module name="MethodParamPad">
+ <property name="allowLineBreaks" value="true"/>
+ <property name="option" value="nospace"/>
+ </module>
+ <module name="ParenPad"/>
+ <module name="TypecastParenPad"/>
+ <module name="NeedBraces"/>
+ <module name="LeftCurly">
+ <property name="option" value="nl"/>
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+ </module>
+ <module name="LeftCurly">
+ <property name="option" value="eol"/>
+ <property name="tokens" value="
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+ </module>
+ <module name="RightCurly">
+ <property name="option" value="alone"/>
+ </module>
+ <module name="GenericWhitespace"/>
+ <module name="WhitespaceAfter"/>
+ <module name="NoWhitespaceBefore"/>
+
+ <module name="UpperEll"/>
+ <module name="DefaultComesLast"/>
+ <module name="ArrayTypeStyle"/>
+ <module name="MultipleVariableDeclarations"/>
+ <module name="ModifierOrder"/>
+ <module name="OneStatementPerLine"/>
+ <module name="StringLiteralEquality"/>
+ <module name="MutableException"/>
+ <module name="EqualsHashCode"/>
+ <module name="InnerAssignment"/>
+ <module name="InterfaceIsType"/>
+ <module name="HideUtilityClassConstructor"/>
+
+ <module name="MemberName"/>
+ <module name="LocalVariableName"/>
+ <module name="LocalFinalVariableName"/>
+ <module name="TypeName"/>
+ <module name="PackageName"/>
+ <module name="ParameterName"/>
+ <module name="StaticVariableName"/>
+ <module name="ClassTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+ <module name="MethodTypeParameterName">
+ <property name="format" value="^[A-Z][0-9]?$"/>
+ </module>
+
+ <module name="WhitespaceAround">
+ <property name="allowEmptyConstructors" value="true"/>
+ <property name="allowEmptyMethods" value="true"/>
+ <property name="ignoreEnhancedForColon" value="false"/>
+ <property name="tokens" value="
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+ </module>
+ </module>
+ </module>
data/example/config.yml ADDED
@@ -0,0 +1,52 @@
+ hdfs_example: &hdfs_example
+ config_files:
+ - /etc/hadoop/conf/core-site.xml
+ - /etc/hadoop/conf/hdfs-site.xml
+ config:
+ fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+ fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+ fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+ local_fs_example: &local_fs_example
+ config:
+ fs.defaultFS: 'file:///'
+ fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+ fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+ io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
+
+ in:
+ type: file
+ path_prefix: example/data
+ parser:
+ charset: UTF-8
+ newline: CRLF
+ type: csv
+ delimiter: ','
+ quote: '"'
+ header_line: true
+ stop_on_invalid_record: true
+ columns:
+ - {name: id, type: long}
+ - {name: account, type: long}
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+ - {name: purchase, type: timestamp, format: '%Y%m%d'}
+ - {name: comment, type: string}
+
+
+ out:
+ type: hdfs
+ <<: *local_fs_example
+ path_prefix: /tmp/embulk-output-hdfs_example/file_
+ file_ext: csv
+ delete_in_advance: FILE_ONLY
+ formatter:
+ type: csv
+ newline: CRLF
+ newline_in_field: LF
+ header_line: true
+ charset: UTF-8
+ quote_policy: NONE
+ quote: '"'
+ escape: '\'
+ null_string: ''
+ default_timezone: UTC
data/example/data.csv ADDED
@@ -0,0 +1,5 @@
+ id,account,time,purchase,comment
+ 1,32864,2015-01-27 19:23:49,20150127,embulk
+ 2,14824,2015-01-27 19:01:23,20150127,embulk jruby
+ 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin"
+ 4,11270,2015-01-29 11:54:36,20150129,NULL
data/gradle/wrapper/gradle-wrapper.jar CHANGED
Binary file
data/gradle/wrapper/gradle-wrapper.properties CHANGED
@@ -1,6 +1,6 @@
- #Tue Aug 11 00:26:20 PDT 2015
+ #Wed Jan 13 12:41:02 JST 2016
  distributionBase=GRADLE_USER_HOME
  distributionPath=wrapper/dists
  zipStoreBase=GRADLE_USER_HOME
  zipStorePath=wrapper/dists
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
@@ -3,11 +3,13 @@ package org.embulk.output.hdfs;
  import com.google.common.base.Optional;
  import com.google.common.base.Throwables;
  import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigDiff;
+ import org.embulk.config.ConfigException;
  import org.embulk.config.ConfigSource;
  import org.embulk.config.Task;
  import org.embulk.config.TaskReport;
@@ -27,6 +29,8 @@ import java.util.ArrayList;
  import java.util.List;
  import java.util.Map;

+ import static org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask.*;
+
  public class HdfsFileOutputPlugin
  implements FileOutputPlugin
  {
@@ -37,33 +41,38 @@ public class HdfsFileOutputPlugin
  {
  @Config("config_files")
  @ConfigDefault("[]")
- public List<String> getConfigFiles();
+ List<String> getConfigFiles();

  @Config("config")
  @ConfigDefault("{}")
- public Map<String, String> getConfig();
+ Map<String, String> getConfig();

  @Config("path_prefix")
- public String getPathPrefix();
+ String getPathPrefix();

  @Config("file_ext")
- public String getFileNameExtension();
+ String getFileExt();

  @Config("sequence_format")
  @ConfigDefault("\"%03d.%02d.\"")
- public String getSequenceFormat();
+ String getSequenceFormat();

  @Config("rewind_seconds")
  @ConfigDefault("0")
- public int getRewindSeconds();
+ int getRewindSeconds();

  @Config("overwrite")
  @ConfigDefault("false")
- public boolean getOverwrite();
+ boolean getOverwrite();

  @Config("doas")
  @ConfigDefault("null")
- public Optional<String> getDoas();
+ Optional<String> getDoas();
+
+ enum DeleteInAdvancePolicy { NONE, FILE_ONLY, RECURSIVE }
+ @Config("delete_in_advance")
+ @ConfigDefault("\"NONE\"")
+ DeleteInAdvancePolicy getDeleteInAdvance();
  }

  @Override
@@ -72,6 +81,15 @@ public class HdfsFileOutputPlugin
  {
  PluginTask task = config.loadConfig(PluginTask.class);

+ try {
+ String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
+ FileSystem fs = getFs(task);
+ deleteInAdvance(fs, pathPrefix, task.getDeleteInAdvance());
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
  control.run(task.dump());
  return Exec.newConfigDiff();
  }
@@ -97,7 +115,7 @@ public class HdfsFileOutputPlugin
  final PluginTask task = taskSource.loadTask(PluginTask.class);

  final String pathPrefix = strftime(task.getPathPrefix(), task.getRewindSeconds());
- final String pathSuffix = task.getFileNameExtension();
+ final String pathSuffix = task.getFileExt();
  final String sequenceFormat = task.getSequenceFormat();

  return new TransactionalFileOutput()
@@ -211,4 +229,31 @@ public class HdfsFileOutputPlugin
  String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
  return resolved.toString();
  }
+
+ private void deleteInAdvance(FileSystem fs, String pathPrefix, DeleteInAdvancePolicy deleteInAdvancePolicy)
+ throws IOException
+ {
+ final Path globPath = new Path(pathPrefix + "*");
+ switch (deleteInAdvancePolicy) {
+ case NONE:
+ // do nothing
+ break;
+ case FILE_ONLY:
+ for (FileStatus status : fs.globStatus(globPath)) {
+ if (status.isFile()) {
+ logger.debug("delete in advance: {}", status.getPath());
+ fs.delete(status.getPath(), false);
+ }
+ }
+ break;
+ case RECURSIVE:
+ for (FileStatus status : fs.globStatus(globPath)) {
+ logger.debug("delete in advance: {}", status.getPath());
+ fs.delete(status.getPath(), true);
+ }
+ break;
+ default:
+ throw new ConfigException("`delete_in_advance` must not be null.");
+ }
+ }
  }
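For context on the `strftime` call seen in the hunk above: the plugin builds the Ruby expression `(Time.now - rewind_seconds).strftime(path_prefix)` and returns its string result, which is how a date-formatted `path_prefix` gets resolved against "now minus rewind_seconds". The sketch below reproduces that resolution with the JRuby embed API (`org.jruby.embed.ScriptingContainer`); treating that API as the evaluation mechanism is an assumption on my part, and the class and method names here are illustrative only.

```java
import org.jruby.embed.ScriptingContainer;

// Illustrative helper; mirrors the format string used in the plugin's strftime method.
public class PathPrefixSketch
{
    static String resolve(String pathPrefix, int rewindSeconds)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        // e.g. "(Time.now - 86400).strftime('/tmp/embulk/%Y-%m-%d/out')"
        Object resolved = jruby.runScriptlet(
                String.format("(Time.now - %s).strftime('%s')", rewindSeconds, pathPrefix));
        return resolved.toString();
    }

    public static void main(String[] args)
    {
        // With rewind_seconds = 86400 the path resolves against "yesterday",
        // so a job running just after midnight still writes under the previous day's directory.
        System.out.println(resolve("/tmp/embulk/%Y-%m-%d/out", 86400));
    }
}
```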
@@ -1,5 +1,280 @@
  package org.embulk.output.hdfs;

+ import com.google.common.base.Charsets;
+ import com.google.common.base.Optional;
+ import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
+ import org.embulk.EmbulkTestRuntime;
+ import org.embulk.config.ConfigException;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.TaskReport;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.FileOutputRunner;
+ import org.embulk.spi.OutputPlugin.Control;
+ import org.embulk.spi.Page;
+ import org.embulk.spi.PageTestUtils;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.TransactionalPageOutput;
+ import org.embulk.spi.time.Timestamp;
+ import org.junit.Before;
+ import org.junit.Rule;
+ import org.junit.Test;
+ import org.junit.rules.ExpectedException;
+ import org.junit.rules.TemporaryFolder;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.nio.file.DirectoryStream;
+ import java.nio.file.Files;
+ import java.nio.file.Paths;
+ import java.util.List;
+
+ import static com.google.common.io.Files.readLines;
+ import static org.embulk.output.hdfs.HdfsFileOutputPlugin.*;
+ import static org.embulk.spi.type.Types.*;
+ import static org.hamcrest.CoreMatchers.containsString;
+ import static org.hamcrest.CoreMatchers.hasItem;
+ import static org.hamcrest.CoreMatchers.not;
+ import static org.junit.Assert.assertEquals;
+ import static org.junit.Assert.assertNotEquals;
+ import static org.junit.Assert.assertThat;
+ import static org.msgpack.value.ValueFactory.newMap;
+ import static org.msgpack.value.ValueFactory.newString;
+
  public class TestHdfsFileOutputPlugin
  {
+ @Rule
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
+
+ @Rule
+ public ExpectedException exception = ExpectedException.none();
+
+ @Rule
+ public TemporaryFolder tmpFolder = new TemporaryFolder();
+
+ private Logger logger = runtime.getExec().getLogger(TestHdfsFileOutputPlugin.class);
+ private HdfsFileOutputPlugin plugin;
+ private FileOutputRunner runner;
+ private String pathPrefix;
+
+ private static final Schema SCHEMA = new Schema.Builder()
+ .add("_c0", BOOLEAN)
+ .add("_c1", LONG)
+ .add("_c2", DOUBLE)
+ .add("_c3", STRING)
+ .add("_c4", TIMESTAMP)
+ .add("_c5", JSON)
+ .build();
+
+ @Before
+ public void createResources()
+ throws IOException
+ {
+ plugin = new HdfsFileOutputPlugin();
+ runner = new FileOutputRunner(runtime.getInstance(HdfsFileOutputPlugin.class));
+ pathPrefix = tmpFolder.getRoot().getAbsolutePath() + "/embulk-output-hdfs_";
+ }
+
+ private ConfigSource getBaseConfigSource()
+ {
+ return Exec.newConfigSource()
+ .set("type", "hdfs")
+ .set("path_prefix", pathPrefix)
+ .set("file_ext", "csv")
+ .setNested("formatter", Exec.newConfigSource()
+ .set("type", "csv")
+ .set("newline", "CRLF")
+ .set("newline_in_field", "LF")
+ .set("header_line", true)
+ .set("charset", "UTF-8")
+ .set("quote_policy", "NONE")
+ .set("quote", "\"")
+ .set("escape", "\\")
+ .set("null_string", "")
+ .set("default_timezone", "UTC"));
+ }
+
+ @Test
+ public void testDefaultValues()
+ {
+ ConfigSource config = getBaseConfigSource();
+ PluginTask task = config.loadConfig(PluginTask.class);
+ assertEquals(pathPrefix, task.getPathPrefix());
+ assertEquals("csv", task.getFileExt());
+ assertEquals("%03d.%02d.", task.getSequenceFormat());
+ assertEquals(Lists.newArrayList(), task.getConfigFiles());
+ assertEquals(Maps.newHashMap(), task.getConfig());
+ assertEquals(0, task.getRewindSeconds());
+ assertEquals(false, task.getOverwrite());
+ assertEquals(Optional.absent(), task.getDoas());
+ assertEquals(PluginTask.DeleteInAdvancePolicy.NONE, task.getDeleteInAdvance());
+ }
+
+ @Test(expected = ConfigException.class)
+ public void testRequiredValues()
+ {
+ ConfigSource config = Exec.newConfigSource();
+ PluginTask task = config.loadConfig(PluginTask.class);
+ }
+
+ private List<String> lsR(List<String> names, java.nio.file.Path dir)
+ {
+ try (DirectoryStream<java.nio.file.Path> stream = Files.newDirectoryStream(dir)) {
+ for (java.nio.file.Path path : stream) {
+ if (path.toFile().isDirectory()) {
+ logger.debug("[lsR] find a directory: {}", path.toAbsolutePath().toString());
+ names.add(path.toAbsolutePath().toString());
+ lsR(names, path);
+ }
+ else {
+ logger.debug("[lsR] find a file: {}", path.toAbsolutePath().toString());
+ names.add(path.toAbsolutePath().toString());
+ }
+ }
+ }
+ catch (IOException e) {
+ logger.debug(e.getMessage(), e);
+ }
+ return names;
+ }
+
+ private void run(ConfigSource config)
+ {
+ runner.transaction(config, SCHEMA, 1, new Control()
+ {
+ @Override
+ public List<TaskReport> run(TaskSource taskSource)
+ {
+ TransactionalPageOutput pageOutput = runner.open(taskSource, SCHEMA, 1);
+ boolean committed = false;
+ try {
+ // Result:
+ // _c0,_c1,_c2,_c3,_c4,_c5
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000,{\"k\":\"v\"}
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000,{\"k\":\"v\"}
+ for (Page page : PageTestUtils.buildPage(runtime.getBufferAllocator(), SCHEMA,
+ true, 2L, 3.0D, "45", Timestamp.ofEpochMilli(678L), newMap(newString("k"), newString("v")),
+ true, 2L, 3.0D, "45", Timestamp.ofEpochMilli(678L), newMap(newString("k"), newString("v")))) {
+ pageOutput.add(page);
+ }
+ pageOutput.commit();
+ committed = true;
+ }
+ finally {
+ if (!committed) {
+ pageOutput.abort();
+ }
+ pageOutput.close();
+ }
+ return Lists.newArrayList();
+ }
+ });
+ }
+
+ private void assertRecordsInFile(String filePath)
+ {
+ try {
+ List<String> lines = readLines(new File(filePath),
+ Charsets.UTF_8);
+ for (int i = 0; i < lines.size(); i++) {
+ String[] record = lines.get(i).split(",");
+ if (i == 0) {
+ for (int j = 0; j <= 4; j++) {
+ assertEquals("_c" + j, record[j]);
+ }
+ }
+ else {
+ // true,2,3.0,45,1970-01-01 00:00:00.678000 +0000
+ assertEquals("true", record[0]);
+ assertEquals("2", record[1]);
+ assertEquals("3.0", record[2]);
+ assertEquals("45", record[3]);
+ assertEquals("1970-01-01 00:00:00.678000 +0000", record[4]);
+ assertEquals("{\"k\":\"v\"}", record[5]);
+ }
+ }
+ }
+ catch (IOException e) {
+ logger.debug(e.getMessage(), e);
+ }
+ }
+
+ @Test
+ public void testBulkLoad()
+ {
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"));
+
+ run(config);
+ List<String> fileList = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertThat(fileList, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
+
+ @Test
+ public void testDeleteRECURSIVEInAdvance()
+ throws IOException
+ {
+ for (int n = 0; n <= 10; n++) {
+ tmpFolder.newFile("embulk-output-hdfs_file_" + n + ".txt");
+ tmpFolder.newFolder("embulk-output-hdfs_directory_" + n);
+ }
+
+ List<String> fileListBeforeRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"))
+ .set("delete_in_advance", "RECURSIVE");
+
+ run(config);
+
+ List<String> fileListAfterRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertNotEquals(fileListBeforeRun, fileListAfterRun);
+ assertThat(fileListAfterRun, not(hasItem(containsString("embulk-output-hdfs_directory"))));
+ assertThat(fileListAfterRun, not(hasItem(containsString("txt"))));
+ assertThat(fileListAfterRun, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
+
+ @Test
+ public void testDeleteFILE_ONLYInAdvance()
+ throws IOException
+ {
+ for (int n = 0; n <= 10; n++) {
+ tmpFolder.newFile("embulk-output-hdfs_file_" + n + ".txt");
+ tmpFolder.newFolder("embulk-output-hdfs_directory_" + n);
+ }
+
+ List<String> fileListBeforeRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+
+ ConfigSource config = getBaseConfigSource()
+ .setNested("config", Exec.newConfigSource()
+ .set("fs.hdfs.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem")
+ .set("fs.defaultFS", "file:///"))
+ .set("delete_in_advance", "FILE_ONLY");
+
+ run(config);
+
+ List<String> fileListAfterRun = lsR(Lists.<String>newArrayList(), Paths.get(tmpFolder.getRoot().getAbsolutePath()));
+ assertNotEquals(fileListBeforeRun, fileListAfterRun);
+ assertThat(fileListAfterRun, not(hasItem(containsString("txt"))));
+ assertThat(fileListAfterRun, hasItem(containsString("embulk-output-hdfs_directory")));
+ assertThat(fileListAfterRun, hasItem(containsString(pathPrefix + "001.00.csv")));
+ assertRecordsInFile(String.format("%s/%s001.00.csv",
+ tmpFolder.getRoot().getAbsolutePath(),
+ pathPrefix));
+ }
  }
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-hdfs
  version: !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.2.3
  platform: ruby
  authors:
  - Civitaspo
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-08 00:00:00.000000000 Z
+ date: 2016-04-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -46,10 +46,15 @@ extensions: []
  extra_rdoc_files: []
  files:
  - .gitignore
+ - .travis.yml
  - CHANGELOG.md
  - LICENSE.txt
  - README.md
  - build.gradle
+ - config/checkstyle/checkstyle.xml
+ - config/checkstyle/default.xml
+ - example/config.yml
+ - example/data.csv
  - gradle/wrapper/gradle-wrapper.jar
  - gradle/wrapper/gradle-wrapper.properties
  - gradlew
@@ -80,7 +85,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-output-hdfs-0.2.2.jar
+ - classpath/embulk-output-hdfs-0.2.3.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.0.jar
  - classpath/hadoop-auth-2.6.0.jar