embulk-input-hdfs 0.1.8 → 0.1.9
- checksums.yaml +4 -4
- data/.gitignore +4 -1
- data/.travis.yml +9 -0
- data/README.md +6 -3
- data/build.gradle +33 -6
- data/classpath/commons-collections-3.2.2.jar +0 -0
- data/classpath/embulk-input-hdfs-0.1.9.jar +0 -0
- data/classpath/{hadoop-annotations-2.6.0.jar → hadoop-annotations-2.6.3.jar} +0 -0
- data/classpath/hadoop-auth-2.6.3.jar +0 -0
- data/classpath/hadoop-client-2.6.3.jar +0 -0
- data/classpath/{hadoop-common-2.6.0.jar → hadoop-common-2.6.3.jar} +0 -0
- data/classpath/{hadoop-hdfs-2.6.0.jar → hadoop-hdfs-2.6.3.jar} +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.3.jar +0 -0
- data/classpath/{hadoop-mapreduce-client-common-2.6.0.jar → hadoop-mapreduce-client-common-2.6.3.jar} +0 -0
- data/classpath/{hadoop-mapreduce-client-core-2.6.0.jar → hadoop-mapreduce-client-core-2.6.3.jar} +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.3.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.3.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.3.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.3.jar +0 -0
- data/classpath/{hadoop-yarn-common-2.6.0.jar → hadoop-yarn-common-2.6.3.jar} +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.3.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.3.jar +0 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/example/config.yml +35 -0
- data/example/data.csv +5 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +91 -13
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +2 -1
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +3 -4
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java +20 -11
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +227 -0
- data/src/test/resources/sample_01.csv +5 -0
- data/src/test/resources/sample_02.csv +5 -0
- metadata +27 -20
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/embulk-input-hdfs-0.1.8.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ce120e7049f33e30dd23af9f8b7bcedc1a246457
+  data.tar.gz: a2dc70fee60be2ab535df3549e99304e751a7b7a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a37baf6f948dff41f694457dc9ea9ea9270e41473642114d4dc7a569c61550471b9dbc440478c638fe56ba79956f043097e2129302d3ae12511bdc9d33cef994
+  data.tar.gz: 16922c84dcdb9715cb1b0377886b36192acdda31a037352e18df83895f33b09a9f275cd02b9662f02ee411725a6dae65950cfc256c707f639312810839018037
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,4 +1,6 @@
 # Hdfs file input plugin for Embulk
+[![Build Status](https://travis-ci.org/civitaspo/embulk-input-hdfs.svg)](https://travis-ci.org/civitaspo/embulk-input-hdfs)
+[![Coverage Status](https://coveralls.io/repos/civitaspo/embulk-input-hdfs/badge.svg?branch=master&service=github)](https://coveralls.io/github/civitaspo/embulk-input-hdfs?branch=master)
 
 Read files on Hdfs.
 
@@ -16,6 +18,7 @@ Read files on Hdfs.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
 - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
+- **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
 
 ## Example
 
@@ -23,8 +26,8 @@ Read files on Hdfs.
 in:
   type: hdfs
   config_files:
-    - /
-    - /
+    - /etc/hadoop/conf/core-site.xml
+    - /etc/hadoop/conf/hdfs-site.xml
   config:
     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     dfs.replication: 1
@@ -106,4 +109,4 @@ $ ./gradlew gem
 ```
 $ ./gradlew classpath
 $ bundle exec embulk run -I lib example.yml
-```
+```
data/build.gradle
CHANGED
@@ -2,6 +2,9 @@ plugins {
     id "com.jfrog.bintray" version "1.1"
     id "com.github.jruby-gradle.base" version "0.1.5"
     id "java"
+    id "checkstyle"
+    id "com.github.kt3k.coveralls" version "2.4.0"
+    id "jacoco"
 }
 import com.github.jrubygradle.JRubyExec
 repositories {
@@ -12,18 +15,19 @@ configurations {
     provided
 }
 
-version = "0.1.
+version = "0.1.9"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 
 dependencies {
-    compile "org.embulk:embulk-core:0.
-    provided "org.embulk:embulk-core:0.
+    compile "org.embulk:embulk-core:0.8.+"
+    provided "org.embulk:embulk-core:0.8.+"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
-    compile 'org.apache.hadoop:hadoop-client:2.6
-    compile 'com.google.guava:guava:15.0'
+    compile 'org.apache.hadoop:hadoop-client:2.6.+'
     testCompile "junit:junit:4.+"
+    testCompile "org.embulk:embulk-core:0.8.+:tests"
+    testCompile "org.embulk:embulk-standards:0.8.+"
 }
 
 task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -33,6 +37,29 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
 }
 clean { delete "classpath" }
 
+jacocoTestReport {
+    reports {
+        xml.enabled = true // coveralls plugin depends on xml format report
+        html.enabled = true
+    }
+}
+checkstyle {
+    configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
+    toolVersion = '6.14.1'
+}
+checkstyleMain {
+    configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+    ignoreFailures = true
+}
+checkstyleTest {
+    configFile = file("${project.rootDir}/config/checkstyle/default.xml")
+    ignoreFailures = true
+}
+task checkstyle(type: Checkstyle) {
+    classpath = sourceSets.main.output + sourceSets.test.output
+    source = sourceSets.main.allJava + sourceSets.test.allJava
+}
+
 
 task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
     jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
     script "${project.name}.gemspec"
@@ -57,7 +84,7 @@ task gemspec {
         Gem::Specification.new do |spec|
             spec.name          = "${project.name}"
             spec.version       = "${project.version}"
-            spec.authors       = ["
+            spec.authors       = ["Civitaspo"]
             spec.summary       = %[Hdfs file input plugin for Embulk]
             spec.description   = %[Reads files stored on Hdfs.]
             spec.email         = ["civitaspo@gmail.com"]
(Updated and added classpath jars are binary files; no textual diff.)

data/classpath/{hadoop-mapreduce-client-common-2.6.0.jar → hadoop-mapreduce-client-common-2.6.3.jar}
RENAMED
Binary file

data/classpath/{hadoop-mapreduce-client-core-2.6.0.jar → hadoop-mapreduce-client-core-2.6.3.jar}
RENAMED
Binary file

data/config/checkstyle/checkstyle.xml
ADDED
@@ -0,0 +1,128 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE module PUBLIC
+        "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+        "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+<module name="Checker">
+    <!-- https://github.com/facebook/presto/blob/master/src/checkstyle/checks.xml -->
+    <module name="FileTabCharacter"/>
+    <module name="NewlineAtEndOfFile">
+        <property name="lineSeparator" value="lf"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\r"/>
+        <property name="message" value="Line contains carriage return"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value=" \n"/>
+        <property name="message" value="Line has trailing whitespace"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\{\n\n"/>
+        <property name="message" value="Blank line after opening brace"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\n\n\s*\}"/>
+        <property name="message" value="Blank line before closing brace"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\n\n\n"/>
+        <property name="message" value="Multiple consecutive blank lines"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\n\n\Z"/>
+        <property name="message" value="Blank line before end of file"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="Preconditions\.checkNotNull"/>
+        <property name="message" value="Use of checkNotNull"/>
+    </module>
+
+    <module name="TreeWalker">
+        <module name="EmptyBlock">
+            <property name="option" value="text"/>
+            <property name="tokens" value="
+                LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+                LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+        </module>
+        <module name="EmptyStatement"/>
+        <module name="EmptyForInitializerPad"/>
+        <module name="EmptyForIteratorPad">
+            <property name="option" value="space"/>
+        </module>
+        <module name="MethodParamPad">
+            <property name="allowLineBreaks" value="true"/>
+            <property name="option" value="nospace"/>
+        </module>
+        <module name="ParenPad"/>
+        <module name="TypecastParenPad"/>
+        <module name="NeedBraces"/>
+        <module name="LeftCurly">
+            <property name="option" value="nl"/>
+            <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+        </module>
+        <module name="LeftCurly">
+            <property name="option" value="eol"/>
+            <property name="tokens" value="
+                LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+                LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+        </module>
+        <module name="RightCurly">
+            <property name="option" value="alone"/>
+        </module>
+        <module name="GenericWhitespace"/>
+        <module name="WhitespaceAfter"/>
+        <module name="NoWhitespaceBefore"/>
+
+        <module name="UpperEll"/>
+        <module name="DefaultComesLast"/>
+        <module name="ArrayTypeStyle"/>
+        <module name="MultipleVariableDeclarations"/>
+        <module name="ModifierOrder"/>
+        <module name="OneStatementPerLine"/>
+        <module name="StringLiteralEquality"/>
+        <module name="MutableException"/>
+        <module name="EqualsHashCode"/>
+        <module name="InnerAssignment"/>
+        <module name="InterfaceIsType"/>
+        <module name="HideUtilityClassConstructor"/>
+
+        <module name="MemberName"/>
+        <module name="LocalVariableName"/>
+        <module name="LocalFinalVariableName"/>
+        <module name="TypeName"/>
+        <module name="PackageName"/>
+        <module name="ParameterName"/>
+        <module name="StaticVariableName"/>
+        <module name="ClassTypeParameterName">
+            <property name="format" value="^[A-Z][0-9]?$"/>
+        </module>
+        <module name="MethodTypeParameterName">
+            <property name="format" value="^[A-Z][0-9]?$"/>
+        </module>
+
+        <module name="AvoidStarImport"/>
+        <module name="RedundantImport"/>
+        <module name="UnusedImports"/>
+        <module name="ImportOrder">
+            <property name="groups" value="*,javax,java"/>
+            <property name="separated" value="true"/>
+            <property name="option" value="bottom"/>
+            <property name="sortStaticImportsAlphabetically" value="true"/>
+        </module>
+
+        <module name="WhitespaceAround">
+            <property name="allowEmptyConstructors" value="true"/>
+            <property name="allowEmptyMethods" value="true"/>
+            <property name="ignoreEnhancedForColon" value="false"/>
+            <property name="tokens" value="
+                ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+                BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+                LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+                LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+                LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+                LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+                PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+                STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+        </module>
+    </module>
+</module>
data/config/checkstyle/default.xml
ADDED
@@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE module PUBLIC
+        "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
+        "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+<!--
+This is a subset of ./checkstyle.xml which allows some loose styles
+-->
+<module name="Checker">
+    <module name="FileTabCharacter"/>
+    <module name="NewlineAtEndOfFile">
+        <property name="lineSeparator" value="lf"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\r"/>
+        <property name="message" value="Line contains carriage return"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value=" \n"/>
+        <property name="message" value="Line has trailing whitespace"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\n\n\n"/>
+        <property name="message" value="Multiple consecutive blank lines"/>
+    </module>
+    <module name="RegexpMultiline">
+        <property name="format" value="\n\n\Z"/>
+        <property name="message" value="Blank line before end of file"/>
+    </module>
+
+    <module name="TreeWalker">
+        <module name="EmptyBlock">
+            <property name="option" value="text"/>
+            <property name="tokens" value="
+                LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
+                LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
+        </module>
+        <module name="EmptyStatement"/>
+        <module name="EmptyForInitializerPad"/>
+        <module name="EmptyForIteratorPad">
+            <property name="option" value="space"/>
+        </module>
+        <module name="MethodParamPad">
+            <property name="allowLineBreaks" value="true"/>
+            <property name="option" value="nospace"/>
+        </module>
+        <module name="ParenPad"/>
+        <module name="TypecastParenPad"/>
+        <module name="NeedBraces"/>
+        <module name="LeftCurly">
+            <property name="option" value="nl"/>
+            <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
+        </module>
+        <module name="LeftCurly">
+            <property name="option" value="eol"/>
+            <property name="tokens" value="
+                LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
+                LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
+        </module>
+        <module name="RightCurly">
+            <property name="option" value="alone"/>
+        </module>
+        <module name="GenericWhitespace"/>
+        <module name="WhitespaceAfter"/>
+        <module name="NoWhitespaceBefore"/>
+
+        <module name="UpperEll"/>
+        <module name="DefaultComesLast"/>
+        <module name="ArrayTypeStyle"/>
+        <module name="MultipleVariableDeclarations"/>
+        <module name="ModifierOrder"/>
+        <module name="OneStatementPerLine"/>
+        <module name="StringLiteralEquality"/>
+        <module name="MutableException"/>
+        <module name="EqualsHashCode"/>
+        <module name="InnerAssignment"/>
+        <module name="InterfaceIsType"/>
+        <module name="HideUtilityClassConstructor"/>
+
+        <module name="MemberName"/>
+        <module name="LocalVariableName"/>
+        <module name="LocalFinalVariableName"/>
+        <module name="TypeName"/>
+        <module name="PackageName"/>
+        <module name="ParameterName"/>
+        <module name="StaticVariableName"/>
+        <module name="ClassTypeParameterName">
+            <property name="format" value="^[A-Z][0-9]?$"/>
+        </module>
+        <module name="MethodTypeParameterName">
+            <property name="format" value="^[A-Z][0-9]?$"/>
+        </module>
+
+        <module name="WhitespaceAround">
+            <property name="allowEmptyConstructors" value="true"/>
+            <property name="allowEmptyMethods" value="true"/>
+            <property name="ignoreEnhancedForColon" value="false"/>
+            <property name="tokens" value="
+                ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
+                BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
+                LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
+                LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
+                LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
+                LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
+                PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
+                STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
+        </module>
+    </module>
+</module>
data/example/config.yml
ADDED
@@ -0,0 +1,35 @@
+hdfs_example: &hdfs_example
+  config_files:
+    - /etc/hadoop/conf/core-site.xml
+    - /etc/hadoop/conf/hdfs-site.xml
+  config:
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
+    fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+local_fs_example: &local_fs_example
+  config:
+    fs.defaultFS: 'file:///'
+    fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+
+in:
+  type: hdfs
+  <<: *local_fs_example
+  path: example/data.csv
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    header_line: true
+    columns:
+      - {name: id, type: long}
+      - {name: account, type: long}
+      - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
+      - {name: purchase, type: timestamp, format: '%Y%m%d'}
+      - {name: comment, type: string}
+
+out:
+  type: stdout
data/example/data.csv
ADDED
Binary file
data/gradle/wrapper/gradle-wrapper.properties
CHANGED
@@ -1,6 +1,6 @@
-#
+#Wed Jan 13 12:41:02 JST 2016
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-2.
+distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
CHANGED
@@ -24,18 +24,26 @@ import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
 import javax.annotation.Nullable;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.SequenceInputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 
-public class HdfsFileInputPlugin implements FileInputPlugin
+public class HdfsFileInputPlugin
+        implements FileInputPlugin
 {
     private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+    private static FileSystem fs;
 
-    public interface PluginTask extends Task
+    public interface PluginTask
+            extends Task
     {
         @Config("config_files")
         @ConfigDefault("[]")
@@ -60,7 +68,12 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
         public long getApproximateNumPartitions();
 
+        @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
+        @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
+        public int getSkipHeaderLines();
+
         public List<HdfsPartialFile> getFiles();
+
         public void setFiles(List<HdfsPartialFile> hdfsFiles);
 
         @ConfigInject
@@ -81,8 +94,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
                 throw new PathNotFoundException(pathString);
             }
 
+            logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
             task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
-            logger.info("embulk-input-hdfs: Loading target files: {}", originalFileList);
         }
         catch (IOException e) {
             logger.error(e.getMessage());
@@ -104,8 +117,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
 
     @Override
     public ConfigDiff resume(TaskSource taskSource,
-
-
+                             int taskCount,
+                             FileInputPlugin.Control control)
     {
         control.run(taskSource, taskCount);
 
@@ -127,8 +140,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
 
     @Override
     public void cleanup(TaskSource taskSource,
-
-
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
     {
     }
 
@@ -138,15 +151,22 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         final PluginTask task = taskSource.loadTask(PluginTask.class);
 
         InputStream input;
+        final HdfsPartialFile file = task.getFiles().get(taskIndex);
         try {
-
+            if (file.getStart() > 0 && task.getSkipHeaderLines() > 0) {
+                input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
+            }
+            else {
+                input = openInputStream(task, file);
+            }
         }
         catch (IOException e) {
             logger.error(e.getMessage());
             throw new RuntimeException(e);
         }
 
-        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
+        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
+        {
             @Override
             public void abort()
             { }
@@ -159,6 +179,42 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         };
     }
 
+    private InputStream getHeadersInputStream(PluginTask task, HdfsPartialFile partialFile)
+            throws IOException
+    {
+        FileSystem fs = getFs(task);
+        ByteArrayOutputStream header = new ByteArrayOutputStream();
+        int skippedHeaders = 0;
+
+        try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
+            while (true) {
+                int c = in.read();
+                if (c < 0) {
+                    break;
+                }
+
+                header.write(c);
+
+                if (c == '\n') {
+                    skippedHeaders++;
+                }
+                else if (c == '\r') {
+                    int c2 = in.read();
+                    if (c2 == '\n') {
+                        header.write(c2);
+                    }
+                    skippedHeaders++;
+                }
+
+                if (skippedHeaders >= task.getSkipHeaderLines()) {
+                    break;
+                }
+            }
+        }
+        header.close();
+        return new ByteArrayInputStream(header.toByteArray());
+    }
+
     private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
             throws IOException
     {
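The two hunks above implement skip_header_lines for split files: open() checks whether a task's chunk starts mid-file (file.getStart() > 0) and, if so, prepends the header lines re-read by getHeadersInputStream using SequenceInputStream, so the parser of every partition still sees the header row. A minimal, self-contained sketch of the same mechanism over plain byte arrays instead of HDFS (class and variable names here are illustrative, not from the plugin):

```
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.StandardCharsets;

public class HeaderPrependDemo
{
    public static void main(String[] args) throws IOException
    {
        byte[] file = "id,comment\n1,embulk\n2,embulk jruby\n".getBytes(StandardCharsets.UTF_8);

        // Header bytes: everything up to and including the first '\n'
        // (skip_header_lines = 1 in the plugin's terms).
        int headerEnd = 0;
        while (file[headerEnd++] != '\n') { }
        InputStream headers = new ByteArrayInputStream(file, 0, headerEnd);

        // A later partition starting mid-file, here at the second record.
        int start = new String(file, StandardCharsets.UTF_8).indexOf("2,");
        InputStream partial = new ByteArrayInputStream(file, start, file.length - start);

        // SequenceInputStream replays the header before the partition body.
        InputStream input = new SequenceInputStream(headers, partial);
        int c;
        while ((c = input.read()) >= 0) {
            System.out.print((char) c); // prints: id,comment\n2,embulk jruby\n
        }
    }
}
```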
@@ -168,6 +224,18 @@ public class HdfsFileInputPlugin implements FileInputPlugin
     }
 
     private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        if (fs == null) {
+            setFs(task);
+            return fs;
+        }
+        else {
+            return fs;
+        }
+    }
+
+    private static FileSystem setFs(final PluginTask task)
             throws IOException
     {
         Configuration configuration = new Configuration();
@@ -177,18 +245,25 @@ public class HdfsFileInputPlugin implements FileInputPlugin
             configuration.addResource(file.toURI().toURL());
         }
 
-        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
+        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
             configuration.set(entry.getKey(), entry.getValue());
         }
 
-
+        // For debug
+        for (Map.Entry<String, String> entry : configuration) {
+            logger.trace("{}: {}", entry.getKey(), entry.getValue());
+        }
+        logger.debug("Resource Files: {}", configuration);
+
+        fs = FileSystem.get(configuration);
+        return fs;
     }
 
-    private String strftime(final String raw, final int
+    private String strftime(final String raw, final int rewindSeconds)
     {
         ScriptingContainer jruby = new ScriptingContainer();
         Object resolved = jruby.runScriptlet(
-            String.format("(Time.now - %s).strftime('%s')", String.valueOf(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
         return resolved.toString();
     }
 
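For context, the strftime helper completed in this hunk resolves Ruby date placeholders in path by evaluating a one-line Ruby script in embedded JRuby, formatting Time.now minus rewind_seconds. A hedged sketch of the same call, assuming a JRuby embed jar (for example org.jruby:jruby-complete) on the classpath; the path pattern below is illustrative:

```
import org.jruby.embed.ScriptingContainer;

public class StrftimeDemo
{
    public static void main(String[] args)
    {
        // Resolve "/logs/%Y-%m-%d" against (now - 86400s), i.e. yesterday,
        // mirroring the plugin's rewind_seconds behaviour.
        ScriptingContainer jruby = new ScriptingContainer();
        Object resolved = jruby.runScriptlet(
                String.format("(Time.now - %s).strftime('%s')", 86400, "/logs/%Y-%m-%d"));
        System.out.println(resolved); // e.g. /logs/2016-02-07
    }
}
```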
@@ -255,6 +330,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         long approximateNumPartitions =
                 (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
         long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
+        if (partitionSizeByOneTask <= 0) {
+            partitionSizeByOneTask = 1;
+        }
 
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         for (Path path : pathList) {
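The three added lines guard an integer-division edge case: if the total length of the input files is smaller than the requested partition count, totalFileLength / approximateNumPartitions truncates to zero, and a zero partition size would break the later per-file partition-count math. Clamping the quotient to 1 is the fix. The arithmetic in isolation (a standalone sketch, not plugin code):

```
public class PartitionSizeDemo
{
    public static void main(String[] args)
    {
        long totalFileLength = 5;          // tiny input, e.g. a few CSV rows
        long approximateNumPartitions = 8; // e.g. availableProcessors()

        long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
        System.out.println(partitionSizeByOneTask); // 0, long division truncates

        // The 0.1.9 fix: never let the partition size reach zero.
        if (partitionSizeByOneTask <= 0) {
            partitionSizeByOneTask = 1;
        }
        System.out.println(partitionSizeByOneTask); // 1
    }
}
```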
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
CHANGED
@@ -23,7 +23,8 @@ public class HdfsFilePartitioner
         this.numPartitions = numPartitions;
     }
 
-    public List<HdfsPartialFile> getHdfsPartialFiles()
+    public List<HdfsPartialFile> getHdfsPartialFiles()
+            throws IOException
     {
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         long size = fs.getFileStatus(path).getLen();
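HdfsFilePartitioner, which now declares the IOException that fs.getFileStatus() can throw, slices each file's byte range [0, size) into numPartitions contiguous [start, end) ranges; exact record alignment is deferred to HdfsPartialFileInputStream below. A sketch of one way to do such an even split (assumed arithmetic for illustration, not the plugin's exact code):

```
import java.util.ArrayList;
import java.util.List;

public class EvenSplitDemo
{
    // Split [0, size) into up to numPartitions contiguous byte ranges.
    static List<long[]> split(long size, int numPartitions)
    {
        List<long[]> ranges = new ArrayList<>();
        for (int i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) {
                ranges.add(new long[] {start, end});
            }
        }
        return ranges;
    }

    public static void main(String[] args)
    {
        for (long[] r : split(100, 3)) {
            System.out.println(r[0] + "-" + r[1]); // 0-33, 33-66, 66-100
        }
    }
}
```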
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
CHANGED
@@ -1,7 +1,5 @@
 package org.embulk.input.hdfs;
 
-import org.apache.hadoop.fs.Path;
-
 /**
  * Created by takahiro.nakayama on 8/20/15.
  */
@@ -20,7 +18,9 @@ public class HdfsPartialFile
     }
 
     // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
-    public HdfsPartialFile()
+    public HdfsPartialFile()
+    {
+    }
 
     public String getPath()
     {
@@ -36,5 +36,4 @@ public class HdfsPartialFile
     {
         return end;
     }
-
 }
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
CHANGED
@@ -6,7 +6,8 @@ import java.io.InputStream;
 import java.io.PushbackInputStream;
 
 // ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
-public class HdfsPartialFileInputStream
+public class HdfsPartialFileInputStream
+        extends InputStream
 {
     private final PushbackInputStream original;
     private long start;
@@ -23,13 +24,15 @@ public class HdfsPartialFileInputStream extends InputStream
     }
 
     @Override
-    public int read(byte[] b)
+    public int read(byte[] b)
+            throws IOException
     {
         return read(b, 0, b.length);
     }
 
     @Override
-    public int read(byte[] b, int off, int len)
+    public int read(byte[] b, int off, int len)
+            throws IOException
     {
         initializeIfNeeded();
 
@@ -45,7 +48,7 @@ public class HdfsPartialFileInputStream extends InputStream
 
         current += read;
         if (current >= end) {
-            for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
+            for (int i = Math.max((int) (end - 1 - current + read), 0); i < read; i++) {
                 if (b[off + i] == '\n') {
                     eof = true;
                     return i + 1;
@@ -65,7 +68,8 @@ public class HdfsPartialFileInputStream extends InputStream
     }
 
     @Override
-    public int read()
+    public int read()
+            throws IOException
     {
         initializeIfNeeded();
 
@@ -91,7 +95,8 @@ public class HdfsPartialFileInputStream extends InputStream
     }
 
     @Override
-    public long skip(long n)
+    public long skip(long n)
+            throws IOException
     {
         throw new IOException("Skip not supported.");
         /*
@@ -102,18 +107,21 @@ public class HdfsPartialFileInputStream extends InputStream
     }
 
     @Override
-    public int available()
+    public int available()
+            throws IOException
     {
         return 0;
     }
 
     @Override
-    public void close()
+    public void close()
+            throws IOException
     {
         original.close();
     }
 
-    private void initializeIfNeeded()
+    private void initializeIfNeeded()
+            throws IOException
     {
         if (current >= start) {
             return;
@@ -144,7 +152,8 @@ public class HdfsPartialFileInputStream extends InputStream
         }
     }
 
-    private int prefetch()
+    private int prefetch()
+            throws IOException
     {
         int c = original.read();
         if (c >= 0) {
@@ -152,4 +161,4 @@ public class HdfsPartialFileInputStream extends InputStream
         }
         return c;
     }
-}
+}
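The read methods above are what make arbitrary byte-range splits safe for line-oriented data: a chunk that starts mid-file first skips forward to the next line break (initializeIfNeeded), and a chunk keeps reading past its nominal end until it completes the current line. A compact sketch of that snap-to-newline rule over an in-memory buffer (illustrative helper, not the plugin class):

```
import java.nio.charset.StandardCharsets;

public class SnapToNewlineDemo
{
    // Return the text a partition [start, end) would emit: skip to the first
    // line break after start (unless start == 0), then read through the first
    // line break at or after end.
    static String readPartition(byte[] data, int start, int end)
    {
        int from = start;
        if (start > 0) {
            while (from < data.length && data[from - 1] != '\n') {
                from++;
            }
        }
        int to = end;
        while (to < data.length && data[to - 1] != '\n') {
            to++;
        }
        return new String(data, from, to - from, StandardCharsets.UTF_8);
    }

    public static void main(String[] args)
    {
        byte[] csv = "1,a\n2,bb\n3,ccc\n".getBytes(StandardCharsets.UTF_8);
        // A naive split at byte 7 lands inside "2,bb"; snapping keeps rows whole.
        System.out.print(readPartition(csv, 0, 7));  // 1,a\n2,bb\n
        System.out.print(readPartition(csv, 7, 15)); // 3,ccc\n
    }
}
```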
data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
CHANGED
@@ -1,5 +1,232 @@
 package org.embulk.input.hdfs;
 
+import com.google.common.base.Function;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.hadoop.fs.Path;
+import org.embulk.EmbulkTestRuntime;
+import org.embulk.config.ConfigException;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.TaskReport;
+import org.embulk.config.TaskSource;
+import org.embulk.input.hdfs.HdfsFileInputPlugin.PluginTask;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileInputPlugin;
+import org.embulk.spi.FileInputRunner;
+import org.embulk.spi.InputPlugin;
+import org.embulk.spi.Schema;
+import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
+import org.embulk.spi.util.Pages;
+import org.embulk.standards.CsvParserPlugin;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+
+import javax.annotation.Nullable;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
 public class TestHdfsFileInputPlugin
 {
+    @Rule
+    public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
+
+    @Rule
+    public ExpectedException exception = ExpectedException.none();
+
+    private Logger logger = runtime.getExec().getLogger(TestHdfsFileInputPlugin.class);
+    private HdfsFileInputPlugin plugin;
+    private FileInputRunner runner;
+    private MockPageOutput output;
+    private Path path;
+
+    @Before
+    public void createResources()
+    {
+        plugin = new HdfsFileInputPlugin();
+        runner = new FileInputRunner(runtime.getInstance(HdfsFileInputPlugin.class));
+        output = new MockPageOutput();
+        path = new Path(new File(getClass().getResource("/sample_01.csv").getPath()).getParent());
+    }
+
+    @Test
+    public void testDefaultValues()
+    {
+        ConfigSource config = Exec.newConfigSource()
+                .set("path", path.toString());
+        PluginTask task = config.loadConfig(PluginTask.class);
+        assertEquals(path.toString(), task.getPath());
+        assertEquals(Lists.newArrayList(), task.getConfigFiles());
+        assertEquals(Maps.newHashMap(), task.getConfig());
+        assertEquals(true, task.getPartition());
+        assertEquals(0, task.getRewindSeconds());
+        assertEquals(-1, task.getApproximateNumPartitions());
+    }
+
+    @Test(expected = ConfigException.class)
+    public void testRequiredValues()
+    {
+        ConfigSource config = Exec.newConfigSource();
+        PluginTask task = config.loadConfig(PluginTask.class);
+    }
+
+    @Test
+    public void testFileList()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("num_partitions", 1);
+        plugin.transaction(config, new FileInputPlugin.Control()
+        {
+            @Override
+            public List<TaskReport> run(TaskSource taskSource, int taskCount)
+            {
+                PluginTask task = taskSource.loadTask(PluginTask.class);
+                List<String> fileList = Lists.transform(Lists.newArrayList(new File(path.toString()).list()), new Function<String, String>()
+                {
+                    @Nullable
+                    @Override
+                    public String apply(@Nullable String input)
+                    {
+                        return new File(path.toString() + "/" + input).toURI().toString();
+                    }
+                });
+
+                List<String> resultFList = Lists.transform(task.getFiles(), new Function<HdfsPartialFile, String>()
+                {
+                    @Nullable
+                    @Override
+                    public String apply(@Nullable HdfsPartialFile input)
+                    {
+                        assert input != null;
+                        return input.getPath();
+                    }
+                });
+                assertEquals(fileList, resultFList);
+                return emptyTaskReports(taskCount);
+            }
+        });
+    }
+
+    @Test
+    public void testHdfsFileInputByOpen()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("num_partitions", 10);
+        runner.transaction(config, new Control());
+        assertRecords(config, output);
+    }
+
+    @Test
+    public void testHdfsFileInputByOpenWithoutPartition()
+    {
+        ConfigSource config = getConfigWithDefaultValues();
+        config.set("partition", false);
+        runner.transaction(config, new Control());
+        assertRecords(config, output);
+    }
+
+    private class Control
+            implements InputPlugin.Control
+    {
+        @Override
+        public List<TaskReport> run(TaskSource taskSource, Schema schema, int taskCount)
+        {
+            List<TaskReport> reports = new ArrayList<>();
+            for (int i = 0; i < taskCount; i++) {
+                reports.add(runner.run(taskSource, schema, i, output));
+            }
+            return reports;
+        }
+    }
+
+    private ConfigSource getConfigWithDefaultValues()
+    {
+        return Exec.newConfigSource()
+                .set("path", path.toString())
+                .set("config", hdfsLocalFSConfig())
+                .set("skip_header_lines", 1)
+                .set("parser", parserConfig(schemaConfig()));
+    }
+
+    static List<TaskReport> emptyTaskReports(int taskCount)
+    {
+        ImmutableList.Builder<TaskReport> reports = new ImmutableList.Builder<>();
+        for (int i = 0; i < taskCount; i++) {
+            reports.add(Exec.newTaskReport());
+        }
+        return reports.build();
+    }
+
+    private ImmutableMap<String, Object> hdfsLocalFSConfig()
+    {
+        ImmutableMap.Builder<String, Object> builder = ImmutableMap.builder();
+        builder.put("fs.hdfs.impl", "org.apache.hadoop.fs.LocalFileSystem");
+        builder.put("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
+        builder.put("fs.defaultFS", "file:///");
+        return builder.build();
+    }
+
+    private ImmutableMap<String, Object> parserConfig(ImmutableList<Object> schemaConfig)
+    {
+        ImmutableMap.Builder<String, Object> builder = new ImmutableMap.Builder<>();
+        builder.put("type", "csv");
+        builder.put("newline", "CRLF");
+        builder.put("delimiter", ",");
+        builder.put("quote", "\"");
+        builder.put("escape", "\"");
+        builder.put("trim_if_not_quoted", false);
+        builder.put("skip_header_lines", 1);
+        builder.put("allow_extra_columns", false);
+        builder.put("allow_optional_columns", false);
+        builder.put("columns", schemaConfig);
+        return builder.build();
+    }
+
+    private ImmutableList<Object> schemaConfig()
+    {
+        ImmutableList.Builder<Object> builder = new ImmutableList.Builder<>();
+        builder.add(ImmutableMap.of("name", "id", "type", "long"));
+        builder.add(ImmutableMap.of("name", "account", "type", "long"));
+        builder.add(ImmutableMap.of("name", "time", "type", "timestamp", "format", "%Y-%m-%d %H:%M:%S"));
+        builder.add(ImmutableMap.of("name", "purchase", "type", "timestamp", "format", "%Y%m%d"));
+        builder.add(ImmutableMap.of("name", "comment", "type", "string"));
+        return builder.build();
+    }
+
+    private void assertRecords(ConfigSource config, MockPageOutput output)
+    {
+        List<Object[]> records = getRecords(config, output);
+        assertEquals(8, records.size());
+        {
+            Object[] record = records.get(0);
+            assertEquals(1L, record[0]);
+            assertEquals(32864L, record[1]);
+            assertEquals("2015-01-27 19:23:49 UTC", record[2].toString());
+            assertEquals("2015-01-27 00:00:00 UTC", record[3].toString());
+            assertEquals("embulk", record[4]);
+        }
+
+        {
+            Object[] record = records.get(1);
+            assertEquals(2L, record[0]);
+            assertEquals(14824L, record[1]);
+            assertEquals("2015-01-27 19:01:23 UTC", record[2].toString());
+            assertEquals("2015-01-27 00:00:00 UTC", record[3].toString());
+            assertEquals("embulk jruby", record[4]);
+        }
+    }
+
+    private List<Object[]> getRecords(ConfigSource config, MockPageOutput output)
+    {
+        Schema schema = config.getNested("parser").loadConfig(CsvParserPlugin.PluginTask.class).getSchemaConfig().toSchema();
+        return Pages.toObjects(schema, output.pages);
+    }
 }
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.9
 platform: ruby
 authors:
--
+- Civitaspo
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -46,9 +46,14 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
+- .travis.yml
 - LICENSE.txt
 - README.md
 - build.gradle
+- config/checkstyle/checkstyle.xml
+- config/checkstyle/default.xml
+- example/config.yml
+- example/data.csv
 - gradle/wrapper/gradle-wrapper.jar
 - gradle/wrapper/gradle-wrapper.properties
 - gradlew
@@ -59,6 +64,8 @@ files:
 - src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
 - src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
 - src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
+- src/test/resources/sample_01.csv
+- src/test/resources/sample_02.csv
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -69,7 +76,7 @@ files:
 - classpath/commons-beanutils-1.7.0.jar
 - classpath/commons-cli-1.2.jar
 - classpath/commons-codec-1.6.jar
-- classpath/commons-collections-3.2.
+- classpath/commons-collections-3.2.2.jar
 - classpath/commons-compress-1.4.1.jar
 - classpath/commons-configuration-1.6.jar
 - classpath/commons-digester-1.8.jar
@@ -82,23 +89,23 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.
+- classpath/embulk-input-hdfs-0.1.9.jar
 - classpath/gson-2.2.4.jar
-- classpath/hadoop-annotations-2.6.
-- classpath/hadoop-auth-2.6.
-- classpath/hadoop-client-2.6.
-- classpath/hadoop-common-2.6.
-- classpath/hadoop-hdfs-2.6.
-- classpath/hadoop-mapreduce-client-app-2.6.
-- classpath/hadoop-mapreduce-client-common-2.6.
-- classpath/hadoop-mapreduce-client-core-2.6.
-- classpath/hadoop-mapreduce-client-jobclient-2.6.
-- classpath/hadoop-mapreduce-client-shuffle-2.6.
-- classpath/hadoop-yarn-api-2.6.
-- classpath/hadoop-yarn-client-2.6.
-- classpath/hadoop-yarn-common-2.6.
-- classpath/hadoop-yarn-server-common-2.6.
-- classpath/hadoop-yarn-server-nodemanager-2.6.
+- classpath/hadoop-annotations-2.6.3.jar
+- classpath/hadoop-auth-2.6.3.jar
+- classpath/hadoop-client-2.6.3.jar
+- classpath/hadoop-common-2.6.3.jar
+- classpath/hadoop-hdfs-2.6.3.jar
+- classpath/hadoop-mapreduce-client-app-2.6.3.jar
+- classpath/hadoop-mapreduce-client-common-2.6.3.jar
+- classpath/hadoop-mapreduce-client-core-2.6.3.jar
+- classpath/hadoop-mapreduce-client-jobclient-2.6.3.jar
+- classpath/hadoop-mapreduce-client-shuffle-2.6.3.jar
+- classpath/hadoop-yarn-api-2.6.3.jar
+- classpath/hadoop-yarn-client-2.6.3.jar
+- classpath/hadoop-yarn-common-2.6.3.jar
+- classpath/hadoop-yarn-server-common-2.6.3.jar
+- classpath/hadoop-yarn-server-nodemanager-2.6.3.jar
 - classpath/htrace-core-3.0.4.jar
 - classpath/httpclient-4.2.5.jar
 - classpath/httpcore-4.2.4.jar
(Deleted 2.6.0 classpath jars are binary files; no textual diff.)