embulk-decoder-commons-compress 0.3.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -1
- data/README.md +9 -4
- data/build.gradle +6 -6
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +3 -3
- data/gradlew +43 -35
- data/gradlew.bat +4 -10
- data/src/integration-test/java/org/embulk/filter/TestIntegration.java +14 -0
- data/src/integration-test/resources/config_concatenated_bzip2.yml +0 -1
- data/src/integration-test/resources/config_concatenated_gz.yml +0 -1
- data/src/integration-test/resources/config_no_concatenated_bzip2.yml +27 -0
- data/src/integration-test/resources/config_no_concatenated_gzip.yml +27 -0
- data/src/main/java/org/embulk/decoder/ArchiveInputStreamIterator.java +19 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java +8 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +11 -8
- data/src/test/java/org/embulk/decoder/TestArchiveInputStreamIterator.java +15 -0
- data/src/test/java/org/embulk/decoder/TestCommonsCompressDecoderPlugin.java +24 -0
- data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java +16 -0
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5f9220df5b031a29034546701f855a47ec8dd58
|
4
|
+
data.tar.gz: 096289231bfcfc5a70e67faf28bb5252a2f308c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd02b792a6d97303e70ba516a47ac2d49406426e6adcfa65be63d09accf89068a1d030967a7e0e3a013e4598257cbf9526aa3b5cb75ea66c519f150393026370
|
7
|
+
data.tar.gz: bc60b5fc44b7e7b9802e23f4868dffcb72d779f0233f36c39f2ab75934480a4e71109b82008e177f2c89935fb63b31450a1734ba5cb7b23e1d2bbfdc40979f6a
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -17,6 +17,8 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
|
|
17
17
|
- The format type is one of the formats supported by [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/).
|
18
18
|
- Auto detection is used when no format is configured. It can be used for a single format. If a file uses solid compression like tar.gz, please set the format config explicitly.
|
19
19
|
- Some formats listed in [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) may not work in your environment. I have confirmed that the following formats work well. Your environment may be able to use other formats listed on the site.
|
20
|
+
- **decompress_concatenated**: gzip, bzip2, and xz formats support multiple concatenated streams. The default value of this parameter is true. To disable it, set it to false. See [CompressorStreamFactory.setDecompressConcatenated()](https://commons.apache.org/proper/commons-compress/apidocs/org/apache/commons/compress/compressors/CompressorStreamFactory.html#setDecompressConcatenated(boolean)) in ver.1.9 for more details.
|
21
|
+
- **match_name**: Only the files in an archive whose names match match_name are processed. match_name is a regular expression.
|
20
22
|
|
21
23
|
## Formats
|
22
24
|
|
@@ -29,7 +31,6 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
|
|
29
31
|
- tbz, tbz2, tb2, tar.bz2
|
30
32
|
- taz, tz, tar.Z
|
31
33
|
|
32
|
-
If input files are concatenated gzip or bzip2 format, please set format parameter explicitly.
|
33
34
|
|
34
35
|
## Example
|
35
36
|
|
@@ -62,24 +63,28 @@ in:
|
|
62
63
|
format: tgz
|
63
64
|
```
|
64
65
|
|
65
|
-
- Set
|
66
|
+
- Set decompress_concatenated to false if you would like to read the first concatenated gzip/bzip2 archive only.
|
67
|
+
|
66
68
|
```yaml
|
67
69
|
in:
|
68
70
|
type: any input plugin type
|
69
71
|
decoders:
|
70
72
|
- type: commons-compress
|
71
|
-
|
73
|
+
decompress_concatenated: false
|
72
74
|
```
|
73
75
|
|
76
|
+
- Set match_name to extract only the files whose suffix is '.csv' from an archive.
|
77
|
+
|
74
78
|
```yaml
|
75
79
|
in:
|
76
80
|
type: any input plugin type
|
77
81
|
decoders:
|
78
82
|
- type: commons-compress
|
79
|
-
|
83
|
+
match_name: ".*\\.csv"
|
80
84
|
```
|
81
85
|
|
82
86
|
|
87
|
+
|
83
88
|
## Build
|
84
89
|
|
85
90
|
```
|
data/build.gradle
CHANGED
@@ -7,8 +7,8 @@ import com.github.jrubygradle.JRubyExec
|
|
7
7
|
|
8
8
|
apply from: 'https://raw.githubusercontent.com/hata/gradle-plugins/master/embulk-integration-test.gradle'
|
9
9
|
|
10
|
-
sourceCompatibility = '1.
|
11
|
-
targetCompatibility = '1.
|
10
|
+
sourceCompatibility = '1.8'
|
11
|
+
targetCompatibility = '1.8'
|
12
12
|
|
13
13
|
repositories {
|
14
14
|
mavenCentral()
|
@@ -18,12 +18,12 @@ configurations {
|
|
18
18
|
provided
|
19
19
|
}
|
20
20
|
|
21
|
-
version = "0.
|
21
|
+
version = "0.5.0"
|
22
22
|
|
23
23
|
dependencies {
|
24
|
-
compile "org.embulk:embulk-core:0.
|
25
|
-
compile "org.apache.commons:commons-compress:1.
|
26
|
-
provided "org.embulk:embulk-core:0.
|
24
|
+
compile "org.embulk:embulk-core:0.9.23"
|
25
|
+
compile "org.apache.commons:commons-compress:1.20"
|
26
|
+
provided "org.embulk:embulk-core:0.9.23"
|
27
27
|
// compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
|
28
28
|
testCompile "org.jmockit:jmockit:1.15"
|
29
29
|
testCompile "junit:junit:4.+"
|
Binary file
|
@@ -1,6 +1,6 @@
|
|
1
|
-
#
|
1
|
+
#Fri Jun 05 07:10:25 JST 2020
|
2
|
+
distributionUrl=https\://services.gradle.org/distributions/gradle-4.8.1-all.zip
|
2
3
|
distributionBase=GRADLE_USER_HOME
|
3
4
|
distributionPath=wrapper/dists
|
4
|
-
zipStoreBase=GRADLE_USER_HOME
|
5
5
|
zipStorePath=wrapper/dists
|
6
|
-
|
6
|
+
zipStoreBase=GRADLE_USER_HOME
|
data/gradlew
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env sh
|
2
2
|
|
3
3
|
##############################################################################
|
4
4
|
##
|
@@ -6,20 +6,38 @@
|
|
6
6
|
##
|
7
7
|
##############################################################################
|
8
8
|
|
9
|
-
#
|
10
|
-
|
9
|
+
# Attempt to set APP_HOME
|
10
|
+
# Resolve links: $0 may be a link
|
11
|
+
PRG="$0"
|
12
|
+
# Need this for relative symlinks.
|
13
|
+
while [ -h "$PRG" ] ; do
|
14
|
+
ls=`ls -ld "$PRG"`
|
15
|
+
link=`expr "$ls" : '.*-> \(.*\)$'`
|
16
|
+
if expr "$link" : '/.*' > /dev/null; then
|
17
|
+
PRG="$link"
|
18
|
+
else
|
19
|
+
PRG=`dirname "$PRG"`"/$link"
|
20
|
+
fi
|
21
|
+
done
|
22
|
+
SAVED="`pwd`"
|
23
|
+
cd "`dirname \"$PRG\"`/" >/dev/null
|
24
|
+
APP_HOME="`pwd -P`"
|
25
|
+
cd "$SAVED" >/dev/null
|
11
26
|
|
12
27
|
APP_NAME="Gradle"
|
13
28
|
APP_BASE_NAME=`basename "$0"`
|
14
29
|
|
30
|
+
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
31
|
+
DEFAULT_JVM_OPTS=""
|
32
|
+
|
15
33
|
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
16
34
|
MAX_FD="maximum"
|
17
35
|
|
18
|
-
warn (
|
36
|
+
warn () {
|
19
37
|
echo "$*"
|
20
38
|
}
|
21
39
|
|
22
|
-
die (
|
40
|
+
die () {
|
23
41
|
echo
|
24
42
|
echo "$*"
|
25
43
|
echo
|
@@ -30,6 +48,7 @@ die ( ) {
|
|
30
48
|
cygwin=false
|
31
49
|
msys=false
|
32
50
|
darwin=false
|
51
|
+
nonstop=false
|
33
52
|
case "`uname`" in
|
34
53
|
CYGWIN* )
|
35
54
|
cygwin=true
|
@@ -40,31 +59,11 @@ case "`uname`" in
|
|
40
59
|
MINGW* )
|
41
60
|
msys=true
|
42
61
|
;;
|
62
|
+
NONSTOP* )
|
63
|
+
nonstop=true
|
64
|
+
;;
|
43
65
|
esac
|
44
66
|
|
45
|
-
# For Cygwin, ensure paths are in UNIX format before anything is touched.
|
46
|
-
if $cygwin ; then
|
47
|
-
[ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
|
48
|
-
fi
|
49
|
-
|
50
|
-
# Attempt to set APP_HOME
|
51
|
-
# Resolve links: $0 may be a link
|
52
|
-
PRG="$0"
|
53
|
-
# Need this for relative symlinks.
|
54
|
-
while [ -h "$PRG" ] ; do
|
55
|
-
ls=`ls -ld "$PRG"`
|
56
|
-
link=`expr "$ls" : '.*-> \(.*\)$'`
|
57
|
-
if expr "$link" : '/.*' > /dev/null; then
|
58
|
-
PRG="$link"
|
59
|
-
else
|
60
|
-
PRG=`dirname "$PRG"`"/$link"
|
61
|
-
fi
|
62
|
-
done
|
63
|
-
SAVED="`pwd`"
|
64
|
-
cd "`dirname \"$PRG\"`/" >&-
|
65
|
-
APP_HOME="`pwd -P`"
|
66
|
-
cd "$SAVED" >&-
|
67
|
-
|
68
67
|
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
69
68
|
|
70
69
|
# Determine the Java command to use to start the JVM.
|
@@ -90,7 +89,7 @@ location of your Java installation."
|
|
90
89
|
fi
|
91
90
|
|
92
91
|
# Increase the maximum file descriptors if we can.
|
93
|
-
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
|
92
|
+
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
94
93
|
MAX_FD_LIMIT=`ulimit -H -n`
|
95
94
|
if [ $? -eq 0 ] ; then
|
96
95
|
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
@@ -114,6 +113,7 @@ fi
|
|
114
113
|
if $cygwin ; then
|
115
114
|
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
116
115
|
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
116
|
+
JAVACMD=`cygpath --unix "$JAVACMD"`
|
117
117
|
|
118
118
|
# We build the pattern for arguments to be converted via cygpath
|
119
119
|
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
@@ -154,11 +154,19 @@ if $cygwin ; then
|
|
154
154
|
esac
|
155
155
|
fi
|
156
156
|
|
157
|
-
#
|
158
|
-
|
159
|
-
|
157
|
+
# Escape application args
|
158
|
+
save () {
|
159
|
+
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
160
|
+
echo " "
|
160
161
|
}
|
161
|
-
|
162
|
-
|
162
|
+
APP_ARGS=$(save "$@")
|
163
|
+
|
164
|
+
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
165
|
+
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
166
|
+
|
167
|
+
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
|
168
|
+
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
|
169
|
+
cd "$(dirname "$0")"
|
170
|
+
fi
|
163
171
|
|
164
|
-
exec "$JAVACMD" "
|
172
|
+
exec "$JAVACMD" "$@"
|
data/gradlew.bat
CHANGED
@@ -8,14 +8,14 @@
|
|
8
8
|
@rem Set local scope for the variables with windows NT shell
|
9
9
|
if "%OS%"=="Windows_NT" setlocal
|
10
10
|
|
11
|
-
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
12
|
-
set DEFAULT_JVM_OPTS=
|
13
|
-
|
14
11
|
set DIRNAME=%~dp0
|
15
12
|
if "%DIRNAME%" == "" set DIRNAME=.
|
16
13
|
set APP_BASE_NAME=%~n0
|
17
14
|
set APP_HOME=%DIRNAME%
|
18
15
|
|
16
|
+
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
+
set DEFAULT_JVM_OPTS=
|
18
|
+
|
19
19
|
@rem Find java.exe
|
20
20
|
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
21
|
|
@@ -46,10 +46,9 @@ echo location of your Java installation.
|
|
46
46
|
goto fail
|
47
47
|
|
48
48
|
:init
|
49
|
-
@rem Get command-line arguments, handling
|
49
|
+
@rem Get command-line arguments, handling Windows variants
|
50
50
|
|
51
51
|
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
-
if "%@eval[2+2]" == "4" goto 4NT_args
|
53
52
|
|
54
53
|
:win9xME_args
|
55
54
|
@rem Slurp the command line arguments.
|
@@ -60,11 +59,6 @@ set _SKIP=2
|
|
60
59
|
if "x%~1" == "x" goto execute
|
61
60
|
|
62
61
|
set CMD_LINE_ARGS=%*
|
63
|
-
goto execute
|
64
|
-
|
65
|
-
:4NT_args
|
66
|
-
@rem Get arguments from the 4NT Shell from JP Software
|
67
|
-
set CMD_LINE_ARGS=%$
|
68
62
|
|
69
63
|
:execute
|
70
64
|
@rem Setup the command line
|
@@ -111,6 +111,20 @@ public class TestIntegration {
|
|
111
111
|
getChecksumFromFiles("result_concatenated_bzip2_000.00.csv"));
|
112
112
|
}
|
113
113
|
|
114
|
+
@Test
|
115
|
+
public void testNoConcatenatedGzip() throws Exception {
|
116
|
+
assertEquals("Verify input and output contents are identical.",
|
117
|
+
getChecksumFromFiles(SAMPLE_1_SRC_FILES),
|
118
|
+
getChecksumFromFiles("result_no_concatenated_gzip_000.00.csv"));
|
119
|
+
}
|
120
|
+
|
121
|
+
@Test
|
122
|
+
public void testNoConcatenatedBzip2() throws Exception {
|
123
|
+
assertEquals("Verify input and output contents are identical.",
|
124
|
+
getChecksumFromFiles(SAMPLE_1_SRC_FILES),
|
125
|
+
getChecksumFromFiles("result_no_concatenated_bzip2_000.00.csv"));
|
126
|
+
}
|
127
|
+
|
114
128
|
private long getChecksumFromFiles(String ... files) throws IOException {
|
115
129
|
Checksum cksum = new CRC32();
|
116
130
|
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: ./concatenated.csv.bz2
|
4
|
+
decoders:
|
5
|
+
- type: commons-compress
|
6
|
+
decompress_concatenated: false
|
7
|
+
parser:
|
8
|
+
charset: UTF-8
|
9
|
+
newline: CRLF
|
10
|
+
type: csv
|
11
|
+
delimiter: ','
|
12
|
+
quote: '"'
|
13
|
+
trim_if_not_quoted: false
|
14
|
+
skip_header_lines: 0
|
15
|
+
allow_extra_columns: false
|
16
|
+
allow_optional_columns: false
|
17
|
+
columns:
|
18
|
+
- {name: id, type: long}
|
19
|
+
- {name: comment, type: string}
|
20
|
+
out:
|
21
|
+
type: file
|
22
|
+
path_prefix: ./result_no_concatenated_bzip2_
|
23
|
+
file_ext: csv
|
24
|
+
formatter:
|
25
|
+
type: csv
|
26
|
+
quote_policy: MINIMAL
|
27
|
+
newline: LF
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: ./concatenated.csv.gz
|
4
|
+
decoders:
|
5
|
+
- type: commons-compress
|
6
|
+
decompress_concatenated: false
|
7
|
+
parser:
|
8
|
+
charset: UTF-8
|
9
|
+
newline: CRLF
|
10
|
+
type: csv
|
11
|
+
delimiter: ','
|
12
|
+
quote: '"'
|
13
|
+
trim_if_not_quoted: false
|
14
|
+
skip_header_lines: 0
|
15
|
+
allow_extra_columns: false
|
16
|
+
allow_optional_columns: false
|
17
|
+
columns:
|
18
|
+
- {name: id, type: long}
|
19
|
+
- {name: comment, type: string}
|
20
|
+
out:
|
21
|
+
type: file
|
22
|
+
path_prefix: ./result_no_concatenated_gzip_
|
23
|
+
file_ext: csv
|
24
|
+
formatter:
|
25
|
+
type: csv
|
26
|
+
quote_policy: MINIMAL
|
27
|
+
newline: LF
|
@@ -10,6 +10,7 @@ import org.apache.commons.compress.archivers.ArchiveInputStream;
|
|
10
10
|
class ArchiveInputStreamIterator implements Iterator<InputStream> {
|
11
11
|
private ArchiveInputStream ain;
|
12
12
|
private ArchiveEntry entry;
|
13
|
+
private String matchRegex = "";
|
13
14
|
private boolean endOfArchive = false;
|
14
15
|
|
15
16
|
ArchiveInputStreamIterator(ArchiveInputStream ain)
|
@@ -17,6 +18,11 @@ class ArchiveInputStreamIterator implements Iterator
|
|
17
18
|
this.ain = ain;
|
18
19
|
}
|
19
20
|
|
21
|
+
ArchiveInputStreamIterator(ArchiveInputStream ain, String matchRegex) {
|
22
|
+
this.ain = ain;
|
23
|
+
this.matchRegex = matchRegex;
|
24
|
+
}
|
25
|
+
|
20
26
|
@Override
|
21
27
|
public boolean hasNext() {
|
22
28
|
try {
|
@@ -60,9 +66,22 @@ class ArchiveInputStreamIterator implements Iterator
|
|
60
66
|
return false;
|
61
67
|
} else if (entry.isDirectory()) {
|
62
68
|
continue;
|
69
|
+
} else if (!matchName(entry, matchRegex)){
|
70
|
+
continue;
|
63
71
|
} else {
|
64
72
|
return true;
|
65
73
|
}
|
66
74
|
}
|
67
75
|
}
|
76
|
+
|
77
|
+
private boolean matchName(ArchiveEntry entry, String regex) {
|
78
|
+
String name = entry.getName();
|
79
|
+
if(regex == null || regex.equals("")){
|
80
|
+
return true;
|
81
|
+
} else if(name == null) {
|
82
|
+
return false;
|
83
|
+
} else {
|
84
|
+
return name.matches(regex);
|
85
|
+
}
|
86
|
+
}
|
68
87
|
}
|
@@ -21,6 +21,14 @@ public class CommonsCompressDecoderPlugin
|
|
21
21
|
@ConfigDefault("\"\"")
|
22
22
|
public String getFormat();
|
23
23
|
|
24
|
+
@Config("decompress_concatenated")
|
25
|
+
@ConfigDefault("true")
|
26
|
+
public boolean getDecompressConcatenated();
|
27
|
+
|
28
|
+
@Config("match_name")
|
29
|
+
@ConfigDefault("\"\"")
|
30
|
+
public String getMatchName();
|
31
|
+
|
24
32
|
@ConfigInject
|
25
33
|
public BufferAllocator getBufferAllocator();
|
26
34
|
}
|
@@ -26,6 +26,8 @@ class CommonsCompressProvider implements Provider {
|
|
26
26
|
private final boolean formatAutoDetection;
|
27
27
|
private Iterator<InputStream> inputStreamIterator;
|
28
28
|
private String[] formats;
|
29
|
+
private final boolean decompressConcatenated;
|
30
|
+
private final String matchName;
|
29
31
|
|
30
32
|
CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
|
31
33
|
this.files = files;
|
@@ -37,6 +39,9 @@ class CommonsCompressProvider implements Provider {
|
|
37
39
|
throw new RuntimeException("Failed to get a format.");
|
38
40
|
}
|
39
41
|
}
|
42
|
+
this.decompressConcatenated = task == null
|
43
|
+
|| task.getDecompressConcatenated();
|
44
|
+
this.matchName = (task == null)? "" : task.getMatchName();
|
40
45
|
}
|
41
46
|
|
42
47
|
@Override
|
@@ -85,7 +90,9 @@ class CommonsCompressProvider implements Provider {
|
|
85
90
|
in = in.markSupported() ? in : new BufferedInputStream(in);
|
86
91
|
try {
|
87
92
|
return new ArchiveInputStreamIterator(
|
88
|
-
createArchiveInputStream(AUTO_DETECT_FORMAT, in)
|
93
|
+
createArchiveInputStream(AUTO_DETECT_FORMAT, in),
|
94
|
+
this.matchName
|
95
|
+
);
|
89
96
|
} catch (IOException | ArchiveException e) {
|
90
97
|
// ArchiveStreamFactory set mark and reset the stream.
|
91
98
|
// So, we can use the same stream to check compressor.
|
@@ -117,7 +124,8 @@ class CommonsCompressProvider implements Provider {
|
|
117
124
|
String format = inputFormats[pos];
|
118
125
|
if (CommonsCompressUtil.isArchiveFormat(format)) {
|
119
126
|
return new ArchiveInputStreamIterator(
|
120
|
-
createArchiveInputStream(format, in)
|
127
|
+
createArchiveInputStream(format, in),
|
128
|
+
this.matchName);
|
121
129
|
} else if (CommonsCompressUtil.isCompressorFormat(format)) {
|
122
130
|
return createInputStreamIterator(inputFormats, pos + 1,
|
123
131
|
createCompressorInputStream(format, in));
|
@@ -162,6 +170,7 @@ class CommonsCompressProvider implements Provider {
|
|
162
170
|
CompressorInputStream createCompressorInputStream(String format,
|
163
171
|
InputStream in) throws IOException, CompressorException {
|
164
172
|
CompressorStreamFactory factory = new CompressorStreamFactory();
|
173
|
+
factory.setDecompressConcatenated(decompressConcatenated);
|
165
174
|
if (CommonsCompressUtil.isAutoDetect(format)) {
|
166
175
|
in = in.markSupported() ? in : new BufferedInputStream(in);
|
167
176
|
try {
|
@@ -171,12 +180,6 @@ class CommonsCompressProvider implements Provider {
|
|
171
180
|
"Failed to detect a file format. Please try to set a format explicitly.",
|
172
181
|
e);
|
173
182
|
}
|
174
|
-
}
|
175
|
-
|
176
|
-
if (CompressorStreamFactory.GZIP.equalsIgnoreCase(format)) {
|
177
|
-
return new GzipCompressorInputStream(in, true);
|
178
|
-
} else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(format)) {
|
179
|
-
return new BZip2CompressorInputStream(in, true);
|
180
183
|
} else {
|
181
184
|
return factory.createCompressorInputStream(format, in);
|
182
185
|
}
|
@@ -70,6 +70,21 @@ public class TestArchiveInputStreamIterator {
|
|
70
70
|
assertNull("Verify there is no stream.", it.next());
|
71
71
|
}
|
72
72
|
|
73
|
+
@Test
|
74
|
+
public void testHasNextForNameMatch(@Mocked final ArchiveInputStream ain, @Mocked final ArchiveEntry entry) throws Exception {
|
75
|
+
new NonStrictExpectations() {{
|
76
|
+
ain.getNextEntry(); result = entry; result = entry; result = entry; result = null;
|
77
|
+
entry.getName(); result = "first.csv"; result = "second.txt"; result = "third.csv";
|
78
|
+
}};
|
79
|
+
ArchiveInputStreamIterator it = new ArchiveInputStreamIterator(ain, ".*\\.csv");
|
80
|
+
assertTrue("Verify 1st file match", it.hasNext());
|
81
|
+
assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
|
82
|
+
assertTrue("Verify 3rd file match", it.hasNext());
|
83
|
+
assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
|
84
|
+
assertFalse("Veryfy no more entry because second.txt is skipped.", it.hasNext());
|
85
|
+
assertNull("Verify there is no stream.", it.next());
|
86
|
+
}
|
87
|
+
|
73
88
|
@Test
|
74
89
|
public void testArchiveFile() throws Exception {
|
75
90
|
InputStream in = getClass().getResourceAsStream("samples.tar");
|
@@ -59,6 +59,16 @@ public class TestCommonsCompressDecoderPlugin
|
|
59
59
|
Assert.assertEquals("Verify the default config value.", DEFAULT_FORMAT_CONFIG, configDefault.value());
|
60
60
|
}
|
61
61
|
|
62
|
+
@Test
|
63
|
+
public void testPluginTaskGetDecompressConcatenated() throws Exception {
|
64
|
+
Method method = CommonsCompressDecoderPlugin.PluginTask.class.getMethod("getDecompressConcatenated");
|
65
|
+
Config config = method.getAnnotation(Config.class);
|
66
|
+
ConfigDefault configDefault = method.getAnnotation(ConfigDefault.class);
|
67
|
+
|
68
|
+
Assert.assertEquals("Verify the config name.", "decompress_concatenated", config.value());
|
69
|
+
Assert.assertEquals("Verify the default config value.", "true", configDefault.value());
|
70
|
+
}
|
71
|
+
|
62
72
|
@Test
|
63
73
|
public void testTransaction(@Mocked final ConfigSource config, @Mocked final DecoderPlugin.Control control)
|
64
74
|
{
|
@@ -552,9 +562,13 @@ public class TestCommonsCompressDecoderPlugin
|
|
552
562
|
|
553
563
|
private class MockPluginTask implements CommonsCompressDecoderPlugin.PluginTask {
|
554
564
|
private final String format;
|
565
|
+
private final boolean decompressConcatenated;
|
566
|
+
private final String matchName;
|
555
567
|
|
556
568
|
MockPluginTask(String format) {
|
557
569
|
this.format = format;
|
570
|
+
this.decompressConcatenated = true;
|
571
|
+
this.matchName = "";
|
558
572
|
}
|
559
573
|
|
560
574
|
@Override
|
@@ -571,6 +585,16 @@ public class TestCommonsCompressDecoderPlugin
|
|
571
585
|
return format;
|
572
586
|
}
|
573
587
|
|
588
|
+
@Override
|
589
|
+
public boolean getDecompressConcatenated() {
|
590
|
+
return decompressConcatenated;
|
591
|
+
}
|
592
|
+
|
593
|
+
@Override
|
594
|
+
public String getMatchName() {
|
595
|
+
return matchName;
|
596
|
+
}
|
597
|
+
|
574
598
|
@Override
|
575
599
|
public BufferAllocator getBufferAllocator() {
|
576
600
|
return newBufferAllocator();
|
@@ -243,6 +243,10 @@ public class TestCommonsCompressProvider {
|
|
243
243
|
|
244
244
|
@Test
|
245
245
|
public void testCreateInputStreamConcatenatedGZ() throws Exception {
|
246
|
+
new NonStrictExpectations() {{
|
247
|
+
task.getDecompressConcatenated(); result = true;
|
248
|
+
}};
|
249
|
+
|
246
250
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
247
251
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
248
252
|
new String[]{CompressorStreamFactory.GZIP}, 0, getResourceInputStream("concatenated.csv.gz"));
|
@@ -252,6 +256,10 @@ public class TestCommonsCompressProvider {
|
|
252
256
|
|
253
257
|
@Test
|
254
258
|
public void testCreateInputStreamConcatenatedGZip() throws Exception {
|
259
|
+
new NonStrictExpectations() {{
|
260
|
+
task.getDecompressConcatenated(); result = true;
|
261
|
+
}};
|
262
|
+
|
255
263
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
256
264
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
257
265
|
CommonsCompressUtil.toFormats("gzip"), 0, getResourceInputStream("concatenated.csv.gz"));
|
@@ -261,6 +269,10 @@ public class TestCommonsCompressProvider {
|
|
261
269
|
|
262
270
|
@Test
|
263
271
|
public void testCreateInputStreamConcatenatedBZip2() throws Exception {
|
272
|
+
new NonStrictExpectations() {{
|
273
|
+
task.getDecompressConcatenated(); result = true;
|
274
|
+
}};
|
275
|
+
|
264
276
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
265
277
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
266
278
|
new String[]{CompressorStreamFactory.BZIP2}, 0, getResourceInputStream("concatenated.csv.bz2"));
|
@@ -270,6 +282,10 @@ public class TestCommonsCompressProvider {
|
|
270
282
|
|
271
283
|
@Test
|
272
284
|
public void testCreateInputStreamConcatenatedBZ2() throws Exception {
|
285
|
+
new NonStrictExpectations() {{
|
286
|
+
task.getDecompressConcatenated(); result = true;
|
287
|
+
}};
|
288
|
+
|
273
289
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
274
290
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
275
291
|
CommonsCompressUtil.toFormats("bz2"), 0, getResourceInputStream("concatenated.csv.bz2"));
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-decoder-commons-compress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hata
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -65,6 +65,8 @@ files:
|
|
65
65
|
- src/integration-test/resources/config_concatenated_gz.yml
|
66
66
|
- src/integration-test/resources/config_concatenated_gzip.yml
|
67
67
|
- src/integration-test/resources/config_gz.yml
|
68
|
+
- src/integration-test/resources/config_no_concatenated_bzip2.yml
|
69
|
+
- src/integration-test/resources/config_no_concatenated_gzip.yml
|
68
70
|
- src/integration-test/resources/config_tar.Z.yml
|
69
71
|
- src/integration-test/resources/config_tar.bz2.yml
|
70
72
|
- src/integration-test/resources/config_tar.gz.yml
|
@@ -110,8 +112,8 @@ files:
|
|
110
112
|
- src/test/resources/org/embulk/decoder/samples.tar.xz
|
111
113
|
- src/test/resources/org/embulk/decoder/samples.tgz
|
112
114
|
- src/test/resources/org/embulk/decoder/samples.zip
|
113
|
-
- classpath/commons-compress-1.
|
114
|
-
- classpath/embulk-decoder-commons-compress-0.
|
115
|
+
- classpath/commons-compress-1.20.jar
|
116
|
+
- classpath/embulk-decoder-commons-compress-0.5.0.jar
|
115
117
|
homepage: https://github.com/hata/embulk-decoder-commons-compress
|
116
118
|
licenses:
|
117
119
|
- MIT
|