embulk-decoder-commons-compress 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -0
- data/README.md +29 -0
- data/build.gradle +5 -1
- data/src/integration-test/java/org/embulk/filter/TestIntegration.java +138 -0
- data/src/integration-test/resources/concatenated.csv.bz2 +0 -0
- data/src/integration-test/resources/concatenated.csv.gz +0 -0
- data/src/integration-test/resources/config_ar.yml +26 -0
- data/src/integration-test/resources/config_bz2.yml +26 -0
- data/src/integration-test/resources/config_concatenated_bz2.yml +27 -0
- data/src/integration-test/resources/config_concatenated_bzip2.yml +27 -0
- data/src/integration-test/resources/config_concatenated_gz.yml +27 -0
- data/src/integration-test/resources/config_concatenated_gzip.yml +27 -0
- data/src/integration-test/resources/config_gz.yml +26 -0
- data/src/integration-test/resources/config_tar.Z.yml +27 -0
- data/src/integration-test/resources/config_tar.bz2.yml +27 -0
- data/src/integration-test/resources/config_tar.gz.yml +27 -0
- data/src/integration-test/resources/config_tar.yml +27 -0
- data/src/integration-test/resources/config_tgz.yml +27 -0
- data/src/integration-test/resources/config_zip.yml +26 -0
- data/src/integration-test/resources/header.csv +2 -0
- data/src/integration-test/resources/sample_0.tar +0 -0
- data/src/integration-test/resources/sample_1.csv +1 -0
- data/src/integration-test/resources/sample_1.csv.bz2 +0 -0
- data/src/integration-test/resources/sample_1.csv.gz +0 -0
- data/src/integration-test/resources/sample_1.tar +0 -0
- data/src/integration-test/resources/sample_2.csv +1 -0
- data/src/integration-test/resources/samples.ar +5 -0
- data/src/integration-test/resources/samples.tar +0 -0
- data/src/integration-test/resources/samples.tar.Z +0 -0
- data/src/integration-test/resources/samples.tar.bz2 +0 -0
- data/src/integration-test/resources/samples.tar.gz +0 -0
- data/src/integration-test/resources/samples.tgz +0 -0
- data/src/integration-test/resources/samples.zip +0 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +12 -4
- data/src/main/java/org/embulk/decoder/CommonsCompressUtil.java +22 -2
- data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java +38 -2
- data/src/test/resources/org/embulk/decoder/concatenated.csv.bz2 +0 -0
- data/src/test/resources/org/embulk/decoder/concatenated.csv.gz +0 -0
- metadata +36 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8c492e93cc8decc06fa54a952930d446748d1bc3
+  data.tar.gz: 3bc26ce03ac3fd8cbd450b12d5c798f1f0d88326
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a3e7102ce968c9d4c7b7733025d8b141ee55c9dd4f3e730c05a0e9ea0d11f0ddadab2fb9ca34b107eb30949108cdb6c3eb669a31eb5111c95eae52c65c6360dd
+  data.tar.gz: aa39557828feb9906a43d061b2edecaf558bdcae8ff6b00bd6f3623ff37c09d5d3e905f18789c5b8b170a91cb41058ea229fd2df376a6487e644aacefe36c45d
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,5 +1,8 @@
 # Commons Compress decoder plugin for Embulk
 
+[](https://travis-ci.org/hata/embulk-decoder-commons-compress)
+
+
 This decoder plugin for Embulk supports various archive formats using [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) library.
 
 ## Overview
@@ -26,6 +29,8 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
 - tbz, tbz2, tb2, tar.bz2
 - taz, tz, tar.Z
 
+If input files are concatenated gzip or bzip2 format, please set format parameter explicitly.
+
 ## Example
 
 - Use auto detection. This can use for 1 format like tar and zip. If you would like to use a solid compression format like tar.gz, please set the format to your configuration file.
@@ -57,12 +62,36 @@ in:
     format: tgz
 ```
 
+- Set *format* parameter to handle concatenated gzip(or bzip2) file.
+```yaml
+in:
+  type: any input plugin type
+  decoders:
+  - type: commons-compress
+    format: gz
+```
+
+```yaml
+in:
+  type: any input plugin type
+  decoders:
+  - type: commons-compress
+    format: bzip2
+```
+
+
 ## Build
 
 ```
 $ ./gradlew gem
 ```
 
+To build with integrationTest(It works on OSX or Linux)
+```
+$ ./gradlew clean
+$ ./gradlew -DenableIntegrationTest=true gem
+```
+
 ## Reference
 
 - [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/)
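As a usage note, once the gem is installed the example configurations added under `data/src/integration-test/resources` below can be exercised with the standard Embulk CLI, for instance:

```
$ embulk gem install embulk-decoder-commons-compress
$ embulk run config_concatenated_gz.yml
```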
data/build.gradle
CHANGED
@@ -5,6 +5,8 @@ plugins {
 }
 import com.github.jrubygradle.JRubyExec
 
+apply from: 'https://raw.githubusercontent.com/hata/gradle-plugins/master/embulk-integration-test.gradle'
+
 sourceCompatibility = '1.7'
 targetCompatibility = '1.7'
 
@@ -16,7 +18,7 @@ configurations {
     provided
 }
 
-version = "0.3.1"
+version = "0.3.2"
 
 dependencies {
     compile "org.embulk:embulk-core:0.7.0"
@@ -61,3 +63,5 @@ Gem::Specification.new do |spec|
   end
   /$)
 }
+
+project.tasks.integrationTest.dependsOn(classpath)
data/src/integration-test/java/org/embulk/filter/TestIntegration.java
ADDED
@@ -0,0 +1,138 @@
+package org.embulk.filter;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.zip.CRC32;
+import java.util.zip.Checksum;
+
+import org.junit.Test;
+
+public class TestIntegration {
+    static final String TEST_DIR = System.getProperty("embulk.integrationtest.dir");
+    private static final String[] SAMPLE_SRC_FILES = {"header.csv", "sample_1.csv", "sample_2.csv"};
+    private static final String[] SAMPLE_1_SRC_FILES = {"header.csv", "sample_1.csv"};
+
+    private static String getTestFile(String name) {
+        return TEST_DIR + File.separator + name;
+    }
+
+    @Test
+    public void testArchiveFormatZip() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_zip_000.00.csv"));
+    }
+
+    @Test
+    public void testArchiveFormatAr() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_ar_000.00.csv"));
+    }
+
+    @Test
+    public void testArchiveFormatTar() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_tar_000.00.csv"));
+    }
+
+    @Test
+    public void testCompressionFormatBzip2() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_1_SRC_FILES),
+                getChecksumFromFiles("result_bz2_000.00.csv"));
+    }
+
+    @Test
+    public void testCompressionFormatGzip() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_1_SRC_FILES),
+                getChecksumFromFiles("result_gz_000.00.csv"));
+    }
+
+    @Test
+    public void testSolidCompressionFormatTgz() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_tgz_000.00.csv"));
+    }
+
+    @Test
+    public void testSolidCompressionFormatTarBz2() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_tar.bz2_000.00.csv"));
+    }
+
+    @Test
+    public void testSolidCompressionFormatTarGz() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_tar.gz_000.00.csv"));
+    }
+
+    @Test
+    public void testSolidCompressionFormatTarZ() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_tar.Z_000.00.csv"));
+    }
+
+    @Test
+    public void testConcatenatedGZ() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_concatenated_gz_000.00.csv"));
+    }
+
+    @Test
+    public void testConcatenatedGzip() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_concatenated_gzip_000.00.csv"));
+    }
+
+    @Test
+    public void testConcatenatedBz2() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_concatenated_bz2_000.00.csv"));
+    }
+
+    @Test
+    public void testConcatenatedBzip2() throws Exception {
+        assertEquals("Verify input and output contents are identical.",
+                getChecksumFromFiles(SAMPLE_SRC_FILES),
+                getChecksumFromFiles("result_concatenated_bzip2_000.00.csv"));
+    }
+
+    private long getChecksumFromFiles(String ... files) throws IOException {
+        Checksum cksum = new CRC32();
+
+        for (String srcFile : files) {
+            try (BufferedReader reader = new BufferedReader(new FileReader(getTestFile(srcFile)))) {
+                getChecksum(cksum, reader);
+            }
+        }
+
+        return cksum.getValue();
+    }
+
+    private long getChecksum(Checksum cksum, BufferedReader reader) throws IOException {
+        String line = reader.readLine();
+        while (line != null) {
+            byte[] lineBuf = line.trim().getBytes();
+            if (lineBuf.length > 0) {
+                // System.out.println("line:" + new String(lineBuf));
+                cksum.update(lineBuf, 0, lineBuf.length);
+            }
+            line = reader.readLine();
+        }
+        return cksum.getValue();
+    }
+}
data/src/integration-test/resources/concatenated.csv.bz2
ADDED
Binary file
data/src/integration-test/resources/concatenated.csv.gz
ADDED
Binary file
data/src/integration-test/resources/config_ar.yml
ADDED
@@ -0,0 +1,26 @@
+in:
+  type: file
+  path_prefix: ./samples.ar
+  decoders:
+  - type: commons-compress
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_ar_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_bz2.yml
ADDED
@@ -0,0 +1,26 @@
+in:
+  type: file
+  path_prefix: ./sample_1.csv.bz2
+  decoders:
+  - type: commons-compress
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_bz2_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_concatenated_bz2.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./concatenated.csv.bz2
+  decoders:
+  - type: commons-compress
+    format: bz2
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_concatenated_bz2_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_concatenated_bzip2.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./concatenated.csv.bz2
+  decoders:
+  - type: commons-compress
+    format: bzip2
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_concatenated_bzip2_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_concatenated_gz.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./concatenated.csv.gz
+  decoders:
+  - type: commons-compress
+    format: gz
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_concatenated_gz_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_concatenated_gzip.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./concatenated.csv.gz
+  decoders:
+  - type: commons-compress
+    format: gzip
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_concatenated_gzip_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_gz.yml
ADDED
@@ -0,0 +1,26 @@
+in:
+  type: file
+  path_prefix: ./sample_1.csv.gz
+  decoders:
+  - type: commons-compress
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_gz_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_tar.Z.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./samples.tar.Z
+  decoders:
+  - type: commons-compress
+    format: tar.Z
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_tar.Z_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_tar.bz2.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./samples.tar.bz2
+  decoders:
+  - type: commons-compress
+    format: tar.bz2
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_tar.bz2_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_tar.gz.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./samples.tar.gz
+  decoders:
+  - type: commons-compress
+    format: tar.gz
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_tar.gz_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_tar.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./samples.tar
+  decoders:
+  - type: commons-compress
+    format: tar
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_tar_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_tgz.yml
ADDED
@@ -0,0 +1,27 @@
+in:
+  type: file
+  path_prefix: ./samples.tgz
+  decoders:
+  - type: commons-compress
+    format: tgz
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_tgz_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
data/src/integration-test/resources/config_zip.yml
ADDED
@@ -0,0 +1,26 @@
+in:
+  type: file
+  path_prefix: ./samples.zip
+  decoders:
+  - type: commons-compress
+  parser:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    delimiter: ','
+    quote: '"'
+    trim_if_not_quoted: false
+    skip_header_lines: 0
+    allow_extra_columns: false
+    allow_optional_columns: false
+    columns:
+    - {name: id, type: long}
+    - {name: comment, type: string}
+out:
+  type: file
+  path_prefix: ./result_zip_
+  file_ext: csv
+  formatter:
+    type: csv
+    quote_policy: MINIMAL
+    newline: LF
Binary file
data/src/integration-test/resources/sample_1.csv
ADDED
@@ -0,0 +1 @@
+1,foo
Binary file
Binary file
Binary file
data/src/integration-test/resources/sample_2.csv
ADDED
@@ -0,0 +1 @@
+2,bar
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java
CHANGED
@@ -13,6 +13,8 @@ import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.commons.compress.compressors.CompressorInputStream;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.embulk.decoder.CommonsCompressDecoderPlugin.PluginTask;
 import org.embulk.spi.util.FileInputInputStream;
 import org.embulk.spi.util.InputStreamFileInput.Provider;
@@ -105,19 +107,19 @@ class CommonsCompressProvider implements Provider {
      * (Actually, compressor formats can use two or more times in this code.
      * But it is not common case.)
      */
-    Iterator<InputStream> createInputStreamIterator(String[]
+    Iterator<InputStream> createInputStreamIterator(String[] inputFormats,
             int pos, InputStream in) throws IOException {
-        if (pos >=
+        if (pos >= inputFormats.length) {
            return toIterator(in);
        }
 
        try {
-            String format =
+            String format = inputFormats[pos];
            if (CommonsCompressUtil.isArchiveFormat(format)) {
                return new ArchiveInputStreamIterator(
                        createArchiveInputStream(format, in));
            } else if (CommonsCompressUtil.isCompressorFormat(format)) {
-                return createInputStreamIterator(
+                return createInputStreamIterator(inputFormats, pos + 1,
                        createCompressorInputStream(format, in));
            }
            throw new IOException("Unsupported format is configured. format:"
@@ -169,6 +171,12 @@ class CommonsCompressProvider implements Provider {
                     "Failed to detect a file format. Please try to set a format explicitly.",
                     e);
            }
+        }
+
+        if (CompressorStreamFactory.GZIP.equalsIgnoreCase(format)) {
+            return new GzipCompressorInputStream(in, true);
+        } else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(format)) {
+            return new BZip2CompressorInputStream(in, true);
        } else {
            return factory.createCompressorInputStream(format, in);
        }
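The two-argument constructors introduced above enable Commons Compress's decompressConcatenated mode; without that flag a GzipCompressorInputStream or BZip2CompressorInputStream stops at the end of the first compressed member. A minimal standalone sketch (illustrative class, not part of the plugin) of the difference:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.utils.IOUtils;

public class ConcatenatedGzipDemo {
    // Compress a string into a single gzip member.
    private static byte[] gzip(String s) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (GzipCompressorOutputStream gz = new GzipCompressorOutputStream(bos)) {
            gz.write(s.getBytes(StandardCharsets.UTF_8));
        }
        return bos.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        // Two gzip members back to back, like `cat a.gz b.gz > concatenated.gz`.
        ByteArrayOutputStream concatenated = new ByteArrayOutputStream();
        concatenated.write(gzip("1,foo\n"));
        concatenated.write(gzip("2,bar\n"));
        byte[] data = concatenated.toByteArray();

        // Default constructor: decoding stops after the first member ("1,foo").
        try (GzipCompressorInputStream in =
                new GzipCompressorInputStream(new ByteArrayInputStream(data))) {
            System.out.print(new String(IOUtils.toByteArray(in), StandardCharsets.UTF_8));
        }

        // decompressConcatenated = true: every member is decoded ("1,foo" and "2,bar").
        try (GzipCompressorInputStream in =
                new GzipCompressorInputStream(new ByteArrayInputStream(data), true)) {
            System.out.print(new String(IOUtils.toByteArray(in), StandardCharsets.UTF_8));
        }
    }
}
```

The same flag exists on BZip2CompressorInputStream, which is why both new branches above pass true.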
data/src/main/java/org/embulk/decoder/CommonsCompressUtil.java
CHANGED
@@ -22,6 +22,8 @@ class CommonsCompressUtil {
         ArchiveStreamFactory.ZIP,
     };
 
+    // Even indexes have both extensions and aliases. And odd indexes are
+    // CompressorStreamFactory values.
     static final String[] compressorFormats = {
         CompressorStreamFactory.BZIP2,
         CompressorStreamFactory.DEFLATE,
@@ -32,6 +34,8 @@ class CommonsCompressUtil {
         CompressorStreamFactory.SNAPPY_RAW,
         CompressorStreamFactory.XZ,
         CompressorStreamFactory.Z,
+        "bz2", // These values should be handled by normalizeFormats
+        "gzip",
     };
 
     // This table is even indexes have short extensions and odd indexes has
@@ -94,7 +98,7 @@ class CommonsCompressUtil {
         if (isAutoDetect(format)) {
             return null;
         } else if (isArchiveFormat(format) || isCompressorFormat(format)) {
-            return splitAndReverse(format);
+            return normalizeFormats(splitAndReverse(format));
         }
 
         String[] formats = toSolidCompressionFormats(format);
@@ -102,7 +106,7 @@ class CommonsCompressUtil {
             return formats;
         }
 
-        formats = splitAndReverse(format);
+        formats = normalizeFormats(splitAndReverse(format));
 
         for (String s : formats) {
             if (!(isArchiveFormat(s) || isCompressorFormat(s))) {
@@ -132,4 +136,20 @@ class CommonsCompressUtil {
         Collections.reverse(result);
         return result.toArray(new String[result.size()]);
     }
+
+    private static String[] normalizeFormats(String... formats) {
+        if (formats == null || formats.length == 0) {
+            return formats;
+        }
+
+        for (int i = 0;i < formats.length;i++) {
+            if (formats[i].equalsIgnoreCase("gzip")) {
+                formats[i] = CompressorStreamFactory.GZIP;
+            } else if (formats[i].equalsIgnoreCase("bz2")) {
+                formats[i] = CompressorStreamFactory.BZIP2;
+            }
+        }
+
+        return formats;
+    }
 }
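For context on the new normalizeFormats(): CompressorStreamFactory identifies gzip as "gz" and bzip2 as "bzip2", so the user-facing aliases "gzip" and "bz2" accepted in the format parameter have to be mapped onto those constants before they reach the factory. A standalone sketch of the same mapping (hypothetical class name, not the plugin's code):

```java
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class FormatAliasDemo {
    // Mirror of the alias handling: map "gzip"/"bz2" to the names
    // CompressorStreamFactory actually recognizes.
    static String normalize(String format) {
        if ("gzip".equalsIgnoreCase(format)) {
            return CompressorStreamFactory.GZIP;   // "gz"
        } else if ("bz2".equalsIgnoreCase(format)) {
            return CompressorStreamFactory.BZIP2;  // "bzip2"
        }
        return format; // everything else passes through unchanged
    }

    public static void main(String[] args) {
        System.out.println(normalize("gzip")); // gz
        System.out.println(normalize("bz2"));  // bzip2
        System.out.println(normalize("tar"));  // tar
    }
}
```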
data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java
CHANGED
@@ -240,7 +240,43 @@ public class TestCommonsCompressProvider {
             verifyContents(it, "1,foo", "2,bar");
         }
     }
-
+
+    @Test
+    public void testCreateInputStreamConcatenatedGZ() throws Exception {
+        try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+            Iterator<InputStream> it = provider.createInputStreamIterator(
+                    new String[]{CompressorStreamFactory.GZIP}, 0, getResourceInputStream("concatenated.csv.gz"));
+            verifyContents(it, "1,foo\n2,bar");
+        }
+    }
+
+    @Test
+    public void testCreateInputStreamConcatenatedGZip() throws Exception {
+        try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+            Iterator<InputStream> it = provider.createInputStreamIterator(
+                    CommonsCompressUtil.toFormats("gzip"), 0, getResourceInputStream("concatenated.csv.gz"));
+            verifyContents(it, "1,foo\n2,bar");
+        }
+    }
+
+    @Test
+    public void testCreateInputStreamConcatenatedBZip2() throws Exception {
+        try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+            Iterator<InputStream> it = provider.createInputStreamIterator(
+                    new String[]{CompressorStreamFactory.BZIP2}, 0, getResourceInputStream("concatenated.csv.bz2"));
+            verifyContents(it, "1,foo\n2,bar");
+        }
+    }
+
+    @Test
+    public void testCreateInputStreamConcatenatedBZ2() throws Exception {
+        try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+            Iterator<InputStream> it = provider.createInputStreamIterator(
+                    CommonsCompressUtil.toFormats("bz2"), 0, getResourceInputStream("concatenated.csv.bz2"));
+            verifyContents(it, "1,foo\n2,bar");
+        }
+    }
+
     @Test
     public void testClose() throws Exception {
         CommonsCompressProvider provider = new CommonsCompressProvider(task, files);
@@ -303,7 +339,7 @@ public class TestCommonsCompressProvider {
         }
     }
 
-    @Test(expected=
+    @Test(expected=Exception.class)
     public void testCreateCompressorInputStreamWrongFormat() throws Exception {
         try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
             provider.createCompressorInputStream("bzip2",
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-decoder-commons-compress
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - hata
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-03-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -46,6 +46,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
+- .travis.yml
 - LICENSE.txt
 - README.md
 - build.gradle
@@ -54,6 +55,36 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/decoder/commons-compress.rb
+- src/integration-test/java/org/embulk/filter/TestIntegration.java
+- src/integration-test/resources/concatenated.csv.bz2
+- src/integration-test/resources/concatenated.csv.gz
+- src/integration-test/resources/config_ar.yml
+- src/integration-test/resources/config_bz2.yml
+- src/integration-test/resources/config_concatenated_bz2.yml
+- src/integration-test/resources/config_concatenated_bzip2.yml
+- src/integration-test/resources/config_concatenated_gz.yml
+- src/integration-test/resources/config_concatenated_gzip.yml
+- src/integration-test/resources/config_gz.yml
+- src/integration-test/resources/config_tar.Z.yml
+- src/integration-test/resources/config_tar.bz2.yml
+- src/integration-test/resources/config_tar.gz.yml
+- src/integration-test/resources/config_tar.yml
+- src/integration-test/resources/config_tgz.yml
+- src/integration-test/resources/config_zip.yml
+- src/integration-test/resources/header.csv
+- src/integration-test/resources/sample_0.tar
+- src/integration-test/resources/sample_1.csv
+- src/integration-test/resources/sample_1.csv.bz2
+- src/integration-test/resources/sample_1.csv.gz
+- src/integration-test/resources/sample_1.tar
+- src/integration-test/resources/sample_2.csv
+- src/integration-test/resources/samples.ar
+- src/integration-test/resources/samples.tar
+- src/integration-test/resources/samples.tar.Z
+- src/integration-test/resources/samples.tar.bz2
+- src/integration-test/resources/samples.tar.gz
+- src/integration-test/resources/samples.tgz
+- src/integration-test/resources/samples.zip
 - src/main/java/org/embulk/decoder/ArchiveInputStreamIterator.java
 - src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java
 - src/main/java/org/embulk/decoder/CommonsCompressFileInput.java
@@ -64,6 +95,8 @@ files:
 - src/test/java/org/embulk/decoder/TestCommonsCompressFileInput.java
 - src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java
 - src/test/java/org/embulk/decoder/TestCommonsCompressUtil.java
+- src/test/resources/org/embulk/decoder/concatenated.csv.bz2
+- src/test/resources/org/embulk/decoder/concatenated.csv.gz
 - src/test/resources/org/embulk/decoder/sample_0.tar
 - src/test/resources/org/embulk/decoder/sample_1.csv
 - src/test/resources/org/embulk/decoder/sample_1.csv.bz2
@@ -78,7 +111,7 @@ files:
 - src/test/resources/org/embulk/decoder/samples.tgz
 - src/test/resources/org/embulk/decoder/samples.zip
 - classpath/commons-compress-1.9.jar
-- classpath/embulk-decoder-commons-compress-0.3.1.jar
+- classpath/embulk-decoder-commons-compress-0.3.2.jar
 homepage: https://github.com/hata/embulk-decoder-commons-compress
 licenses:
 - MIT