embulk-decoder-commons-compress 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -0
  3. data/README.md +29 -0
  4. data/build.gradle +5 -1
  5. data/src/integration-test/java/org/embulk/filter/TestIntegration.java +138 -0
  6. data/src/integration-test/resources/concatenated.csv.bz2 +0 -0
  7. data/src/integration-test/resources/concatenated.csv.gz +0 -0
  8. data/src/integration-test/resources/config_ar.yml +26 -0
  9. data/src/integration-test/resources/config_bz2.yml +26 -0
  10. data/src/integration-test/resources/config_concatenated_bz2.yml +27 -0
  11. data/src/integration-test/resources/config_concatenated_bzip2.yml +27 -0
  12. data/src/integration-test/resources/config_concatenated_gz.yml +27 -0
  13. data/src/integration-test/resources/config_concatenated_gzip.yml +27 -0
  14. data/src/integration-test/resources/config_gz.yml +26 -0
  15. data/src/integration-test/resources/config_tar.Z.yml +27 -0
  16. data/src/integration-test/resources/config_tar.bz2.yml +27 -0
  17. data/src/integration-test/resources/config_tar.gz.yml +27 -0
  18. data/src/integration-test/resources/config_tar.yml +27 -0
  19. data/src/integration-test/resources/config_tgz.yml +27 -0
  20. data/src/integration-test/resources/config_zip.yml +26 -0
  21. data/src/integration-test/resources/header.csv +2 -0
  22. data/src/integration-test/resources/sample_0.tar +0 -0
  23. data/src/integration-test/resources/sample_1.csv +1 -0
  24. data/src/integration-test/resources/sample_1.csv.bz2 +0 -0
  25. data/src/integration-test/resources/sample_1.csv.gz +0 -0
  26. data/src/integration-test/resources/sample_1.tar +0 -0
  27. data/src/integration-test/resources/sample_2.csv +1 -0
  28. data/src/integration-test/resources/samples.ar +5 -0
  29. data/src/integration-test/resources/samples.tar +0 -0
  30. data/src/integration-test/resources/samples.tar.Z +0 -0
  31. data/src/integration-test/resources/samples.tar.bz2 +0 -0
  32. data/src/integration-test/resources/samples.tar.gz +0 -0
  33. data/src/integration-test/resources/samples.tgz +0 -0
  34. data/src/integration-test/resources/samples.zip +0 -0
  35. data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +12 -4
  36. data/src/main/java/org/embulk/decoder/CommonsCompressUtil.java +22 -2
  37. data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java +38 -2
  38. data/src/test/resources/org/embulk/decoder/concatenated.csv.bz2 +0 -0
  39. data/src/test/resources/org/embulk/decoder/concatenated.csv.gz +0 -0
  40. metadata +36 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 47b9aa18792d34a7a3797a6d62d6a2fb2b53cd05
- data.tar.gz: 0bc509836d83388502cc224d51a88abb4f3e8832
+ metadata.gz: 8c492e93cc8decc06fa54a952930d446748d1bc3
+ data.tar.gz: 3bc26ce03ac3fd8cbd450b12d5c798f1f0d88326
  SHA512:
- metadata.gz: 39d37955797132ed222662abeddb0c7c03a3d455970107576c03291b9320f706ae6fd85cd2c5b163bb92ed26624231f13d24dcddfb213474759e5ff2337107a8
- data.tar.gz: 8693b5648dd1975257d6be8269d82ac69e259150542e7e03d5764bc8c41cf24c1564ff65d7d051c8d9d344baa9fdb4277dab462f619027115d42d3c53586e5e5
+ metadata.gz: a3e7102ce968c9d4c7b7733025d8b141ee55c9dd4f3e730c05a0e9ea0d11f0ddadab2fb9ca34b107eb30949108cdb6c3eb669a31eb5111c95eae52c65c6360dd
+ data.tar.gz: aa39557828feb9906a43d061b2edecaf558bdcae8ff6b00bd6f3623ff37c09d5d3e905f18789c5b8b170a91cb41058ea229fd2df376a6487e644aacefe36c45d
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
+ language: java
+ script: ./gradlew -DenableIntegrationTest=true gem
+
data/README.md CHANGED
@@ -1,5 +1,8 @@
  # Commons Compress decoder plugin for Embulk

+ [![Build Status](https://travis-ci.org/hata/embulk-decoder-commons-compress.svg)](https://travis-ci.org/hata/embulk-decoder-commons-compress)
+
+
  This decoder plugin for Embulk supports various archive formats using the [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) library.

  ## Overview
@@ -26,6 +29,8 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
  - tbz, tbz2, tb2, tar.bz2
  - taz, tz, tar.Z

+ If input files are in concatenated gzip or bzip2 format, please set the format parameter explicitly.
+
  ## Example

  - Use auto detection. This works for a single format like tar or zip. If you would like to use a solid compression format like tar.gz, please set the format in your configuration file.
@@ -57,12 +62,36 @@ in:
      format: tgz
  ```

+ - Set the *format* parameter to handle a concatenated gzip (or bzip2) file.
+ ```yaml
+ in:
+   type: any input plugin type
+   decoders:
+   - type: commons-compress
+     format: gz
+ ```
+
+ ```yaml
+ in:
+   type: any input plugin type
+   decoders:
+   - type: commons-compress
+     format: bzip2
+ ```
+
+
  ## Build

  ```
  $ ./gradlew gem
  ```

+ To build with the integration tests (works on OS X or Linux):
+ ```
+ $ ./gradlew clean
+ $ ./gradlew -DenableIntegrationTest=true gem
+ ```
+
  ## Reference

  - [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/)
data/build.gradle CHANGED
@@ -5,6 +5,8 @@ plugins {
  }
  import com.github.jrubygradle.JRubyExec

+ apply from: 'https://raw.githubusercontent.com/hata/gradle-plugins/master/embulk-integration-test.gradle'
+
  sourceCompatibility = '1.7'
  targetCompatibility = '1.7'

@@ -16,7 +18,7 @@ configurations {
      provided
  }

- version = "0.3.1"
+ version = "0.3.2"

  dependencies {
      compile "org.embulk:embulk-core:0.7.0"
@@ -61,3 +63,5 @@ Gem::Specification.new do |spec|
  end
  /$)
  }
+
+ project.tasks.integrationTest.dependsOn(classpath)
data/src/integration-test/java/org/embulk/filter/TestIntegration.java ADDED
@@ -0,0 +1,138 @@
+ package org.embulk.filter;
+
+ import static org.junit.Assert.assertEquals;
+
+ import java.io.BufferedReader;
+ import java.io.File;
+ import java.io.FileReader;
+ import java.io.IOException;
+ import java.util.zip.CRC32;
+ import java.util.zip.Checksum;
+
+ import org.junit.Test;
+
+ public class TestIntegration {
+     static final String TEST_DIR = System.getProperty("embulk.integrationtest.dir");
+     private static final String[] SAMPLE_SRC_FILES = {"header.csv", "sample_1.csv", "sample_2.csv"};
+     private static final String[] SAMPLE_1_SRC_FILES = {"header.csv", "sample_1.csv"};
+
+     private static String getTestFile(String name) {
+         return TEST_DIR + File.separator + name;
+     }
+
+     @Test
+     public void testArchiveFormatZip() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_zip_000.00.csv"));
+     }
+
+     @Test
+     public void testArchiveFormatAr() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_ar_000.00.csv"));
+     }
+
+     @Test
+     public void testArchiveFormatTar() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_tar_000.00.csv"));
+     }
+
+     @Test
+     public void testCompressionFormatBzip2() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_1_SRC_FILES),
+                 getChecksumFromFiles("result_bz2_000.00.csv"));
+     }
+
+     @Test
+     public void testCompressionFormatGzip() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_1_SRC_FILES),
+                 getChecksumFromFiles("result_gz_000.00.csv"));
+     }
+
+     @Test
+     public void testSolidCompressionFormatTgz() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_tgz_000.00.csv"));
+     }
+
+     @Test
+     public void testSolidCompressionFormatTarBz2() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_tar.bz2_000.00.csv"));
+     }
+
+     @Test
+     public void testSolidCompressionFormatTarGz() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_tar.gz_000.00.csv"));
+     }
+
+     @Test
+     public void testSolidCompressionFormatTarZ() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_tar.Z_000.00.csv"));
+     }
+
+     @Test
+     public void testConcatenatedGZ() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_concatenated_gz_000.00.csv"));
+     }
+
+     @Test
+     public void testConcatenatedGzip() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_concatenated_gzip_000.00.csv"));
+     }
+
+     @Test
+     public void testConcatenatedBz2() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_concatenated_bz2_000.00.csv"));
+     }
+
+     @Test
+     public void testConcatenatedBzip2() throws Exception {
+         assertEquals("Verify input and output contents are identical.",
+                 getChecksumFromFiles(SAMPLE_SRC_FILES),
+                 getChecksumFromFiles("result_concatenated_bzip2_000.00.csv"));
+     }
+
+     private long getChecksumFromFiles(String ... files) throws IOException {
+         Checksum cksum = new CRC32();
+
+         for (String srcFile : files) {
+             try (BufferedReader reader = new BufferedReader(new FileReader(getTestFile(srcFile)))) {
+                 getChecksum(cksum, reader);
+             }
+         }
+
+         return cksum.getValue();
+     }
+
+     private long getChecksum(Checksum cksum, BufferedReader reader) throws IOException {
+         String line = reader.readLine();
+         while (line != null) {
+             byte[] lineBuf = line.trim().getBytes();
+             if (lineBuf.length > 0) {
+                 // System.out.println("line:" + new String(lineBuf));
+                 cksum.update(lineBuf, 0, lineBuf.length);
+             }
+             line = reader.readLine();
+         }
+         return cksum.getValue();
+     }
+ }
data/src/integration-test/resources/config_ar.yml ADDED
@@ -0,0 +1,26 @@
+ in:
+   type: file
+   path_prefix: ./samples.ar
+   decoders:
+   - type: commons-compress
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_ar_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_bz2.yml ADDED
@@ -0,0 +1,26 @@
+ in:
+   type: file
+   path_prefix: ./sample_1.csv.bz2
+   decoders:
+   - type: commons-compress
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_bz2_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_concatenated_bz2.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./concatenated.csv.bz2
+   decoders:
+   - type: commons-compress
+     format: bz2
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_concatenated_bz2_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_concatenated_bzip2.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./concatenated.csv.bz2
+   decoders:
+   - type: commons-compress
+     format: bzip2
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_concatenated_bzip2_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_concatenated_gz.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./concatenated.csv.gz
+   decoders:
+   - type: commons-compress
+     format: gz
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_concatenated_gz_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_concatenated_gzip.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./concatenated.csv.gz
+   decoders:
+   - type: commons-compress
+     format: gzip
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_concatenated_gzip_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_gz.yml ADDED
@@ -0,0 +1,26 @@
+ in:
+   type: file
+   path_prefix: ./sample_1.csv.gz
+   decoders:
+   - type: commons-compress
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_gz_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_tar.Z.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./samples.tar.Z
+   decoders:
+   - type: commons-compress
+     format: tar.Z
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_tar.Z_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_tar.bz2.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./samples.tar.bz2
+   decoders:
+   - type: commons-compress
+     format: tar.bz2
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_tar.bz2_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_tar.gz.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./samples.tar.gz
+   decoders:
+   - type: commons-compress
+     format: tar.gz
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_tar.gz_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_tar.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./samples.tar
+   decoders:
+   - type: commons-compress
+     format: tar
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_tar_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_tgz.yml ADDED
@@ -0,0 +1,27 @@
+ in:
+   type: file
+   path_prefix: ./samples.tgz
+   decoders:
+   - type: commons-compress
+     format: tgz
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_tgz_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/config_zip.yml ADDED
@@ -0,0 +1,26 @@
+ in:
+   type: file
+   path_prefix: ./samples.zip
+   decoders:
+   - type: commons-compress
+   parser:
+     charset: UTF-8
+     newline: CRLF
+     type: csv
+     delimiter: ','
+     quote: '"'
+     trim_if_not_quoted: false
+     skip_header_lines: 0
+     allow_extra_columns: false
+     allow_optional_columns: false
+     columns:
+     - {name: id, type: long}
+     - {name: comment, type: string}
+ out:
+   type: file
+   path_prefix: ./result_zip_
+   file_ext: csv
+   formatter:
+     type: csv
+     quote_policy: MINIMAL
+     newline: LF
data/src/integration-test/resources/header.csv ADDED
@@ -0,0 +1,2 @@
+ id,comment
+
data/src/integration-test/resources/samples.ar ADDED
@@ -0,0 +1,5 @@
+ !<arch>
+ sample_1.csv 1425248767 501 20 100644 6 `
+ 1,foo
+ sample_2.csv 1425099808 501 20 100644 6 `
+ 2,bar
data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java CHANGED
@@ -13,6 +13,8 @@ import org.apache.commons.compress.archivers.ArchiveStreamFactory;
  import org.apache.commons.compress.compressors.CompressorException;
  import org.apache.commons.compress.compressors.CompressorInputStream;
  import org.apache.commons.compress.compressors.CompressorStreamFactory;
+ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
  import org.embulk.decoder.CommonsCompressDecoderPlugin.PluginTask;
  import org.embulk.spi.util.FileInputInputStream;
  import org.embulk.spi.util.InputStreamFileInput.Provider;
@@ -105,19 +107,19 @@ class CommonsCompressProvider implements Provider {
       * (Actually, compressor formats can use two or more times in this code.
       * But it is not common case.)
       */
-     Iterator<InputStream> createInputStreamIterator(String[] formats,
+     Iterator<InputStream> createInputStreamIterator(String[] inputFormats,
              int pos, InputStream in) throws IOException {
-         if (pos >= formats.length) {
+         if (pos >= inputFormats.length) {
              return toIterator(in);
          }

          try {
-             String format = formats[pos];
+             String format = inputFormats[pos];
              if (CommonsCompressUtil.isArchiveFormat(format)) {
                  return new ArchiveInputStreamIterator(
                          createArchiveInputStream(format, in));
              } else if (CommonsCompressUtil.isCompressorFormat(format)) {
-                 return createInputStreamIterator(formats, pos + 1,
+                 return createInputStreamIterator(inputFormats, pos + 1,
                          createCompressorInputStream(format, in));
              }
              throw new IOException("Unsupported format is configured. format:"
@@ -169,6 +171,12 @@ class CommonsCompressProvider implements Provider {
                          "Failed to detect a file format. Please try to set a format explicitly.",
                          e);
              }
+         }
+
+         if (CompressorStreamFactory.GZIP.equalsIgnoreCase(format)) {
+             return new GzipCompressorInputStream(in, true);
+         } else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(format)) {
+             return new BZip2CompressorInputStream(in, true);
          } else {
              return factory.createCompressorInputStream(format, in);
          }
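The second constructor argument introduced above (`decompressConcatenated = true`) is the substance of this release: with the single-argument constructors, Commons Compress stops after the first gzip or bzip2 member, so a file produced by `cat a.gz b.gz > all.gz` is silently truncated. A self-contained sketch of the difference follows; the sample rows are borrowed from the test data, and the class name is made up for illustration, not part of the plugin.

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class ConcatenatedGzipDemo {
    public static void main(String[] args) throws Exception {
        // Build a concatenated gzip stream in memory: two independent members.
        ByteArrayOutputStream concatenated = new ByteArrayOutputStream();
        for (String part : new String[] {"1,foo\n", "2,bar\n"}) {
            try (GZIPOutputStream gz = new GZIPOutputStream(concatenated)) {
                gz.write(part.getBytes("UTF-8"));
            }
        }
        byte[] data = concatenated.toByteArray();

        // decompressConcatenated = false (the single-arg default): stops after the first member.
        try (GzipCompressorInputStream in =
                new GzipCompressorInputStream(new ByteArrayInputStream(data), false)) {
            System.out.print(readAll(in)); // prints only "1,foo"
        }

        // decompressConcatenated = true: reads every member, as the plugin now does.
        try (GzipCompressorInputStream in =
                new GzipCompressorInputStream(new ByteArrayInputStream(data), true)) {
            System.out.print(readAll(in)); // prints "1,foo" and "2,bar"
        }
    }

    private static String readAll(InputStream in) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[4096];
        int n;
        while ((n = in.read(buf)) != -1) {
            out.write(buf, 0, n);
        }
        return out.toString("UTF-8");
    }
}
```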
data/src/main/java/org/embulk/decoder/CommonsCompressUtil.java CHANGED
@@ -22,6 +22,8 @@ class CommonsCompressUtil {
      ArchiveStreamFactory.ZIP,
  };

+ // Even indexes have both extensions and aliases. And odd indexes are
+ // CompressorStreamFactory values.
  static final String[] compressorFormats = {
      CompressorStreamFactory.BZIP2,
      CompressorStreamFactory.DEFLATE,
@@ -32,6 +34,8 @@ class CommonsCompressUtil {
      CompressorStreamFactory.SNAPPY_RAW,
      CompressorStreamFactory.XZ,
      CompressorStreamFactory.Z,
+     "bz2", // These values should be handled by normalizeFormats
+     "gzip",
  };

  // This table is even indexes have short extensions and odd indexes has
@@ -94,7 +98,7 @@ class CommonsCompressUtil {
      if (isAutoDetect(format)) {
          return null;
      } else if (isArchiveFormat(format) || isCompressorFormat(format)) {
-         return splitAndReverse(format);
+         return normalizeFormats(splitAndReverse(format));
      }

      String[] formats = toSolidCompressionFormats(format);
@@ -102,7 +106,7 @@ class CommonsCompressUtil {
          return formats;
      }

-     formats = splitAndReverse(format);
+     formats = normalizeFormats(splitAndReverse(format));

      for (String s : formats) {
          if (!(isArchiveFormat(s) || isCompressorFormat(s))) {
@@ -132,4 +136,20 @@ class CommonsCompressUtil {
      Collections.reverse(result);
      return result.toArray(new String[result.size()]);
  }
+
+ private static String[] normalizeFormats(String... formats) {
+     if (formats == null || formats.length == 0) {
+         return formats;
+     }
+
+     for (int i = 0; i < formats.length; i++) {
+         if (formats[i].equalsIgnoreCase("gzip")) {
+             formats[i] = CompressorStreamFactory.GZIP;
+         } else if (formats[i].equalsIgnoreCase("bz2")) {
+             formats[i] = CompressorStreamFactory.BZIP2;
+         }
+     }
+
+     return formats;
+ }
  }
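For context, `CompressorStreamFactory.GZIP` and `CompressorStreamFactory.BZIP2` are the strings `"gz"` and `"bzip2"`, so the new `normalizeFormats` step simply rewrites the user-facing aliases `gzip` and `bz2` onto the names the factory understands before they are looked up. A standalone sketch of that mapping is below; the plugin's own class is package-private, and `FormatAliasDemo` is a hypothetical name used only for illustration.

```java
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class FormatAliasDemo {
    // Mirrors what normalizeFormats does in the diff above: rewrite aliases in place.
    static String[] normalize(String... formats) {
        for (int i = 0; i < formats.length; i++) {
            if (formats[i].equalsIgnoreCase("gzip")) {
                formats[i] = CompressorStreamFactory.GZIP;   // "gz"
            } else if (formats[i].equalsIgnoreCase("bz2")) {
                formats[i] = CompressorStreamFactory.BZIP2;  // "bzip2"
            }
        }
        return formats;
    }

    public static void main(String[] args) {
        // "format: gzip" and "format: bz2" in a config now behave like "gz" and "bzip2".
        System.out.println(String.join(",", normalize("gzip")));        // gz
        System.out.println(String.join(",", normalize("bz2")));         // bzip2
        System.out.println(String.join(",", normalize("tar", "gzip"))); // tar,gz
    }
}
```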
data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java CHANGED
@@ -240,7 +240,43 @@ public class TestCommonsCompressProvider {
              verifyContents(it, "1,foo", "2,bar");
          }
      }
-
+
+     @Test
+     public void testCreateInputStreamConcatenatedGZ() throws Exception {
+         try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+             Iterator<InputStream> it = provider.createInputStreamIterator(
+                     new String[]{CompressorStreamFactory.GZIP}, 0, getResourceInputStream("concatenated.csv.gz"));
+             verifyContents(it, "1,foo\n2,bar");
+         }
+     }
+
+     @Test
+     public void testCreateInputStreamConcatenatedGZip() throws Exception {
+         try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+             Iterator<InputStream> it = provider.createInputStreamIterator(
+                     CommonsCompressUtil.toFormats("gzip"), 0, getResourceInputStream("concatenated.csv.gz"));
+             verifyContents(it, "1,foo\n2,bar");
+         }
+     }
+
+     @Test
+     public void testCreateInputStreamConcatenatedBZip2() throws Exception {
+         try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+             Iterator<InputStream> it = provider.createInputStreamIterator(
+                     new String[]{CompressorStreamFactory.BZIP2}, 0, getResourceInputStream("concatenated.csv.bz2"));
+             verifyContents(it, "1,foo\n2,bar");
+         }
+     }
+
+     @Test
+     public void testCreateInputStreamConcatenatedBZ2() throws Exception {
+         try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
+             Iterator<InputStream> it = provider.createInputStreamIterator(
+                     CommonsCompressUtil.toFormats("bz2"), 0, getResourceInputStream("concatenated.csv.bz2"));
+             verifyContents(it, "1,foo\n2,bar");
+         }
+     }
+
      @Test
      public void testClose() throws Exception {
          CommonsCompressProvider provider = new CommonsCompressProvider(task, files);
@@ -303,7 +339,7 @@ public class TestCommonsCompressProvider {
          }
      }

-     @Test(expected=CompressorException.class)
+     @Test(expected=Exception.class)
      public void testCreateCompressorInputStreamWrongFormat() throws Exception {
          try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
              provider.createCompressorInputStream("bzip2",
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-decoder-commons-compress
  version: !ruby/object:Gem::Version
-   version: 0.3.1
+   version: 0.3.2
  platform: ruby
  authors:
  - hata
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-11-28 00:00:00.000000000 Z
+ date: 2016-03-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
@@ -46,6 +46,7 @@ extensions: []
  extra_rdoc_files: []
  files:
  - .gitignore
+ - .travis.yml
  - LICENSE.txt
  - README.md
  - build.gradle
@@ -54,6 +55,36 @@ files:
  - gradlew
  - gradlew.bat
  - lib/embulk/decoder/commons-compress.rb
+ - src/integration-test/java/org/embulk/filter/TestIntegration.java
+ - src/integration-test/resources/concatenated.csv.bz2
+ - src/integration-test/resources/concatenated.csv.gz
+ - src/integration-test/resources/config_ar.yml
+ - src/integration-test/resources/config_bz2.yml
+ - src/integration-test/resources/config_concatenated_bz2.yml
+ - src/integration-test/resources/config_concatenated_bzip2.yml
+ - src/integration-test/resources/config_concatenated_gz.yml
+ - src/integration-test/resources/config_concatenated_gzip.yml
+ - src/integration-test/resources/config_gz.yml
+ - src/integration-test/resources/config_tar.Z.yml
+ - src/integration-test/resources/config_tar.bz2.yml
+ - src/integration-test/resources/config_tar.gz.yml
+ - src/integration-test/resources/config_tar.yml
+ - src/integration-test/resources/config_tgz.yml
+ - src/integration-test/resources/config_zip.yml
+ - src/integration-test/resources/header.csv
+ - src/integration-test/resources/sample_0.tar
+ - src/integration-test/resources/sample_1.csv
+ - src/integration-test/resources/sample_1.csv.bz2
+ - src/integration-test/resources/sample_1.csv.gz
+ - src/integration-test/resources/sample_1.tar
+ - src/integration-test/resources/sample_2.csv
+ - src/integration-test/resources/samples.ar
+ - src/integration-test/resources/samples.tar
+ - src/integration-test/resources/samples.tar.Z
+ - src/integration-test/resources/samples.tar.bz2
+ - src/integration-test/resources/samples.tar.gz
+ - src/integration-test/resources/samples.tgz
+ - src/integration-test/resources/samples.zip
  - src/main/java/org/embulk/decoder/ArchiveInputStreamIterator.java
  - src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java
  - src/main/java/org/embulk/decoder/CommonsCompressFileInput.java
@@ -64,6 +95,8 @@ files:
  - src/test/java/org/embulk/decoder/TestCommonsCompressFileInput.java
  - src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java
  - src/test/java/org/embulk/decoder/TestCommonsCompressUtil.java
+ - src/test/resources/org/embulk/decoder/concatenated.csv.bz2
+ - src/test/resources/org/embulk/decoder/concatenated.csv.gz
  - src/test/resources/org/embulk/decoder/sample_0.tar
  - src/test/resources/org/embulk/decoder/sample_1.csv
  - src/test/resources/org/embulk/decoder/sample_1.csv.bz2
@@ -78,7 +111,7 @@ files:
  - src/test/resources/org/embulk/decoder/samples.tgz
  - src/test/resources/org/embulk/decoder/samples.zip
  - classpath/commons-compress-1.9.jar
- - classpath/embulk-decoder-commons-compress-0.3.1.jar
+ - classpath/embulk-decoder-commons-compress-0.3.2.jar
  homepage: https://github.com/hata/embulk-decoder-commons-compress
  licenses:
  - MIT