embulk-decoder-commons-compress 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -10
- data/build.gradle +1 -1
- data/src/integration-test/java/org/embulk/filter/TestIntegration.java +14 -0
- data/src/integration-test/resources/config_concatenated_bzip2.yml +0 -1
- data/src/integration-test/resources/config_concatenated_gz.yml +0 -1
- data/src/integration-test/resources/config_no_concatenated_bzip2.yml +27 -0
- data/src/integration-test/resources/config_no_concatenated_gzip.yml +27 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java +4 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +4 -6
- data/src/test/java/org/embulk/decoder/TestCommonsCompressDecoderPlugin.java +17 -0
- data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java +16 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d01788f5dc971e9ee19b2139c5079fc2d54ebc81
|
4
|
+
data.tar.gz: d7cd120f691677b626b5b758f665e2dafab083ce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb6b7d5f8717bc3464ca3e6c5272a6eb7a9e3eee262f2effbfb57e0b8784151b417a9d3e8cabad624080eea8adbabb461f9536b051ddbd676335f783bba5f51a
|
7
|
+
data.tar.gz: 4b1c4d7d7b01ea23c388d858717f8762db2bc2b97651bfb26273e02ec2c307d47faade8143269c4b491edb1adf2acaa9204d86594e1c682cfc0d53dd0d3da7e9
|
data/README.md
CHANGED
@@ -17,6 +17,7 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
|
|
17
17
|
- The format type is one of the formats supported by [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/).
|
18
18
|
- Auto-detection is used when no format is configured. It can be used only for a single format. If a file uses solid compression such as tar.gz, please set the format config explicitly.
|
19
19
|
- Some formats listed in [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) may not work in your environment. I have confirmed that the following formats work well. Your environment may be able to use other formats listed on the site.
|
20
|
+
- **decompress_concatenated**: gzip, bzip2, and xz formats support multiple concatenated streams. The default value of this parameter is true. To disable it, set this parameter to false. See [CompressorStreamFactory.setDecompressConcatenated()](https://commons.apache.org/proper/commons-compress/apidocs/org/apache/commons/compress/compressors/CompressorStreamFactory.html#setDecompressConcatenated(boolean)) in ver.1.9 for more details.
|
20
21
|
|
21
22
|
## Formats
|
22
23
|
|
@@ -29,7 +30,6 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
|
|
29
30
|
- tbz, tbz2, tb2, tar.bz2
|
30
31
|
- taz, tz, tar.Z
|
31
32
|
|
32
|
-
If input files are concatenated gzip or bzip2 format, please set format parameter explicitly.
|
33
33
|
|
34
34
|
## Example
|
35
35
|
|
@@ -62,24 +62,18 @@ in:
|
|
62
62
|
format: tgz
|
63
63
|
```
|
64
64
|
|
65
|
-
- Set
|
66
|
-
```yaml
|
67
|
-
in:
|
68
|
-
type: any input plugin type
|
69
|
-
decoders:
|
70
|
-
- type: commons-compress
|
71
|
-
format: gz
|
72
|
-
```
|
65
|
+
- Set decompress_concatenated to false if you would like to read only the first concatenated entry.
|
73
66
|
|
74
67
|
```yaml
|
75
68
|
in:
|
76
69
|
type: any input plugin type
|
77
70
|
decoders:
|
78
71
|
- type: commons-compress
|
79
|
-
|
72
|
+
decompress_concatenated: false
|
80
73
|
```
|
81
74
|
|
82
75
|
|
76
|
+
|
83
77
|
## Build
|
84
78
|
|
85
79
|
```
|
data/build.gradle
CHANGED
@@ -111,6 +111,20 @@ public class TestIntegration {
|
|
111
111
|
getChecksumFromFiles("result_concatenated_bzip2_000.00.csv"));
|
112
112
|
}
|
113
113
|
|
114
|
+
@Test
|
115
|
+
public void testNoConcatenatedGzip() throws Exception {
|
116
|
+
assertEquals("Verify input and output contents are identical.",
|
117
|
+
getChecksumFromFiles(SAMPLE_1_SRC_FILES),
|
118
|
+
getChecksumFromFiles("result_no_concatenated_gzip_000.00.csv"));
|
119
|
+
}
|
120
|
+
|
121
|
+
@Test
|
122
|
+
public void testNoConcatenatedBzip2() throws Exception {
|
123
|
+
assertEquals("Verify input and output contents are identical.",
|
124
|
+
getChecksumFromFiles(SAMPLE_1_SRC_FILES),
|
125
|
+
getChecksumFromFiles("result_no_concatenated_bzip2_000.00.csv"));
|
126
|
+
}
|
127
|
+
|
114
128
|
private long getChecksumFromFiles(String ... files) throws IOException {
|
115
129
|
Checksum cksum = new CRC32();
|
116
130
|
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: ./concatenated.csv.bz2
|
4
|
+
decoders:
|
5
|
+
- type: commons-compress
|
6
|
+
decompress_concatenated: false
|
7
|
+
parser:
|
8
|
+
charset: UTF-8
|
9
|
+
newline: CRLF
|
10
|
+
type: csv
|
11
|
+
delimiter: ','
|
12
|
+
quote: '"'
|
13
|
+
trim_if_not_quoted: false
|
14
|
+
skip_header_lines: 0
|
15
|
+
allow_extra_columns: false
|
16
|
+
allow_optional_columns: false
|
17
|
+
columns:
|
18
|
+
- {name: id, type: long}
|
19
|
+
- {name: comment, type: string}
|
20
|
+
out:
|
21
|
+
type: file
|
22
|
+
path_prefix: ./result_no_concatenated_bzip2_
|
23
|
+
file_ext: csv
|
24
|
+
formatter:
|
25
|
+
type: csv
|
26
|
+
quote_policy: MINIMAL
|
27
|
+
newline: LF
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: ./concatenated.csv.gz
|
4
|
+
decoders:
|
5
|
+
- type: commons-compress
|
6
|
+
decompress_concatenated: false
|
7
|
+
parser:
|
8
|
+
charset: UTF-8
|
9
|
+
newline: CRLF
|
10
|
+
type: csv
|
11
|
+
delimiter: ','
|
12
|
+
quote: '"'
|
13
|
+
trim_if_not_quoted: false
|
14
|
+
skip_header_lines: 0
|
15
|
+
allow_extra_columns: false
|
16
|
+
allow_optional_columns: false
|
17
|
+
columns:
|
18
|
+
- {name: id, type: long}
|
19
|
+
- {name: comment, type: string}
|
20
|
+
out:
|
21
|
+
type: file
|
22
|
+
path_prefix: ./result_no_concatenated_gzip_
|
23
|
+
file_ext: csv
|
24
|
+
formatter:
|
25
|
+
type: csv
|
26
|
+
quote_policy: MINIMAL
|
27
|
+
newline: LF
|
@@ -21,6 +21,10 @@ public class CommonsCompressDecoderPlugin
|
|
21
21
|
@ConfigDefault("\"\"")
|
22
22
|
public String getFormat();
|
23
23
|
|
24
|
+
@Config("decompress_concatenated")
|
25
|
+
@ConfigDefault("true")
|
26
|
+
public boolean getDecompressConcatenated();
|
27
|
+
|
24
28
|
@ConfigInject
|
25
29
|
public BufferAllocator getBufferAllocator();
|
26
30
|
}
|
@@ -26,6 +26,7 @@ class CommonsCompressProvider implements Provider {
|
|
26
26
|
private final boolean formatAutoDetection;
|
27
27
|
private Iterator<InputStream> inputStreamIterator;
|
28
28
|
private String[] formats;
|
29
|
+
private final boolean decompressConcatenated;
|
29
30
|
|
30
31
|
CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
|
31
32
|
this.files = files;
|
@@ -37,6 +38,8 @@ class CommonsCompressProvider implements Provider {
|
|
37
38
|
throw new RuntimeException("Failed to get a format.");
|
38
39
|
}
|
39
40
|
}
|
41
|
+
this.decompressConcatenated = task == null
|
42
|
+
|| task.getDecompressConcatenated();
|
40
43
|
}
|
41
44
|
|
42
45
|
@Override
|
@@ -162,6 +165,7 @@ class CommonsCompressProvider implements Provider {
|
|
162
165
|
CompressorInputStream createCompressorInputStream(String format,
|
163
166
|
InputStream in) throws IOException, CompressorException {
|
164
167
|
CompressorStreamFactory factory = new CompressorStreamFactory();
|
168
|
+
factory.setDecompressConcatenated(decompressConcatenated);
|
165
169
|
if (CommonsCompressUtil.isAutoDetect(format)) {
|
166
170
|
in = in.markSupported() ? in : new BufferedInputStream(in);
|
167
171
|
try {
|
@@ -171,12 +175,6 @@ class CommonsCompressProvider implements Provider {
|
|
171
175
|
"Failed to detect a file format. Please try to set a format explicitly.",
|
172
176
|
e);
|
173
177
|
}
|
174
|
-
}
|
175
|
-
|
176
|
-
if (CompressorStreamFactory.GZIP.equalsIgnoreCase(format)) {
|
177
|
-
return new GzipCompressorInputStream(in, true);
|
178
|
-
} else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(format)) {
|
179
|
-
return new BZip2CompressorInputStream(in, true);
|
180
178
|
} else {
|
181
179
|
return factory.createCompressorInputStream(format, in);
|
182
180
|
}
|
@@ -59,6 +59,16 @@ public class TestCommonsCompressDecoderPlugin
|
|
59
59
|
Assert.assertEquals("Verify the default config value.", DEFAULT_FORMAT_CONFIG, configDefault.value());
|
60
60
|
}
|
61
61
|
|
62
|
+
@Test
|
63
|
+
public void testPluginTaskGetDecompressConcatenated() throws Exception {
|
64
|
+
Method method = CommonsCompressDecoderPlugin.PluginTask.class.getMethod("getDecompressConcatenated");
|
65
|
+
Config config = method.getAnnotation(Config.class);
|
66
|
+
ConfigDefault configDefault = method.getAnnotation(ConfigDefault.class);
|
67
|
+
|
68
|
+
Assert.assertEquals("Verify the config name.", "decompress_concatenated", config.value());
|
69
|
+
Assert.assertEquals("Verify the default config value.", "true", configDefault.value());
|
70
|
+
}
|
71
|
+
|
62
72
|
@Test
|
63
73
|
public void testTransaction(@Mocked final ConfigSource config, @Mocked final DecoderPlugin.Control control)
|
64
74
|
{
|
@@ -552,9 +562,11 @@ public class TestCommonsCompressDecoderPlugin
|
|
552
562
|
|
553
563
|
private class MockPluginTask implements CommonsCompressDecoderPlugin.PluginTask {
|
554
564
|
private final String format;
|
565
|
+
private final boolean decompressConcatenated;
|
555
566
|
|
556
567
|
MockPluginTask(String format) {
|
557
568
|
this.format = format;
|
569
|
+
this.decompressConcatenated = true;
|
558
570
|
}
|
559
571
|
|
560
572
|
@Override
|
@@ -571,6 +583,11 @@ public class TestCommonsCompressDecoderPlugin
|
|
571
583
|
return format;
|
572
584
|
}
|
573
585
|
|
586
|
+
@Override
|
587
|
+
public boolean getDecompressConcatenated() {
|
588
|
+
return decompressConcatenated;
|
589
|
+
}
|
590
|
+
|
574
591
|
@Override
|
575
592
|
public BufferAllocator getBufferAllocator() {
|
576
593
|
return newBufferAllocator();
|
@@ -243,6 +243,10 @@ public class TestCommonsCompressProvider {
|
|
243
243
|
|
244
244
|
@Test
|
245
245
|
public void testCreateInputStreamConcatenatedGZ() throws Exception {
|
246
|
+
new NonStrictExpectations() {{
|
247
|
+
task.getDecompressConcatenated(); result = true;
|
248
|
+
}};
|
249
|
+
|
246
250
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
247
251
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
248
252
|
new String[]{CompressorStreamFactory.GZIP}, 0, getResourceInputStream("concatenated.csv.gz"));
|
@@ -252,6 +256,10 @@ public class TestCommonsCompressProvider {
|
|
252
256
|
|
253
257
|
@Test
|
254
258
|
public void testCreateInputStreamConcatenatedGZip() throws Exception {
|
259
|
+
new NonStrictExpectations() {{
|
260
|
+
task.getDecompressConcatenated(); result = true;
|
261
|
+
}};
|
262
|
+
|
255
263
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
256
264
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
257
265
|
CommonsCompressUtil.toFormats("gzip"), 0, getResourceInputStream("concatenated.csv.gz"));
|
@@ -261,6 +269,10 @@ public class TestCommonsCompressProvider {
|
|
261
269
|
|
262
270
|
@Test
|
263
271
|
public void testCreateInputStreamConcatenatedBZip2() throws Exception {
|
272
|
+
new NonStrictExpectations() {{
|
273
|
+
task.getDecompressConcatenated(); result = true;
|
274
|
+
}};
|
275
|
+
|
264
276
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
265
277
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
266
278
|
new String[]{CompressorStreamFactory.BZIP2}, 0, getResourceInputStream("concatenated.csv.bz2"));
|
@@ -270,6 +282,10 @@ public class TestCommonsCompressProvider {
|
|
270
282
|
|
271
283
|
@Test
|
272
284
|
public void testCreateInputStreamConcatenatedBZ2() throws Exception {
|
285
|
+
new NonStrictExpectations() {{
|
286
|
+
task.getDecompressConcatenated(); result = true;
|
287
|
+
}};
|
288
|
+
|
273
289
|
try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
|
274
290
|
Iterator<InputStream> it = provider.createInputStreamIterator(
|
275
291
|
CommonsCompressUtil.toFormats("bz2"), 0, getResourceInputStream("concatenated.csv.bz2"));
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-decoder-commons-compress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hata
|
@@ -65,6 +65,8 @@ files:
|
|
65
65
|
- src/integration-test/resources/config_concatenated_gz.yml
|
66
66
|
- src/integration-test/resources/config_concatenated_gzip.yml
|
67
67
|
- src/integration-test/resources/config_gz.yml
|
68
|
+
- src/integration-test/resources/config_no_concatenated_bzip2.yml
|
69
|
+
- src/integration-test/resources/config_no_concatenated_gzip.yml
|
68
70
|
- src/integration-test/resources/config_tar.Z.yml
|
69
71
|
- src/integration-test/resources/config_tar.bz2.yml
|
70
72
|
- src/integration-test/resources/config_tar.gz.yml
|
@@ -111,7 +113,7 @@ files:
|
|
111
113
|
- src/test/resources/org/embulk/decoder/samples.tgz
|
112
114
|
- src/test/resources/org/embulk/decoder/samples.zip
|
113
115
|
- classpath/commons-compress-1.9.jar
|
114
|
-
- classpath/embulk-decoder-commons-compress-0.3.2.jar
|
116
|
+
- classpath/embulk-decoder-commons-compress-0.3.3.jar
|
115
117
|
homepage: https://github.com/hata/embulk-decoder-commons-compress
|
116
118
|
licenses:
|
117
119
|
- MIT
|