embulk-decoder-commons-compress 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8c492e93cc8decc06fa54a952930d446748d1bc3
4
- data.tar.gz: 3bc26ce03ac3fd8cbd450b12d5c798f1f0d88326
3
+ metadata.gz: d01788f5dc971e9ee19b2139c5079fc2d54ebc81
4
+ data.tar.gz: d7cd120f691677b626b5b758f665e2dafab083ce
5
5
  SHA512:
6
- metadata.gz: a3e7102ce968c9d4c7b7733025d8b141ee55c9dd4f3e730c05a0e9ea0d11f0ddadab2fb9ca34b107eb30949108cdb6c3eb669a31eb5111c95eae52c65c6360dd
7
- data.tar.gz: aa39557828feb9906a43d061b2edecaf558bdcae8ff6b00bd6f3623ff37c09d5d3e905f18789c5b8b170a91cb41058ea229fd2df376a6487e644aacefe36c45d
6
+ metadata.gz: bb6b7d5f8717bc3464ca3e6c5272a6eb7a9e3eee262f2effbfb57e0b8784151b417a9d3e8cabad624080eea8adbabb461f9536b051ddbd676335f783bba5f51a
7
+ data.tar.gz: 4b1c4d7d7b01ea23c388d858717f8762db2bc2b97651bfb26273e02ec2c307d47faade8143269c4b491edb1adf2acaa9204d86594e1c682cfc0d53dd0d3da7e9
data/README.md CHANGED
@@ -17,6 +17,7 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
17
17
  - The format type is one of the formats supported by [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/).
18
18
  - Auto-detection is used when no format is configured. It can be used for a single format. If a file format is solid compression like tar.gz, please set the format config explicitly.
19
19
  - Some formats listed in [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) may not work in your environment. I have confirmed that the following formats work well. Your environment may be able to use other formats listed on the site.
20
+ - **decompress_concatenated**: gzip, bzip2, and xz formats support multiple concatenated streams. The default value of this parameter is true. To disable it, set it to false. See [CompressorStreamFactory.setDecompressConcatenated()](https://commons.apache.org/proper/commons-compress/apidocs/org/apache/commons/compress/compressors/CompressorStreamFactory.html#setDecompressConcatenated(boolean)) in ver. 1.9 for more details.
20
21
 
21
22
  ## Formats
22
23
 
@@ -29,7 +30,6 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
29
30
  - tbz, tbz2, tb2, tar.bz2
30
31
  - taz, tz, tar.Z
31
32
 
32
- If input files are concatenated gzip or bzip2 format, please set format parameter explicitly.
33
33
 
34
34
  ## Example
35
35
 
@@ -62,24 +62,18 @@ in:
62
62
  format: tgz
63
63
  ```
64
64
 
65
- - Set *format* parameter to handle concatenated gzip(or bzip2) file.
66
- ```yaml
67
- in:
68
- type: any input plugin type
69
- decoders:
70
- - type: commons-compress
71
- format: gz
72
- ```
65
+ - Set decompress_concatenated to false if you would like to read only the first concatenated entry.
73
66
 
74
67
  ```yaml
75
68
  in:
76
69
  type: any input plugin type
77
70
  decoders:
78
71
  - type: commons-compress
79
- format: bzip2
72
+ decompress_concatenated: false
80
73
  ```
81
74
 
82
75
 
76
+
83
77
  ## Build
84
78
 
85
79
  ```
data/build.gradle CHANGED
@@ -18,7 +18,7 @@ configurations {
18
18
  provided
19
19
  }
20
20
 
21
- version = "0.3.2"
21
+ version = "0.3.3"
22
22
 
23
23
  dependencies {
24
24
  compile "org.embulk:embulk-core:0.7.0"
@@ -111,6 +111,20 @@ public class TestIntegration {
111
111
  getChecksumFromFiles("result_concatenated_bzip2_000.00.csv"));
112
112
  }
113
113
 
114
+ @Test
115
+ public void testNoConcatenatedGzip() throws Exception {
116
+ assertEquals("Verify input and output contents are identical.",
117
+ getChecksumFromFiles(SAMPLE_1_SRC_FILES),
118
+ getChecksumFromFiles("result_no_concatenated_gzip_000.00.csv"));
119
+ }
120
+
121
+ @Test
122
+ public void testNoConcatenatedBzip2() throws Exception {
123
+ assertEquals("Verify input and output contents are identical.",
124
+ getChecksumFromFiles(SAMPLE_1_SRC_FILES),
125
+ getChecksumFromFiles("result_no_concatenated_bzip2_000.00.csv"));
126
+ }
127
+
114
128
  private long getChecksumFromFiles(String ... files) throws IOException {
115
129
  Checksum cksum = new CRC32();
116
130
 
@@ -3,7 +3,6 @@ in:
3
3
  path_prefix: ./concatenated.csv.bz2
4
4
  decoders:
5
5
  - type: commons-compress
6
- format: bzip2
7
6
  parser:
8
7
  charset: UTF-8
9
8
  newline: CRLF
@@ -3,7 +3,6 @@ in:
3
3
  path_prefix: ./concatenated.csv.gz
4
4
  decoders:
5
5
  - type: commons-compress
6
- format: gz
7
6
  parser:
8
7
  charset: UTF-8
9
8
  newline: CRLF
@@ -0,0 +1,27 @@
1
+ in:
2
+ type: file
3
+ path_prefix: ./concatenated.csv.bz2
4
+ decoders:
5
+ - type: commons-compress
6
+ decompress_concatenated: false
7
+ parser:
8
+ charset: UTF-8
9
+ newline: CRLF
10
+ type: csv
11
+ delimiter: ','
12
+ quote: '"'
13
+ trim_if_not_quoted: false
14
+ skip_header_lines: 0
15
+ allow_extra_columns: false
16
+ allow_optional_columns: false
17
+ columns:
18
+ - {name: id, type: long}
19
+ - {name: comment, type: string}
20
+ out:
21
+ type: file
22
+ path_prefix: ./result_no_concatenated_bzip2_
23
+ file_ext: csv
24
+ formatter:
25
+ type: csv
26
+ quote_policy: MINIMAL
27
+ newline: LF
@@ -0,0 +1,27 @@
1
+ in:
2
+ type: file
3
+ path_prefix: ./concatenated.csv.gz
4
+ decoders:
5
+ - type: commons-compress
6
+ decompress_concatenated: false
7
+ parser:
8
+ charset: UTF-8
9
+ newline: CRLF
10
+ type: csv
11
+ delimiter: ','
12
+ quote: '"'
13
+ trim_if_not_quoted: false
14
+ skip_header_lines: 0
15
+ allow_extra_columns: false
16
+ allow_optional_columns: false
17
+ columns:
18
+ - {name: id, type: long}
19
+ - {name: comment, type: string}
20
+ out:
21
+ type: file
22
+ path_prefix: ./result_no_concatenated_gzip_
23
+ file_ext: csv
24
+ formatter:
25
+ type: csv
26
+ quote_policy: MINIMAL
27
+ newline: LF
@@ -21,6 +21,10 @@ public class CommonsCompressDecoderPlugin
21
21
  @ConfigDefault("\"\"")
22
22
  public String getFormat();
23
23
 
24
+ @Config("decompress_concatenated")
25
+ @ConfigDefault("true")
26
+ public boolean getDecompressConcatenated();
27
+
24
28
  @ConfigInject
25
29
  public BufferAllocator getBufferAllocator();
26
30
  }
@@ -26,6 +26,7 @@ class CommonsCompressProvider implements Provider {
26
26
  private final boolean formatAutoDetection;
27
27
  private Iterator<InputStream> inputStreamIterator;
28
28
  private String[] formats;
29
+ private final boolean decompressConcatenated;
29
30
 
30
31
  CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
31
32
  this.files = files;
@@ -37,6 +38,8 @@ class CommonsCompressProvider implements Provider {
37
38
  throw new RuntimeException("Failed to get a format.");
38
39
  }
39
40
  }
41
+ this.decompressConcatenated = task == null
42
+ || task.getDecompressConcatenated();
40
43
  }
41
44
 
42
45
  @Override
@@ -162,6 +165,7 @@ class CommonsCompressProvider implements Provider {
162
165
  CompressorInputStream createCompressorInputStream(String format,
163
166
  InputStream in) throws IOException, CompressorException {
164
167
  CompressorStreamFactory factory = new CompressorStreamFactory();
168
+ factory.setDecompressConcatenated(decompressConcatenated);
165
169
  if (CommonsCompressUtil.isAutoDetect(format)) {
166
170
  in = in.markSupported() ? in : new BufferedInputStream(in);
167
171
  try {
@@ -171,12 +175,6 @@ class CommonsCompressProvider implements Provider {
171
175
  "Failed to detect a file format. Please try to set a format explicitly.",
172
176
  e);
173
177
  }
174
- }
175
-
176
- if (CompressorStreamFactory.GZIP.equalsIgnoreCase(format)) {
177
- return new GzipCompressorInputStream(in, true);
178
- } else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(format)) {
179
- return new BZip2CompressorInputStream(in, true);
180
178
  } else {
181
179
  return factory.createCompressorInputStream(format, in);
182
180
  }
@@ -59,6 +59,16 @@ public class TestCommonsCompressDecoderPlugin
59
59
  Assert.assertEquals("Verify the default config value.", DEFAULT_FORMAT_CONFIG, configDefault.value());
60
60
  }
61
61
 
62
+ @Test
63
+ public void testPluginTaskGetDecompressConcatenated() throws Exception {
64
+ Method method = CommonsCompressDecoderPlugin.PluginTask.class.getMethod("getDecompressConcatenated");
65
+ Config config = method.getAnnotation(Config.class);
66
+ ConfigDefault configDefault = method.getAnnotation(ConfigDefault.class);
67
+
68
+ Assert.assertEquals("Verify the config name.", "decompress_concatenated", config.value());
69
+ Assert.assertEquals("Verify the default config value.", "true", configDefault.value());
70
+ }
71
+
62
72
  @Test
63
73
  public void testTransaction(@Mocked final ConfigSource config, @Mocked final DecoderPlugin.Control control)
64
74
  {
@@ -552,9 +562,11 @@ public class TestCommonsCompressDecoderPlugin
552
562
 
553
563
  private class MockPluginTask implements CommonsCompressDecoderPlugin.PluginTask {
554
564
  private final String format;
565
+ private final boolean decompressConcatenated;
555
566
 
556
567
  MockPluginTask(String format) {
557
568
  this.format = format;
569
+ this.decompressConcatenated = true;
558
570
  }
559
571
 
560
572
  @Override
@@ -571,6 +583,11 @@ public class TestCommonsCompressDecoderPlugin
571
583
  return format;
572
584
  }
573
585
 
586
+ @Override
587
+ public boolean getDecompressConcatenated() {
588
+ return decompressConcatenated;
589
+ }
590
+
574
591
  @Override
575
592
  public BufferAllocator getBufferAllocator() {
576
593
  return newBufferAllocator();
@@ -243,6 +243,10 @@ public class TestCommonsCompressProvider {
243
243
 
244
244
  @Test
245
245
  public void testCreateInputStreamConcatenatedGZ() throws Exception {
246
+ new NonStrictExpectations() {{
247
+ task.getDecompressConcatenated(); result = true;
248
+ }};
249
+
246
250
  try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
247
251
  Iterator<InputStream> it = provider.createInputStreamIterator(
248
252
  new String[]{CompressorStreamFactory.GZIP}, 0, getResourceInputStream("concatenated.csv.gz"));
@@ -252,6 +256,10 @@ public class TestCommonsCompressProvider {
252
256
 
253
257
  @Test
254
258
  public void testCreateInputStreamConcatenatedGZip() throws Exception {
259
+ new NonStrictExpectations() {{
260
+ task.getDecompressConcatenated(); result = true;
261
+ }};
262
+
255
263
  try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
256
264
  Iterator<InputStream> it = provider.createInputStreamIterator(
257
265
  CommonsCompressUtil.toFormats("gzip"), 0, getResourceInputStream("concatenated.csv.gz"));
@@ -261,6 +269,10 @@ public class TestCommonsCompressProvider {
261
269
 
262
270
  @Test
263
271
  public void testCreateInputStreamConcatenatedBZip2() throws Exception {
272
+ new NonStrictExpectations() {{
273
+ task.getDecompressConcatenated(); result = true;
274
+ }};
275
+
264
276
  try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
265
277
  Iterator<InputStream> it = provider.createInputStreamIterator(
266
278
  new String[]{CompressorStreamFactory.BZIP2}, 0, getResourceInputStream("concatenated.csv.bz2"));
@@ -270,6 +282,10 @@ public class TestCommonsCompressProvider {
270
282
 
271
283
  @Test
272
284
  public void testCreateInputStreamConcatenatedBZ2() throws Exception {
285
+ new NonStrictExpectations() {{
286
+ task.getDecompressConcatenated(); result = true;
287
+ }};
288
+
273
289
  try (CommonsCompressProvider provider = new CommonsCompressProvider(task, files)) {
274
290
  Iterator<InputStream> it = provider.createInputStreamIterator(
275
291
  CommonsCompressUtil.toFormats("bz2"), 0, getResourceInputStream("concatenated.csv.bz2"));
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-decoder-commons-compress
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - hata
@@ -65,6 +65,8 @@ files:
65
65
  - src/integration-test/resources/config_concatenated_gz.yml
66
66
  - src/integration-test/resources/config_concatenated_gzip.yml
67
67
  - src/integration-test/resources/config_gz.yml
68
+ - src/integration-test/resources/config_no_concatenated_bzip2.yml
69
+ - src/integration-test/resources/config_no_concatenated_gzip.yml
68
70
  - src/integration-test/resources/config_tar.Z.yml
69
71
  - src/integration-test/resources/config_tar.bz2.yml
70
72
  - src/integration-test/resources/config_tar.gz.yml
@@ -111,7 +113,7 @@ files:
111
113
  - src/test/resources/org/embulk/decoder/samples.tgz
112
114
  - src/test/resources/org/embulk/decoder/samples.zip
113
115
  - classpath/commons-compress-1.9.jar
114
- - classpath/embulk-decoder-commons-compress-0.3.2.jar
116
+ - classpath/embulk-decoder-commons-compress-0.3.3.jar
115
117
  homepage: https://github.com/hata/embulk-decoder-commons-compress
116
118
  licenses:
117
119
  - MIT