embulk-decoder-commons-compress 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d01788f5dc971e9ee19b2139c5079fc2d54ebc81
4
- data.tar.gz: d7cd120f691677b626b5b758f665e2dafab083ce
3
+ metadata.gz: 991a486a32c2dd2bfddf31f937d04e612870ddc5
4
+ data.tar.gz: ae04456e55be753f049dd6343995e22873a1a4e8
5
5
  SHA512:
6
- metadata.gz: bb6b7d5f8717bc3464ca3e6c5272a6eb7a9e3eee262f2effbfb57e0b8784151b417a9d3e8cabad624080eea8adbabb461f9536b051ddbd676335f783bba5f51a
7
- data.tar.gz: 4b1c4d7d7b01ea23c388d858717f8762db2bc2b97651bfb26273e02ec2c307d47faade8143269c4b491edb1adf2acaa9204d86594e1c682cfc0d53dd0d3da7e9
6
+ metadata.gz: a2a6e171d9d98884df6bb4eddafd2d0ad07de835d66a2756926cd919be8bdb84926ca0982e94aeac5c2279406d4f6423bcfcde45daabf13365f262ca7031830f
7
+ data.tar.gz: a0031b1f6c5279cdd2c49fb098eef837e8f9aeac3f766fd318d190a87f4a708e7b509c846be0efe0a09eeadde7e627e312f49832124caa019047b94db0476d6b
data/README.md CHANGED
@@ -18,6 +18,7 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
18
18
  - Auto detect is used when there is no configuration. This works for a single format. If a file uses solid compression like tar.gz, please set the format config explicitly.
19
19
  - Some formats listed in [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) may not work in your environment. I have confirmed that the following formats work well. Your environment may be able to use other formats listed on the site.
20
20
  - **decompress_concatenated**: gzip, bzip2, and xz formats support multiple concatenated streams. The default value of this parameter is true. If you want to disable it, then set to false. See [CompressorStreamFactory.setDecompressConcatenated()](https://commons.apache.org/proper/commons-compress/apidocs/org/apache/commons/compress/compressors/CompressorStreamFactory.html#setDecompressConcatenated(boolean)) in ver.1.9 for more details.
21
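+ - **match_name**: Only the files in an archive whose names match match_name are processed. match_name is a regular expression, and the whole entry name must match it (e.g. ".*\\.csv"). When match_name is empty (the default), all files are processed.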
21
22
 
22
23
  ## Formats
23
24
 
@@ -62,7 +63,7 @@ in:
62
63
  format: tgz
63
64
  ```
64
65
 
65
- - Set decompress_concatenated to false if you would like to read only the first concatenated entry.
66
+ - Set decompress_concatenated to false if you would like to read the first concatenated gzip/bzip2 archive only.
66
67
 
67
68
  ```yaml
68
69
  in:
@@ -72,6 +73,16 @@ in:
72
73
  decompress_concatenated: false
73
74
  ```
74
75
 
76
+ - Set match_name to extract only the files whose suffix is '.csv' from an archive.
77
+
78
+ ```yaml
79
+ in:
80
+ type: any input plugin type
81
+ decoders:
82
+ - type: commons-compress
83
+ match_name: ".*\\.csv"
84
+ ```
85
+
75
86
 
76
87
 
77
88
  ## Build
@@ -18,7 +18,7 @@ configurations {
18
18
  provided
19
19
  }
20
20
 
21
- version = "0.3.3"
21
+ version = "0.4.0"
22
22
 
23
23
  dependencies {
24
24
  compile "org.embulk:embulk-core:0.7.0"
@@ -10,6 +10,7 @@ import org.apache.commons.compress.archivers.ArchiveInputStream;
10
10
  class ArchiveInputStreamIterator implements Iterator<InputStream> {
11
11
  private ArchiveInputStream ain;
12
12
  private ArchiveEntry entry;
13
+ private String matchRegex = "";
13
14
  private boolean endOfArchive = false;
14
15
 
15
16
  ArchiveInputStreamIterator(ArchiveInputStream ain)
@@ -17,6 +18,11 @@ class ArchiveInputStreamIterator implements Iterator<InputStream> {
17
18
  this.ain = ain;
18
19
  }
19
20
 
21
+ ArchiveInputStreamIterator(ArchiveInputStream ain, String matchRegex) {
22
+ this.ain = ain;
23
+ this.matchRegex = matchRegex;
24
+ }
25
+
20
26
  @Override
21
27
  public boolean hasNext() {
22
28
  try {
@@ -60,9 +66,22 @@ class ArchiveInputStreamIterator implements Iterator<InputStream> {
60
66
  return false;
61
67
  } else if (entry.isDirectory()) {
62
68
  continue;
69
+ } else if (!matchName(entry, matchRegex)){
70
+ continue;
63
71
  } else {
64
72
  return true;
65
73
  }
66
74
  }
67
75
  }
76
+
77
+ private boolean matchName(ArchiveEntry entry, String regex) {
78
+ String name = entry.getName();
79
+ if(regex == null || regex.equals("")){
80
+ return true;
81
+ } else if(name == null) {
82
+ return false;
83
+ } else {
84
+ return name.matches(regex);
85
+ }
86
+ }
68
87
  }
@@ -25,6 +25,10 @@ public class CommonsCompressDecoderPlugin
25
25
  @ConfigDefault("true")
26
26
  public boolean getDecompressConcatenated();
27
27
 
28
+ @Config("match_name")
29
+ @ConfigDefault("\"\"")
30
+ public String getMatchName();
31
+
28
32
  @ConfigInject
29
33
  public BufferAllocator getBufferAllocator();
30
34
  }
@@ -27,6 +27,7 @@ class CommonsCompressProvider implements Provider {
27
27
  private Iterator<InputStream> inputStreamIterator;
28
28
  private String[] formats;
29
29
  private final boolean decompressConcatenated;
30
+ private final String matchName;
30
31
 
31
32
  CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
32
33
  this.files = files;
@@ -40,6 +41,7 @@ class CommonsCompressProvider implements Provider {
40
41
  }
41
42
  this.decompressConcatenated = task == null
42
43
  || task.getDecompressConcatenated();
44
+ this.matchName = (task == null)? "" : task.getMatchName();
43
45
  }
44
46
 
45
47
  @Override
@@ -88,7 +90,9 @@ class CommonsCompressProvider implements Provider {
88
90
  in = in.markSupported() ? in : new BufferedInputStream(in);
89
91
  try {
90
92
  return new ArchiveInputStreamIterator(
91
- createArchiveInputStream(AUTO_DETECT_FORMAT, in));
93
+ createArchiveInputStream(AUTO_DETECT_FORMAT, in),
94
+ this.matchName
95
+ );
92
96
  } catch (IOException | ArchiveException e) {
93
97
  // ArchiveStreamFactory set mark and reset the stream.
94
98
  // So, we can use the same stream to check compressor.
@@ -70,6 +70,21 @@ public class TestArchiveInputStreamIterator {
70
70
  assertNull("Verify there is no stream.", it.next());
71
71
  }
72
72
 
73
+ @Test
74
+ public void testHasNextForNameMatch(@Mocked final ArchiveInputStream ain, @Mocked final ArchiveEntry entry) throws Exception {
75
+ new NonStrictExpectations() {{
76
+ ain.getNextEntry(); result = entry; result = entry; result = entry; result = null;
77
+ entry.getName(); result = "first.csv"; result = "second.txt"; result = "third.csv";
78
+ }};
79
+ ArchiveInputStreamIterator it = new ArchiveInputStreamIterator(ain, ".*\\.csv");
80
+ assertTrue("Verify 1st file match", it.hasNext());
81
+ assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
82
+ assertTrue("Verify 3rd file match", it.hasNext());
83
+ assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
84
+ assertFalse("Veryfy no more entry because second.txt is skipped.", it.hasNext());
85
+ assertNull("Verify there is no stream.", it.next());
86
+ }
87
+
73
88
  @Test
74
89
  public void testArchiveFile() throws Exception {
75
90
  InputStream in = getClass().getResourceAsStream("samples.tar");
@@ -563,10 +563,12 @@ public class TestCommonsCompressDecoderPlugin
563
563
  private class MockPluginTask implements CommonsCompressDecoderPlugin.PluginTask {
564
564
  private final String format;
565
565
  private final boolean decompressConcatenated;
566
+ private final String matchName;
566
567
 
567
568
  MockPluginTask(String format) {
568
569
  this.format = format;
569
570
  this.decompressConcatenated = true;
571
+ this.matchName = "";
570
572
  }
571
573
 
572
574
  @Override
@@ -588,6 +590,11 @@ public class TestCommonsCompressDecoderPlugin
588
590
  return decompressConcatenated;
589
591
  }
590
592
 
593
+ @Override
594
+ public String getMatchName() {
595
+ return matchName;
596
+ }
597
+
591
598
  @Override
592
599
  public BufferAllocator getBufferAllocator() {
593
600
  return newBufferAllocator();
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-decoder-commons-compress
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hata
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-02 00:00:00.000000000 Z
11
+ date: 2016-05-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -113,7 +113,7 @@ files:
113
113
  - src/test/resources/org/embulk/decoder/samples.tgz
114
114
  - src/test/resources/org/embulk/decoder/samples.zip
115
115
  - classpath/commons-compress-1.9.jar
116
- - classpath/embulk-decoder-commons-compress-0.3.3.jar
116
+ - classpath/embulk-decoder-commons-compress-0.4.0.jar
117
117
  homepage: https://github.com/hata/embulk-decoder-commons-compress
118
118
  licenses:
119
119
  - MIT