embulk-decoder-commons-compress 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -1
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/decoder/ArchiveInputStreamIterator.java +19 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java +4 -0
- data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +5 -1
- data/src/test/java/org/embulk/decoder/TestArchiveInputStreamIterator.java +15 -0
- data/src/test/java/org/embulk/decoder/TestCommonsCompressDecoderPlugin.java +7 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 991a486a32c2dd2bfddf31f937d04e612870ddc5
|
4
|
+
data.tar.gz: ae04456e55be753f049dd6343995e22873a1a4e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2a6e171d9d98884df6bb4eddafd2d0ad07de835d66a2756926cd919be8bdb84926ca0982e94aeac5c2279406d4f6423bcfcde45daabf13365f262ca7031830f
|
7
|
+
data.tar.gz: a0031b1f6c5279cdd2c49fb098eef837e8f9aeac3f766fd318d190a87f4a708e7b509c846be0efe0a09eeadde7e627e312f49832124caa019047b94db0476d6b
|
data/README.md
CHANGED
@@ -18,6 +18,7 @@ This decoder plugin for Embulk supports various archive formats using [Apache Co
|
|
18
18
|
- Auto detect is used when there is no configuration. This can be used for a single format. If a file format uses solid compression like tar.gz, please set the format config explicitly.
|
19
19
|
- Some formats listed in [Apache Commons Compress](http://commons.apache.org/proper/commons-compress/) may not work in your environment. I have confirmed that the following formats work well. Your environment may be able to use other formats listed on the site.
|
20
20
|
- **decompress_concatenated**: gzip, bzip2, and xz formats support multiple concatenated streams. The default value of this parameter is true. If you want to disable it, then set to false. See [CompressorStreamFactory.setDecompressConcatenated()](https://commons.apache.org/proper/commons-compress/apidocs/org/apache/commons/compress/compressors/CompressorStreamFactory.html#setDecompressConcatenated(boolean)) in ver.1.9 for more details.
|
21
|
+
- **match_name**: Only the files in an archive whose names match match_name are processed. match_name is a regular expression, and the whole entry name must match it. The default is an empty string, which processes every file.
|
21
22
|
|
22
23
|
## Formats
|
23
24
|
|
@@ -62,7 +63,7 @@ in:
|
|
62
63
|
format: tgz
|
63
64
|
```
|
64
65
|
|
65
|
-
- Set decompress_concatenated to false if you would like to read
|
66
|
+
- Set decompress_concatenated to false if you would like to read the first concatenated gzip/bzip2 archive only.
|
66
67
|
|
67
68
|
```yaml
|
68
69
|
in:
|
@@ -72,6 +73,16 @@ in:
|
|
72
73
|
decompress_concatenated: false
|
73
74
|
```
|
74
75
|
|
76
|
+
- Set match_name to extract only the files whose suffix is '.csv' from an archive.
|
77
|
+
|
78
|
+
```yaml
|
79
|
+
in:
|
80
|
+
type: any input plugin type
|
81
|
+
decoders:
|
82
|
+
- type: commons-compress
|
83
|
+
match_name: ".*\\.csv"
|
84
|
+
```
|
85
|
+
|
75
86
|
|
76
87
|
|
77
88
|
## Build
|
data/build.gradle
CHANGED
@@ -10,6 +10,7 @@ import org.apache.commons.compress.archivers.ArchiveInputStream;
|
|
10
10
|
class ArchiveInputStreamIterator implements Iterator<InputStream> {
|
11
11
|
private ArchiveInputStream ain;
|
12
12
|
private ArchiveEntry entry;
|
13
|
+
private String matchRegex = "";
|
13
14
|
private boolean endOfArchive = false;
|
14
15
|
|
15
16
|
ArchiveInputStreamIterator(ArchiveInputStream ain)
|
@@ -17,6 +18,11 @@ class ArchiveInputStreamIterator implements Iterator<InputStream> {
|
|
17
18
|
this.ain = ain;
|
18
19
|
}
|
19
20
|
|
21
|
+
ArchiveInputStreamIterator(ArchiveInputStream ain, String matchRegex) {
|
22
|
+
this.ain = ain;
|
23
|
+
this.matchRegex = matchRegex;
|
24
|
+
}
|
25
|
+
|
20
26
|
@Override
|
21
27
|
public boolean hasNext() {
|
22
28
|
try {
|
@@ -60,9 +66,22 @@ class ArchiveInputStreamIterator implements Iterator<InputStream> {
|
|
60
66
|
return false;
|
61
67
|
} else if (entry.isDirectory()) {
|
62
68
|
continue;
|
69
|
+
} else if (!matchName(entry, matchRegex)){
|
70
|
+
continue;
|
63
71
|
} else {
|
64
72
|
return true;
|
65
73
|
}
|
66
74
|
}
|
67
75
|
}
|
76
|
+
|
77
|
+
private boolean matchName(ArchiveEntry entry, String regex) {
|
78
|
+
String name = entry.getName();
|
79
|
+
if(regex == null || regex.equals("")){
|
80
|
+
return true;
|
81
|
+
} else if(name == null) {
|
82
|
+
return false;
|
83
|
+
} else {
|
84
|
+
return name.matches(regex);
|
85
|
+
}
|
86
|
+
}
|
68
87
|
}
|
@@ -25,6 +25,10 @@ public class CommonsCompressDecoderPlugin
|
|
25
25
|
@ConfigDefault("true")
|
26
26
|
public boolean getDecompressConcatenated();
|
27
27
|
|
28
|
+
@Config("match_name")
|
29
|
+
@ConfigDefault("\"\"")
|
30
|
+
public String getMatchName();
|
31
|
+
|
28
32
|
@ConfigInject
|
29
33
|
public BufferAllocator getBufferAllocator();
|
30
34
|
}
|
@@ -27,6 +27,7 @@ class CommonsCompressProvider implements Provider {
|
|
27
27
|
private Iterator<InputStream> inputStreamIterator;
|
28
28
|
private String[] formats;
|
29
29
|
private final boolean decompressConcatenated;
|
30
|
+
private final String matchName;
|
30
31
|
|
31
32
|
CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
|
32
33
|
this.files = files;
|
@@ -40,6 +41,7 @@ class CommonsCompressProvider implements Provider {
|
|
40
41
|
}
|
41
42
|
this.decompressConcatenated = task == null
|
42
43
|
|| task.getDecompressConcatenated();
|
44
|
+
this.matchName = (task == null)? "" : task.getMatchName();
|
43
45
|
}
|
44
46
|
|
45
47
|
@Override
|
@@ -88,7 +90,9 @@ class CommonsCompressProvider implements Provider {
|
|
88
90
|
in = in.markSupported() ? in : new BufferedInputStream(in);
|
89
91
|
try {
|
90
92
|
return new ArchiveInputStreamIterator(
|
91
|
-
createArchiveInputStream(AUTO_DETECT_FORMAT, in)
|
93
|
+
createArchiveInputStream(AUTO_DETECT_FORMAT, in),
|
94
|
+
this.matchName
|
95
|
+
);
|
92
96
|
} catch (IOException | ArchiveException e) {
|
93
97
|
// ArchiveStreamFactory set mark and reset the stream.
|
94
98
|
// So, we can use the same stream to check compressor.
|
@@ -70,6 +70,21 @@ public class TestArchiveInputStreamIterator {
|
|
70
70
|
assertNull("Verify there is no stream.", it.next());
|
71
71
|
}
|
72
72
|
|
73
|
+
@Test
|
74
|
+
public void testHasNextForNameMatch(@Mocked final ArchiveInputStream ain, @Mocked final ArchiveEntry entry) throws Exception {
|
75
|
+
new NonStrictExpectations() {{
|
76
|
+
ain.getNextEntry(); result = entry; result = entry; result = entry; result = null;
|
77
|
+
entry.getName(); result = "first.csv"; result = "second.txt"; result = "third.csv";
|
78
|
+
}};
|
79
|
+
ArchiveInputStreamIterator it = new ArchiveInputStreamIterator(ain, ".*\\.csv");
|
80
|
+
assertTrue("Verify 1st file match", it.hasNext());
|
81
|
+
assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
|
82
|
+
assertTrue("Verify 3rd file match", it.hasNext());
|
83
|
+
assertEquals("Verify ArchiveInputStream is return.", (InputStream)ain, it.next());
|
84
|
+
assertFalse("Veryfy no more entry because second.txt is skipped.", it.hasNext());
|
85
|
+
assertNull("Verify there is no stream.", it.next());
|
86
|
+
}
|
87
|
+
|
73
88
|
@Test
|
74
89
|
public void testArchiveFile() throws Exception {
|
75
90
|
InputStream in = getClass().getResourceAsStream("samples.tar");
|
@@ -563,10 +563,12 @@ public class TestCommonsCompressDecoderPlugin
|
|
563
563
|
private class MockPluginTask implements CommonsCompressDecoderPlugin.PluginTask {
|
564
564
|
private final String format;
|
565
565
|
private final boolean decompressConcatenated;
|
566
|
+
private final String matchName;
|
566
567
|
|
567
568
|
MockPluginTask(String format) {
|
568
569
|
this.format = format;
|
569
570
|
this.decompressConcatenated = true;
|
571
|
+
this.matchName = "";
|
570
572
|
}
|
571
573
|
|
572
574
|
@Override
|
@@ -588,6 +590,11 @@ public class TestCommonsCompressDecoderPlugin
|
|
588
590
|
return decompressConcatenated;
|
589
591
|
}
|
590
592
|
|
593
|
+
@Override
|
594
|
+
public String getMatchName() {
|
595
|
+
return matchName;
|
596
|
+
}
|
597
|
+
|
591
598
|
@Override
|
592
599
|
public BufferAllocator getBufferAllocator() {
|
593
600
|
return newBufferAllocator();
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-decoder-commons-compress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hata
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -113,7 +113,7 @@ files:
|
|
113
113
|
- src/test/resources/org/embulk/decoder/samples.tgz
|
114
114
|
- src/test/resources/org/embulk/decoder/samples.zip
|
115
115
|
- classpath/commons-compress-1.9.jar
|
116
|
-
- classpath/embulk-decoder-commons-compress-0.
|
116
|
+
- classpath/embulk-decoder-commons-compress-0.4.0.jar
|
117
117
|
homepage: https://github.com/hata/embulk-decoder-commons-compress
|
118
118
|
licenses:
|
119
119
|
- MIT
|