embulk-decoder-unzip 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -37
- data/build.gradle +2 -1
- data/src/main/java/org/embulk/decoder/unzip/ArchiveInputStreamIterator.java +87 -0
- data/src/main/java/org/embulk/decoder/unzip/CommonsCompressFileInput.java +73 -0
- data/src/main/java/org/embulk/decoder/unzip/CommonsCompressProvider.java +190 -0
- data/src/main/java/org/embulk/decoder/unzip/CommonsCompressUtil.java +155 -0
- data/src/main/java/org/embulk/decoder/unzip/UnzipDecoderPlugin.java +16 -35
- data/src/main/resources/test3.zip +0 -0
- metadata +9 -4
- data/src/main/java/org/embulk/decoder/unzip/UnzipInputStream.java +0 -29
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 102affbd723ee4a9398356eadbd7fca47b480ad5
|
4
|
+
data.tar.gz: 6cdc51537a8b63b225d99af80593e33c038e550c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4df92adfe66b3aeec9ecd8bd4a422554f1992e127d7e7e17df057f21e5900c1397ba4b1bfd6e96e71b669ad3d4947c35636d961df04c3e06818a88034666d9ce
|
7
|
+
data.tar.gz: f832f84ee19656949144fd62b507da09d76a0d6f8b0f2f9af17ee2d52124b847e31065918a03588937e5a23153a902b4ed1620d3fb43a9293da3c0bbf99e92c8
|
data/README.md
CHANGED
@@ -1,38 +1,4 @@
|
|
1
|
-
|
1
|
+
Original code is copied from https://github.com/hata/embulk-decoder-commons-compress
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
## Overview
|
6
|
-
|
7
|
-
* **Plugin type**: decoder
|
8
|
-
* **Guess supported**: no
|
9
|
-
|
10
|
-
## Configuration
|
11
|
-
|
12
|
-
- **option1**: description (integer, required)
|
13
|
-
- **option2**: description (string, default: `"myvalue"`)
|
14
|
-
- **option3**: description (string, default: `null`)
|
15
|
-
|
16
|
-
## Example
|
17
|
-
|
18
|
-
```yaml
|
19
|
-
in:
|
20
|
-
type: any output input plugin type
|
21
|
-
decoders:
|
22
|
-
- type: unzip
|
23
|
-
option1: example1
|
24
|
-
option2: example2
|
25
|
-
```
|
26
|
-
|
27
|
-
(If guess supported) you don't have to write `decoder:` section in the configuration file. After writing `in:` section, you can let embulk guess `decoder:` section using this command:
|
28
|
-
|
29
|
-
```
|
30
|
-
$ embulk gem install embulk-decoder-unzip
|
31
|
-
$ embulk guess -g unzip config.yml -o guessed.yml
|
32
|
-
```
|
33
|
-
|
34
|
-
## Build
|
35
|
-
|
36
|
-
```
|
37
|
-
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
38
|
-
```
|
3
|
+
This is just an upgrade of commons-compress from 1.13 to 1.20. It is a solution to the unzip error.
|
4
|
+
The reason for this is to deal with unzip errors.
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.2"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.8
|
19
19
|
targetCompatibility = 1.8
|
@@ -21,6 +21,7 @@ targetCompatibility = 1.8
|
|
21
21
|
dependencies {
|
22
22
|
compile "org.embulk:embulk-core:0.9.23"
|
23
23
|
provided "org.embulk:embulk-core:0.9.23"
|
24
|
+
compile "org.apache.commons:commons-compress:1.20"
|
24
25
|
// compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
|
25
26
|
testCompile "junit:junit:4.+"
|
26
27
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
package org.embulk.decoder.unzip;
|
2
|
+
|
3
|
+
import java.io.IOException;
|
4
|
+
import java.io.InputStream;
|
5
|
+
import java.util.Iterator;
|
6
|
+
|
7
|
+
import org.apache.commons.compress.archivers.ArchiveEntry;
|
8
|
+
import org.apache.commons.compress.archivers.ArchiveInputStream;
|
9
|
+
|
10
|
+
class ArchiveInputStreamIterator implements Iterator<InputStream> {
|
11
|
+
private ArchiveInputStream ain;
|
12
|
+
private ArchiveEntry entry;
|
13
|
+
private String matchRegex = "";
|
14
|
+
private boolean endOfArchive = false;
|
15
|
+
|
16
|
+
ArchiveInputStreamIterator(ArchiveInputStream ain)
|
17
|
+
{
|
18
|
+
this.ain = ain;
|
19
|
+
}
|
20
|
+
|
21
|
+
ArchiveInputStreamIterator(ArchiveInputStream ain, String matchRegex) {
|
22
|
+
this.ain = ain;
|
23
|
+
this.matchRegex = matchRegex;
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
public boolean hasNext() {
|
28
|
+
try {
|
29
|
+
return checkNext();
|
30
|
+
} catch (IOException e) {
|
31
|
+
throw new RuntimeException(e);
|
32
|
+
}
|
33
|
+
}
|
34
|
+
|
35
|
+
@Override
|
36
|
+
public InputStream next() {
|
37
|
+
try {
|
38
|
+
if (checkNext()) {
|
39
|
+
entry = null;
|
40
|
+
} else {
|
41
|
+
return null;
|
42
|
+
}
|
43
|
+
|
44
|
+
return ain;
|
45
|
+
} catch (IOException e) {
|
46
|
+
throw new RuntimeException(e);
|
47
|
+
}
|
48
|
+
}
|
49
|
+
|
50
|
+
@Override
|
51
|
+
public void remove() {
|
52
|
+
throw new UnsupportedOperationException();
|
53
|
+
}
|
54
|
+
|
55
|
+
private boolean checkNext() throws IOException {
|
56
|
+
if (endOfArchive) {
|
57
|
+
return false;
|
58
|
+
} else if (entry != null) {
|
59
|
+
return true;
|
60
|
+
}
|
61
|
+
|
62
|
+
while (true) {
|
63
|
+
entry = ain.getNextEntry();
|
64
|
+
if (entry == null) {
|
65
|
+
endOfArchive = true;
|
66
|
+
return false;
|
67
|
+
} else if (entry.isDirectory()) {
|
68
|
+
continue;
|
69
|
+
} else if (!matchName(entry, matchRegex)){
|
70
|
+
continue;
|
71
|
+
} else {
|
72
|
+
return true;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
private boolean matchName(ArchiveEntry entry, String regex) {
|
78
|
+
String name = entry.getName();
|
79
|
+
if(regex == null || regex.equals("")){
|
80
|
+
return true;
|
81
|
+
} else if(name == null) {
|
82
|
+
return false;
|
83
|
+
} else {
|
84
|
+
return name.matches(regex);
|
85
|
+
}
|
86
|
+
}
|
87
|
+
}
|
@@ -0,0 +1,73 @@
|
|
1
|
+
package org.embulk.decoder.unzip;
|
2
|
+
|
3
|
+
import java.io.IOException;
|
4
|
+
import java.io.InputStream;
|
5
|
+
|
6
|
+
import org.embulk.spi.Buffer;
|
7
|
+
import org.embulk.spi.BufferAllocator;
|
8
|
+
import org.embulk.spi.FileInput;
|
9
|
+
import org.embulk.spi.util.InputStreamFileInput.Provider;
|
10
|
+
|
11
|
+
|
12
|
+
class CommonsCompressFileInput implements FileInput
|
13
|
+
{
|
14
|
+
private final BufferAllocator allocator;
|
15
|
+
private final Provider provider;
|
16
|
+
private InputStream current;
|
17
|
+
|
18
|
+
public CommonsCompressFileInput(BufferAllocator allocator, Provider provider)
|
19
|
+
{
|
20
|
+
this.allocator = allocator;
|
21
|
+
this.provider = provider;
|
22
|
+
this.current = null;
|
23
|
+
}
|
24
|
+
|
25
|
+
@Override
|
26
|
+
public Buffer poll()
|
27
|
+
{
|
28
|
+
if (current == null) {
|
29
|
+
throw new IllegalStateException("nextFile() must be called before poll()");
|
30
|
+
}
|
31
|
+
Buffer buffer = allocator.allocate();
|
32
|
+
try {
|
33
|
+
int n = current.read(buffer.array(), buffer.offset(), buffer.capacity());
|
34
|
+
if (n < 0) {
|
35
|
+
return null;
|
36
|
+
}
|
37
|
+
buffer.limit(n);
|
38
|
+
Buffer b = buffer;
|
39
|
+
buffer = null;
|
40
|
+
return b;
|
41
|
+
} catch (IOException ex) {
|
42
|
+
throw new RuntimeException(ex);
|
43
|
+
} finally {
|
44
|
+
if (buffer != null) {
|
45
|
+
buffer.release();
|
46
|
+
buffer = null;
|
47
|
+
}
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
@Override
|
52
|
+
public boolean nextFile()
|
53
|
+
{
|
54
|
+
try {
|
55
|
+
// NOTE: DO NOT close current because this stream may
|
56
|
+
// be one of a file in an archive. Provider manage it.
|
57
|
+
current = provider.openNext();
|
58
|
+
return current != null;
|
59
|
+
} catch (IOException ex) {
|
60
|
+
throw new RuntimeException(ex);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
@Override
|
65
|
+
public void close()
|
66
|
+
{
|
67
|
+
try {
|
68
|
+
provider.close();
|
69
|
+
} catch (IOException ex) {
|
70
|
+
throw new RuntimeException(ex);
|
71
|
+
}
|
72
|
+
}
|
73
|
+
}
|
@@ -0,0 +1,190 @@
|
|
1
|
+
package org.embulk.decoder.unzip;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.io.InputStream;
|
6
|
+
import java.util.ArrayList;
|
7
|
+
import java.util.Iterator;
|
8
|
+
import java.util.List;
|
9
|
+
|
10
|
+
import org.apache.commons.compress.archivers.ArchiveException;
|
11
|
+
import org.apache.commons.compress.archivers.ArchiveInputStream;
|
12
|
+
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
|
13
|
+
import org.apache.commons.compress.compressors.CompressorException;
|
14
|
+
import org.apache.commons.compress.compressors.CompressorInputStream;
|
15
|
+
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
16
|
+
import org.embulk.decoder.unzip.UnzipDecoderPlugin.PluginTask;
|
17
|
+
import org.embulk.spi.util.FileInputInputStream;
|
18
|
+
import org.embulk.spi.util.InputStreamFileInput.Provider;
|
19
|
+
|
20
|
+
class CommonsCompressProvider implements Provider {
|
21
|
+
private static final String AUTO_DETECT_FORMAT = "";
|
22
|
+
|
23
|
+
private final FileInputInputStream files;
|
24
|
+
private final boolean formatAutoDetection;
|
25
|
+
private Iterator<InputStream> inputStreamIterator;
|
26
|
+
private String[] formats;
|
27
|
+
private final boolean decompressConcatenated;
|
28
|
+
private final String matchName;
|
29
|
+
|
30
|
+
CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
|
31
|
+
this.files = files;
|
32
|
+
this.formatAutoDetection = task == null
|
33
|
+
|| CommonsCompressUtil.isAutoDetect(task.getFormat());
|
34
|
+
if (!this.formatAutoDetection) {
|
35
|
+
formats = CommonsCompressUtil.toFormats(task.getFormat());
|
36
|
+
if (formats == null) {
|
37
|
+
throw new RuntimeException("Failed to get a format.");
|
38
|
+
}
|
39
|
+
}
|
40
|
+
this.decompressConcatenated = task == null
|
41
|
+
|| task.getDecompressConcatenated();
|
42
|
+
this.matchName = (task == null)? "" : task.getMatchName();
|
43
|
+
}
|
44
|
+
|
45
|
+
@Override
|
46
|
+
public InputStream openNext() throws IOException {
|
47
|
+
while (true) {
|
48
|
+
if (inputStreamIterator == null) {
|
49
|
+
if (!files.nextFile()) {
|
50
|
+
return null;
|
51
|
+
}
|
52
|
+
inputStreamIterator = formatAutoDetection ? createInputStreamIterator(files)
|
53
|
+
: createInputStreamIterator(formats, 0, files);
|
54
|
+
} else {
|
55
|
+
if (inputStreamIterator.hasNext()) {
|
56
|
+
InputStream in = inputStreamIterator.next();
|
57
|
+
if (in == null) {
|
58
|
+
inputStreamIterator = null;
|
59
|
+
} else {
|
60
|
+
return in;
|
61
|
+
}
|
62
|
+
} else {
|
63
|
+
inputStreamIterator = null;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
@Override
|
70
|
+
public void close() throws IOException {
|
71
|
+
inputStreamIterator = null;
|
72
|
+
if (files != null) {
|
73
|
+
files.close();
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
boolean isFormatAutoDetection() {
|
78
|
+
return formatAutoDetection;
|
79
|
+
}
|
80
|
+
|
81
|
+
String[] getFormats() {
|
82
|
+
return formats;
|
83
|
+
}
|
84
|
+
|
85
|
+
Iterator<InputStream> createInputStreamIterator(InputStream in)
|
86
|
+
throws IOException {
|
87
|
+
// It is required to support mark to detect a file format.
|
88
|
+
in = in.markSupported() ? in : new BufferedInputStream(in);
|
89
|
+
try {
|
90
|
+
return new ArchiveInputStreamIterator(
|
91
|
+
createArchiveInputStream(AUTO_DETECT_FORMAT, in),
|
92
|
+
this.matchName
|
93
|
+
);
|
94
|
+
} catch (IOException | ArchiveException e) {
|
95
|
+
// ArchiveStreamFactory set mark and reset the stream.
|
96
|
+
// So, we can use the same stream to check compressor.
|
97
|
+
try {
|
98
|
+
return toIterator(createCompressorInputStream(AUTO_DETECT_FORMAT, in));
|
99
|
+
} catch (CompressorException e2) {
|
100
|
+
throw new IOException("Failed to detect a file format.", e2);
|
101
|
+
}
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
105
|
+
/**
|
106
|
+
* Create iterator to list InputStream for each archived/compressed file.
|
107
|
+
*
|
108
|
+
* This can handle like the following formats:
|
109
|
+
* 1 archived format which defined in ArchiveStreamFactory(e.g. tar)
|
110
|
+
* 1 archived format and 1 compressor format defined in CompressorStreamFactory.(e.g. tar.bz2)
|
111
|
+
* 1 compressor format defined in CompressorStreamFactory.(e.g. bz2)
|
112
|
+
* (Actually, compressor formats can use two or more times in this code.
|
113
|
+
* But it is not common case.)
|
114
|
+
*/
|
115
|
+
Iterator<InputStream> createInputStreamIterator(String[] inputFormats,
|
116
|
+
int pos, InputStream in) throws IOException {
|
117
|
+
if (pos >= inputFormats.length) {
|
118
|
+
return toIterator(in);
|
119
|
+
}
|
120
|
+
|
121
|
+
try {
|
122
|
+
String format = inputFormats[pos];
|
123
|
+
if (CommonsCompressUtil.isArchiveFormat(format)) {
|
124
|
+
return new ArchiveInputStreamIterator(
|
125
|
+
createArchiveInputStream(format, in));
|
126
|
+
} else if (CommonsCompressUtil.isCompressorFormat(format)) {
|
127
|
+
return createInputStreamIterator(inputFormats, pos + 1,
|
128
|
+
createCompressorInputStream(format, in));
|
129
|
+
}
|
130
|
+
throw new IOException("Unsupported format is configured. format:"
|
131
|
+
+ format);
|
132
|
+
} catch (ArchiveException | CompressorException e) {
|
133
|
+
throw new IOException(e);
|
134
|
+
}
|
135
|
+
}
|
136
|
+
|
137
|
+
/**
|
138
|
+
* Create a new ArchiveInputStream to read an archive file based on a format
|
139
|
+
* parameter.
|
140
|
+
*
|
141
|
+
* If format is not set, this method tries to detect file format
|
142
|
+
* automatically. In this case, BufferedInputStream is used to wrap
|
143
|
+
* FileInputInputStream instance. BufferedInputStream may read a data
|
144
|
+
* partially when calling files.nextFile(). However, it doesn't matter
|
145
|
+
* because the partial read data should be discarded. And then this method
|
146
|
+
* is called again to create a new ArchiveInputStream.
|
147
|
+
*
|
148
|
+
* @return a new ArchiveInputStream instance.
|
149
|
+
*/
|
150
|
+
ArchiveInputStream createArchiveInputStream(String format, InputStream in)
|
151
|
+
throws IOException, ArchiveException {
|
152
|
+
ArchiveStreamFactory factory = new ArchiveStreamFactory();
|
153
|
+
if (CommonsCompressUtil.isAutoDetect(format)) {
|
154
|
+
in = in.markSupported() ? in : new BufferedInputStream(in);
|
155
|
+
try {
|
156
|
+
return factory.createArchiveInputStream(in);
|
157
|
+
} catch (ArchiveException e) {
|
158
|
+
throw new IOException(
|
159
|
+
"Failed to detect a file format. Please try to set a format explicitly.",
|
160
|
+
e);
|
161
|
+
}
|
162
|
+
} else {
|
163
|
+
return factory.createArchiveInputStream(format, in);
|
164
|
+
}
|
165
|
+
}
|
166
|
+
|
167
|
+
CompressorInputStream createCompressorInputStream(String format,
|
168
|
+
InputStream in) throws IOException, CompressorException {
|
169
|
+
CompressorStreamFactory factory = new CompressorStreamFactory();
|
170
|
+
factory.setDecompressConcatenated(decompressConcatenated);
|
171
|
+
if (CommonsCompressUtil.isAutoDetect(format)) {
|
172
|
+
in = in.markSupported() ? in : new BufferedInputStream(in);
|
173
|
+
try {
|
174
|
+
return factory.createCompressorInputStream(in);
|
175
|
+
} catch (CompressorException e) {
|
176
|
+
throw new IOException(
|
177
|
+
"Failed to detect a file format. Please try to set a format explicitly.",
|
178
|
+
e);
|
179
|
+
}
|
180
|
+
} else {
|
181
|
+
return factory.createCompressorInputStream(format, in);
|
182
|
+
}
|
183
|
+
}
|
184
|
+
|
185
|
+
private Iterator<InputStream> toIterator(InputStream in) {
|
186
|
+
List<InputStream> list = new ArrayList<InputStream>(1);
|
187
|
+
list.add(in);
|
188
|
+
return list.iterator();
|
189
|
+
}
|
190
|
+
}
|
@@ -0,0 +1,155 @@
|
|
1
|
+
package org.embulk.decoder.unzip;
|
2
|
+
|
3
|
+
import java.util.ArrayList;
|
4
|
+
import java.util.Collections;
|
5
|
+
import java.util.List;
|
6
|
+
|
7
|
+
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
|
8
|
+
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
9
|
+
|
10
|
+
class CommonsCompressUtil {
|
11
|
+
// TODO: It may be better to check performance between Set and array.
|
12
|
+
// NOTE: Some file types may not work in an environment because some required
|
13
|
+
// libraries are not found.
|
14
|
+
static final String[] archiveFormats = {
|
15
|
+
ArchiveStreamFactory.AR,
|
16
|
+
ArchiveStreamFactory.ARJ,
|
17
|
+
ArchiveStreamFactory.CPIO,
|
18
|
+
ArchiveStreamFactory.DUMP,
|
19
|
+
ArchiveStreamFactory.JAR,
|
20
|
+
ArchiveStreamFactory.SEVEN_Z,
|
21
|
+
ArchiveStreamFactory.TAR,
|
22
|
+
ArchiveStreamFactory.ZIP,
|
23
|
+
};
|
24
|
+
|
25
|
+
// Even indexes have both extensions and aliases. And odd indexes are
|
26
|
+
// CompressorStreamFactory values.
|
27
|
+
static final String[] compressorFormats = {
|
28
|
+
CompressorStreamFactory.BZIP2,
|
29
|
+
CompressorStreamFactory.DEFLATE,
|
30
|
+
CompressorStreamFactory.GZIP,
|
31
|
+
CompressorStreamFactory.LZMA,
|
32
|
+
CompressorStreamFactory.PACK200,
|
33
|
+
CompressorStreamFactory.SNAPPY_FRAMED,
|
34
|
+
CompressorStreamFactory.SNAPPY_RAW,
|
35
|
+
CompressorStreamFactory.XZ,
|
36
|
+
CompressorStreamFactory.Z,
|
37
|
+
"bz2", // These values should be handled by normalizeFormats
|
38
|
+
"gzip",
|
39
|
+
};
|
40
|
+
|
41
|
+
// This table is even indexes have short extensions and odd indexes has
|
42
|
+
// split formats for each short extensions.
|
43
|
+
private static final String[] solidCompressionFormats = {
|
44
|
+
"tgz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.GZIP,
|
45
|
+
"tar.gz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.GZIP,
|
46
|
+
"tbz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
|
47
|
+
"tbz2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
|
48
|
+
"tb2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
|
49
|
+
"tar.bz2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
|
50
|
+
"taz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
|
51
|
+
"tz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
|
52
|
+
"tar.Z", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
|
53
|
+
"tlz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
|
54
|
+
"tar.lz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
|
55
|
+
"tar.lzma", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
|
56
|
+
"txz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.XZ,
|
57
|
+
"tar.xz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.XZ
|
58
|
+
};
|
59
|
+
|
60
|
+
static boolean isArchiveFormat(String format) {
|
61
|
+
for (String fmt : archiveFormats) {
|
62
|
+
if (fmt.equalsIgnoreCase(format)) {
|
63
|
+
return true;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
return false;
|
67
|
+
}
|
68
|
+
|
69
|
+
static boolean isCompressorFormat(String format) {
|
70
|
+
for (String fmt : compressorFormats) {
|
71
|
+
if (fmt.equalsIgnoreCase(format)) {
|
72
|
+
return true;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
return false;
|
76
|
+
}
|
77
|
+
|
78
|
+
static boolean isAutoDetect(String format) {
|
79
|
+
return format == null || format.length() == 0;
|
80
|
+
}
|
81
|
+
|
82
|
+
/**
|
83
|
+
* Split solid compresson formats and reorder to decode the formats
|
84
|
+
* based on this order.
|
85
|
+
*
|
86
|
+
* If format is a single format like "tar", then return
|
87
|
+
* new String[]{"tar"}.
|
88
|
+
* If format is a solid compresson format like "tgz", then return
|
89
|
+
* new String[]{"gzip", "tar"}.
|
90
|
+
* If format is "tar bzip2", then return
|
91
|
+
* new String[]{"bzip2", "tar"}.
|
92
|
+
*
|
93
|
+
* @param format contains a file format or some file formats.
|
94
|
+
* @return a single format or multi format values.
|
95
|
+
* Otherwise, returns null.
|
96
|
+
*/
|
97
|
+
static String[] toFormats(String format) {
|
98
|
+
if (isAutoDetect(format)) {
|
99
|
+
return null;
|
100
|
+
} else if (isArchiveFormat(format) || isCompressorFormat(format)) {
|
101
|
+
return normalizeFormats(splitAndReverse(format));
|
102
|
+
}
|
103
|
+
|
104
|
+
String[] formats = toSolidCompressionFormats(format);
|
105
|
+
if (formats != null) {
|
106
|
+
return formats;
|
107
|
+
}
|
108
|
+
|
109
|
+
formats = normalizeFormats(splitAndReverse(format));
|
110
|
+
|
111
|
+
for (String s : formats) {
|
112
|
+
if (!(isArchiveFormat(s) || isCompressorFormat(s))) {
|
113
|
+
return null;
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
117
|
+
return formats;
|
118
|
+
}
|
119
|
+
|
120
|
+
private static String[] toSolidCompressionFormats(String format) {
|
121
|
+
for (int i = 0;i < solidCompressionFormats.length; i+= 2) {
|
122
|
+
if (solidCompressionFormats[i].equalsIgnoreCase(format)) {
|
123
|
+
return splitAndReverse(solidCompressionFormats[i + 1]);
|
124
|
+
}
|
125
|
+
}
|
126
|
+
return null;
|
127
|
+
}
|
128
|
+
|
129
|
+
private static String[] splitAndReverse(String format) {
|
130
|
+
List<String> result = new ArrayList<>();
|
131
|
+
for (String s : format.split(" ")) {
|
132
|
+
if (s.length() > 0) {
|
133
|
+
result.add(s);
|
134
|
+
}
|
135
|
+
}
|
136
|
+
Collections.reverse(result);
|
137
|
+
return result.toArray(new String[result.size()]);
|
138
|
+
}
|
139
|
+
|
140
|
+
private static String[] normalizeFormats(String... formats) {
|
141
|
+
if (formats == null || formats.length == 0) {
|
142
|
+
return formats;
|
143
|
+
}
|
144
|
+
|
145
|
+
for (int i = 0;i < formats.length;i++) {
|
146
|
+
if (formats[i].equalsIgnoreCase("gzip")) {
|
147
|
+
formats[i] = CompressorStreamFactory.GZIP;
|
148
|
+
} else if (formats[i].equalsIgnoreCase("bz2")) {
|
149
|
+
formats[i] = CompressorStreamFactory.BZIP2;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
|
153
|
+
return formats;
|
154
|
+
}
|
155
|
+
}
|
@@ -1,8 +1,5 @@
|
|
1
1
|
package org.embulk.decoder.unzip;
|
2
2
|
|
3
|
-
import java.io.InputStream;
|
4
|
-
import java.io.IOException;
|
5
|
-
|
6
3
|
import org.embulk.config.Config;
|
7
4
|
import org.embulk.config.ConfigDefault;
|
8
5
|
import org.embulk.config.ConfigInject;
|
@@ -13,7 +10,6 @@ import org.embulk.spi.BufferAllocator;
|
|
13
10
|
import org.embulk.spi.DecoderPlugin;
|
14
11
|
import org.embulk.spi.FileInput;
|
15
12
|
import org.embulk.spi.util.FileInputInputStream;
|
16
|
-
import org.embulk.spi.util.InputStreamFileInput;
|
17
13
|
|
18
14
|
public class UnzipDecoderPlugin
|
19
15
|
implements DecoderPlugin
|
@@ -21,6 +17,18 @@ public class UnzipDecoderPlugin
|
|
21
17
|
public interface PluginTask
|
22
18
|
extends Task
|
23
19
|
{
|
20
|
+
@Config("format")
|
21
|
+
@ConfigDefault("\"\"")
|
22
|
+
public String getFormat();
|
23
|
+
|
24
|
+
@Config("decompress_concatenated")
|
25
|
+
@ConfigDefault("true")
|
26
|
+
public boolean getDecompressConcatenated();
|
27
|
+
|
28
|
+
@Config("match_name")
|
29
|
+
@ConfigDefault("\"\"")
|
30
|
+
public String getMatchName();
|
31
|
+
|
24
32
|
// @Config("skip_on_error")
|
25
33
|
// @ConfigDefault("true")
|
26
34
|
// public boolean skipOnError();
|
@@ -40,40 +48,13 @@ public class UnzipDecoderPlugin
|
|
40
48
|
@Override
|
41
49
|
public FileInput open(TaskSource taskSource, FileInput fileInput)
|
42
50
|
{
|
51
|
+
String zipFileName = fileInput.hintOfCurrentInputFileNameForLogging().get();
|
52
|
+
System.out.println(zipFileName);
|
43
53
|
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
44
54
|
|
45
55
|
final FileInputInputStream files = new FileInputInputStream(fileInput);
|
46
|
-
|
47
|
-
|
48
|
-
try {
|
49
|
-
isfi = new InputStreamFileInput(
|
50
|
-
task.getBufferAllocator(),
|
51
|
-
new InputStreamFileInput.Provider() {
|
52
|
-
public InputStream openNext() throws IOException
|
53
|
-
{
|
54
|
-
if (!files.nextFile()) {
|
55
|
-
return null;
|
56
|
-
}
|
57
|
-
return newDecoderInputStream(task, files);
|
58
|
-
}
|
59
|
-
|
60
|
-
public void close() throws IOException
|
61
|
-
{
|
62
|
-
files.close();
|
63
|
-
}
|
64
|
-
});
|
65
|
-
} catch (Exception e) {
|
66
|
-
// if(task.skipOnError()) {
|
67
|
-
// System.out.println("skip: " + isfi.hintOfCurrentInputFileNameForLogging());
|
68
|
-
// return null;
|
69
|
-
// } else
|
70
|
-
throw new RuntimeException(e);
|
71
|
-
}
|
72
|
-
return isfi;
|
56
|
+
return new CommonsCompressFileInput(task.getBufferAllocator(),
|
57
|
+
new CommonsCompressProvider(task, files));
|
73
58
|
}
|
74
59
|
|
75
|
-
private static InputStream newDecoderInputStream(PluginTask task, InputStream file) throws IOException
|
76
|
-
{
|
77
|
-
return new UnzipInputStream(file);
|
78
|
-
}
|
79
60
|
}
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-decoder-unzip
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- fundoshi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -49,7 +49,8 @@ files:
|
|
49
49
|
- LICENSE.txt
|
50
50
|
- README.md
|
51
51
|
- build.gradle
|
52
|
-
- classpath/
|
52
|
+
- classpath/commons-compress-1.20.jar
|
53
|
+
- classpath/embulk-decoder-unzip-0.1.2.jar
|
53
54
|
- config/checkstyle/checkstyle.xml
|
54
55
|
- config/checkstyle/default.xml
|
55
56
|
- gradle/wrapper/gradle-wrapper.jar
|
@@ -58,12 +59,16 @@ files:
|
|
58
59
|
- gradlew.bat
|
59
60
|
- lib/embulk/decoder/unzip.rb
|
60
61
|
- lib/embulk/guess/unzip.rb
|
62
|
+
- src/main/java/org/embulk/decoder/unzip/ArchiveInputStreamIterator.java
|
63
|
+
- src/main/java/org/embulk/decoder/unzip/CommonsCompressFileInput.java
|
64
|
+
- src/main/java/org/embulk/decoder/unzip/CommonsCompressProvider.java
|
65
|
+
- src/main/java/org/embulk/decoder/unzip/CommonsCompressUtil.java
|
61
66
|
- src/main/java/org/embulk/decoder/unzip/UnzipDecoderPlugin.java
|
62
|
-
- src/main/java/org/embulk/decoder/unzip/UnzipInputStream.java
|
63
67
|
- src/main/resources/config.yml
|
64
68
|
- src/main/resources/sample.csv
|
65
69
|
- src/main/resources/test1.zip
|
66
70
|
- src/main/resources/test2.zip
|
71
|
+
- src/main/resources/test3.zip
|
67
72
|
- src/test/java/org/embulk/decoder/unzip/TestUnzipDecoderPlugin.java
|
68
73
|
homepage:
|
69
74
|
licenses:
|
@@ -1,29 +0,0 @@
|
|
1
|
-
package org.embulk.decoder.unzip;
|
2
|
-
|
3
|
-
import java.io.BufferedInputStream;
|
4
|
-
import java.io.IOException;
|
5
|
-
import java.io.InputStream;
|
6
|
-
import java.nio.charset.StandardCharsets;
|
7
|
-
import java.util.zip.ZipEntry;
|
8
|
-
import java.util.zip.ZipInputStream;
|
9
|
-
|
10
|
-
public class UnzipInputStream extends InputStream {
|
11
|
-
|
12
|
-
private ZipInputStream zis;
|
13
|
-
|
14
|
-
public UnzipInputStream(InputStream is) {
|
15
|
-
zis = new ZipInputStream(new BufferedInputStream(is), StandardCharsets.UTF_8);
|
16
|
-
}
|
17
|
-
|
18
|
-
@Override
|
19
|
-
public int read() throws IOException {
|
20
|
-
|
21
|
-
ZipEntry zipentry = zis.getNextEntry();
|
22
|
-
int v = -1;
|
23
|
-
if(zipentry != null)
|
24
|
-
v = zis.read();
|
25
|
-
return v;
|
26
|
-
|
27
|
-
}
|
28
|
-
|
29
|
-
}
|