embulk-decoder-commons-compress 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +7 -0
  2. data/.gitignore +7 -0
  3. data/LICENSE.txt +23 -0
  4. data/README.md +70 -0
  5. data/build.gradle +63 -0
  6. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  7. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  8. data/gradlew +164 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/decoder/commons-compress.rb +3 -0
  11. data/src/main/java/org/embulk/decoder/ArchiveInputStreamIterator.java +68 -0
  12. data/src/main/java/org/embulk/decoder/CommonsCompressDecoderPlugin.java +52 -0
  13. data/src/main/java/org/embulk/decoder/CommonsCompressFileInput.java +73 -0
  14. data/src/main/java/org/embulk/decoder/CommonsCompressProvider.java +182 -0
  15. data/src/main/java/org/embulk/decoder/CommonsCompressUtil.java +135 -0
  16. data/src/test/java/org/embulk/decoder/TestArchiveInputStreamIterator.java +106 -0
  17. data/src/test/java/org/embulk/decoder/TestCommonsCompressDecoderPlugin.java +593 -0
  18. data/src/test/java/org/embulk/decoder/TestCommonsCompressFileInput.java +152 -0
  19. data/src/test/java/org/embulk/decoder/TestCommonsCompressProvider.java +369 -0
  20. data/src/test/java/org/embulk/decoder/TestCommonsCompressUtil.java +80 -0
  21. data/src/test/resources/org/embulk/decoder/sample_0.tar +0 -0
  22. data/src/test/resources/org/embulk/decoder/sample_1.csv +1 -0
  23. data/src/test/resources/org/embulk/decoder/sample_1.csv.bz2 +0 -0
  24. data/src/test/resources/org/embulk/decoder/sample_1.tar +0 -0
  25. data/src/test/resources/org/embulk/decoder/sample_2.csv +1 -0
  26. data/src/test/resources/org/embulk/decoder/samples.ar +5 -0
  27. data/src/test/resources/org/embulk/decoder/samples.tar +0 -0
  28. data/src/test/resources/org/embulk/decoder/samples.tar.Z +0 -0
  29. data/src/test/resources/org/embulk/decoder/samples.tar.bz2 +0 -0
  30. data/src/test/resources/org/embulk/decoder/samples.tar.gz +0 -0
  31. data/src/test/resources/org/embulk/decoder/samples.tar.xz +0 -0
  32. data/src/test/resources/org/embulk/decoder/samples.tgz +0 -0
  33. data/src/test/resources/org/embulk/decoder/samples.zip +0 -0
  34. metadata +106 -0
@@ -0,0 +1,68 @@
1
+ package org.embulk.decoder;
2
+
3
+ import java.io.IOException;
4
+ import java.io.InputStream;
5
+ import java.util.Iterator;
6
+
7
+ import org.apache.commons.compress.archivers.ArchiveEntry;
8
+ import org.apache.commons.compress.archivers.ArchiveInputStream;
9
+
10
+ class ArchiveInputStreamIterator implements Iterator<InputStream> {
11
+ private ArchiveInputStream ain;
12
+ private ArchiveEntry entry;
13
+ private boolean endOfArchive = false;
14
+
15
+ ArchiveInputStreamIterator(ArchiveInputStream ain)
16
+ {
17
+ this.ain = ain;
18
+ }
19
+
20
+ @Override
21
+ public boolean hasNext() {
22
+ try {
23
+ return checkNext();
24
+ } catch (IOException e) {
25
+ throw new RuntimeException(e);
26
+ }
27
+ }
28
+
29
+ @Override
30
+ public InputStream next() {
31
+ try {
32
+ if (checkNext()) {
33
+ entry = null;
34
+ } else {
35
+ return null;
36
+ }
37
+
38
+ return ain;
39
+ } catch (IOException e) {
40
+ throw new RuntimeException(e);
41
+ }
42
+ }
43
+
44
+ @Override
45
+ public void remove() {
46
+ throw new UnsupportedOperationException();
47
+ }
48
+
49
+ private boolean checkNext() throws IOException {
50
+ if (endOfArchive) {
51
+ return false;
52
+ } else if (entry != null) {
53
+ return true;
54
+ }
55
+
56
+ while (true) {
57
+ entry = ain.getNextEntry();
58
+ if (entry == null) {
59
+ endOfArchive = true;
60
+ return false;
61
+ } else if (entry.isDirectory()) {
62
+ continue;
63
+ } else {
64
+ return true;
65
+ }
66
+ }
67
+ }
68
+ }
@@ -0,0 +1,52 @@
1
+ package org.embulk.decoder;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.ConfigInject;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.BufferAllocator;
10
+ import org.embulk.spi.DecoderPlugin;
11
+ import org.embulk.spi.FileInput;
12
+ import org.embulk.spi.util.FileInputInputStream;
13
+
14
+ public class CommonsCompressDecoderPlugin
15
+ implements DecoderPlugin
16
+ {
17
+ public interface PluginTask
18
+ extends Task
19
+ {
20
+ @Config("format")
21
+ @ConfigDefault("\"\"")
22
+ public String getFormat();
23
+
24
+ @ConfigInject
25
+ public BufferAllocator getBufferAllocator();
26
+ }
27
+
28
+ @Override
29
+ public void transaction(ConfigSource config, DecoderPlugin.Control control)
30
+ {
31
+ PluginTask task = config.loadConfig(PluginTask.class);
32
+ control.run(task.dump());
33
+ }
34
+
35
+ @Override
36
+ public FileInput open(TaskSource taskSource, FileInput input)
37
+ {
38
+ PluginTask task = taskSource.loadTask(PluginTask.class);
39
+ return new CommonsCompressFileInput(
40
+ task.getBufferAllocator(),
41
+ new CommonsCompressProvider(task, new FileInputInputStream(input) {
42
+ // NOTE: This is workaround code to avoid hanging issue.
43
+ // This issue will be fixed after merging #112.
44
+ // https://github.com/embulk/embulk/pull/112
45
+ @Override
46
+ public long skip(long len) {
47
+ long skipped = super.skip(len);
48
+ return skipped > 0 ? skipped : 0;
49
+ }
50
+ }));
51
+ }
52
+ }
@@ -0,0 +1,73 @@
1
+ package org.embulk.decoder;
2
+
3
+ import java.io.IOException;
4
+ import java.io.InputStream;
5
+
6
+ import org.embulk.spi.Buffer;
7
+ import org.embulk.spi.BufferAllocator;
8
+ import org.embulk.spi.FileInput;
9
+ import org.embulk.spi.util.InputStreamFileInput.Provider;
10
+
11
+
12
+ class CommonsCompressFileInput implements FileInput
13
+ {
14
+ private final BufferAllocator allocator;
15
+ private final Provider provider;
16
+ private InputStream current;
17
+
18
+ public CommonsCompressFileInput(BufferAllocator allocator, Provider provider)
19
+ {
20
+ this.allocator = allocator;
21
+ this.provider = provider;
22
+ this.current = null;
23
+ }
24
+
25
+ @Override
26
+ public Buffer poll()
27
+ {
28
+ if (current == null) {
29
+ throw new IllegalStateException("nextFile() must be called before poll()");
30
+ }
31
+ Buffer buffer = allocator.allocate();
32
+ try {
33
+ int n = current.read(buffer.array(), buffer.offset(), buffer.capacity());
34
+ if (n < 0) {
35
+ return null;
36
+ }
37
+ buffer.limit(n);
38
+ Buffer b = buffer;
39
+ buffer = null;
40
+ return b;
41
+ } catch (IOException ex) {
42
+ throw new RuntimeException(ex);
43
+ } finally {
44
+ if (buffer != null) {
45
+ buffer.release();
46
+ buffer = null;
47
+ }
48
+ }
49
+ }
50
+
51
+ @Override
52
+ public boolean nextFile()
53
+ {
54
+ try {
55
+ // NOTE: DO NOT close current because this stream may
56
+ // be one of a file in an archive. Provider manage it.
57
+ current = provider.openNext();
58
+ return current != null;
59
+ } catch (IOException ex) {
60
+ throw new RuntimeException(ex);
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void close()
66
+ {
67
+ try {
68
+ provider.close();
69
+ } catch (IOException ex) {
70
+ throw new RuntimeException(ex);
71
+ }
72
+ }
73
+ }
@@ -0,0 +1,182 @@
1
+ package org.embulk.decoder;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.IOException;
5
+ import java.io.InputStream;
6
+ import java.util.ArrayList;
7
+ import java.util.Iterator;
8
+ import java.util.List;
9
+
10
+ import org.apache.commons.compress.archivers.ArchiveException;
11
+ import org.apache.commons.compress.archivers.ArchiveInputStream;
12
+ import org.apache.commons.compress.archivers.ArchiveStreamFactory;
13
+ import org.apache.commons.compress.compressors.CompressorException;
14
+ import org.apache.commons.compress.compressors.CompressorInputStream;
15
+ import org.apache.commons.compress.compressors.CompressorStreamFactory;
16
+ import org.embulk.decoder.CommonsCompressDecoderPlugin.PluginTask;
17
+ import org.embulk.spi.util.FileInputInputStream;
18
+ import org.embulk.spi.util.InputStreamFileInput.Provider;
19
+
20
+ class CommonsCompressProvider implements Provider {
21
+ private static final String AUTO_DETECT_FORMAT = "";
22
+
23
+ private final FileInputInputStream files;
24
+ private final boolean formatAutoDetection;
25
+ private Iterator<InputStream> inputStreamIterator;
26
+ private String[] formats;
27
+
28
+ CommonsCompressProvider(PluginTask task, FileInputInputStream files) {
29
+ this.files = files;
30
+ this.formatAutoDetection = task == null
31
+ || CommonsCompressUtil.isAutoDetect(task.getFormat());
32
+ if (!this.formatAutoDetection) {
33
+ formats = CommonsCompressUtil.toFormats(task.getFormat());
34
+ if (formats == null) {
35
+ throw new RuntimeException("Failed to get a format.");
36
+ }
37
+ }
38
+ }
39
+
40
+ @Override
41
+ public InputStream openNext() throws IOException {
42
+ while (true) {
43
+ if (inputStreamIterator == null) {
44
+ if (!files.nextFile()) {
45
+ return null;
46
+ }
47
+ inputStreamIterator = formatAutoDetection ? createInputStreamIterator(files)
48
+ : createInputStreamIterator(formats, 0, files);
49
+ } else {
50
+ if (inputStreamIterator.hasNext()) {
51
+ InputStream in = inputStreamIterator.next();
52
+ if (in == null) {
53
+ inputStreamIterator = null;
54
+ } else {
55
+ return in;
56
+ }
57
+ } else {
58
+ inputStreamIterator = null;
59
+ }
60
+ }
61
+ }
62
+ }
63
+
64
+ @Override
65
+ public void close() throws IOException {
66
+ inputStreamIterator = null;
67
+ if (files != null) {
68
+ files.close();
69
+ }
70
+ }
71
+
72
+ boolean isFormatAutoDetection() {
73
+ return formatAutoDetection;
74
+ }
75
+
76
+ String[] getFormats() {
77
+ return formats;
78
+ }
79
+
80
+ Iterator<InputStream> createInputStreamIterator(InputStream in)
81
+ throws IOException {
82
+ // It is required to support mark to detect a file format.
83
+ in = in.markSupported() ? in : new BufferedInputStream(in);
84
+ try {
85
+ return new ArchiveInputStreamIterator(
86
+ createArchiveInputStream(AUTO_DETECT_FORMAT, in));
87
+ } catch (IOException | ArchiveException e) {
88
+ // ArchiveStreamFactory set mark and reset the stream.
89
+ // So, we can use the same stream to check compressor.
90
+ try {
91
+ return toIterator(createCompressorInputStream(AUTO_DETECT_FORMAT, in));
92
+ } catch (CompressorException e2) {
93
+ throw new IOException("Failed to detect a file format.", e2);
94
+ }
95
+ }
96
+ }
97
+
98
+ /**
99
+ * Create iterator to list InputStream for each archived/compressed file.
100
+ *
101
+ * This can handle like the following formats:
102
+ * 1 archived format which defined in ArchiveStreamFactory(e.g. tar)
103
+ * 1 archived format and 1 compressor format defined in CompressorStreamFactory.(e.g. tar.bz2)
104
+ * 1 compressor format defined in CompressorStreamFactory.(e.g. bz2)
105
+ * (Actually, compressor formats can use two or more times in this code.
106
+ * But it is not common case.)
107
+ */
108
+ Iterator<InputStream> createInputStreamIterator(String[] formats,
109
+ int pos, InputStream in) throws IOException {
110
+ if (pos >= formats.length) {
111
+ return toIterator(in);
112
+ }
113
+
114
+ try {
115
+ String format = formats[pos];
116
+ if (CommonsCompressUtil.isArchiveFormat(format)) {
117
+ return new ArchiveInputStreamIterator(
118
+ createArchiveInputStream(format, in));
119
+ } else if (CommonsCompressUtil.isCompressorFormat(format)) {
120
+ return createInputStreamIterator(formats, pos + 1,
121
+ createCompressorInputStream(format, in));
122
+ }
123
+ throw new IOException("Unsupported format is configured. format:"
124
+ + format);
125
+ } catch (ArchiveException | CompressorException e) {
126
+ throw new IOException(e);
127
+ }
128
+ }
129
+
130
+ /**
131
+ * Create a new ArchiveInputStream to read an archive file based on a format
132
+ * parameter.
133
+ *
134
+ * If format is not set, this method tries to detect file format
135
+ * automatically. In this case, BufferedInputStream is used to wrap
136
+ * FileInputInputStream instance. BufferedInputStream may read a data
137
+ * partially when calling files.nextFile(). However, it doesn't matter
138
+ * because the partial read data should be discarded. And then this method
139
+ * is called again to create a new ArchiveInputStream.
140
+ *
141
+ * @return a new ArchiveInputStream instance.
142
+ */
143
+ ArchiveInputStream createArchiveInputStream(String format, InputStream in)
144
+ throws IOException, ArchiveException {
145
+ ArchiveStreamFactory factory = new ArchiveStreamFactory();
146
+ if (CommonsCompressUtil.isAutoDetect(format)) {
147
+ in = in.markSupported() ? in : new BufferedInputStream(in);
148
+ try {
149
+ return factory.createArchiveInputStream(in);
150
+ } catch (ArchiveException e) {
151
+ throw new IOException(
152
+ "Failed to detect a file format. Please try to set a format explicitly.",
153
+ e);
154
+ }
155
+ } else {
156
+ return factory.createArchiveInputStream(format, in);
157
+ }
158
+ }
159
+
160
+ CompressorInputStream createCompressorInputStream(String format,
161
+ InputStream in) throws IOException, CompressorException {
162
+ CompressorStreamFactory factory = new CompressorStreamFactory();
163
+ if (CommonsCompressUtil.isAutoDetect(format)) {
164
+ in = in.markSupported() ? in : new BufferedInputStream(in);
165
+ try {
166
+ return factory.createCompressorInputStream(in);
167
+ } catch (CompressorException e) {
168
+ throw new IOException(
169
+ "Failed to detect a file format. Please try to set a format explicitly.",
170
+ e);
171
+ }
172
+ } else {
173
+ return factory.createCompressorInputStream(format, in);
174
+ }
175
+ }
176
+
177
+ private Iterator<InputStream> toIterator(InputStream in) {
178
+ List<InputStream> list = new ArrayList<InputStream>(1);
179
+ list.add(in);
180
+ return list.iterator();
181
+ }
182
+ }
@@ -0,0 +1,135 @@
1
+ package org.embulk.decoder;
2
+
3
+ import java.util.ArrayList;
4
+ import java.util.Collections;
5
+ import java.util.List;
6
+
7
+ import org.apache.commons.compress.archivers.ArchiveStreamFactory;
8
+ import org.apache.commons.compress.compressors.CompressorStreamFactory;
9
+
10
+ class CommonsCompressUtil {
11
+ // TODO: It may be better to check performance between Set and array.
12
+ // NOTE: Some file types may not work in an environment because some required
13
+ // libraries are not found.
14
+ static final String[] archiveFormats = {
15
+ ArchiveStreamFactory.AR,
16
+ ArchiveStreamFactory.ARJ,
17
+ ArchiveStreamFactory.CPIO,
18
+ ArchiveStreamFactory.DUMP,
19
+ ArchiveStreamFactory.JAR,
20
+ ArchiveStreamFactory.SEVEN_Z,
21
+ ArchiveStreamFactory.TAR,
22
+ ArchiveStreamFactory.ZIP,
23
+ };
24
+
25
+ static final String[] compressorFormats = {
26
+ CompressorStreamFactory.BZIP2,
27
+ CompressorStreamFactory.DEFLATE,
28
+ CompressorStreamFactory.GZIP,
29
+ CompressorStreamFactory.LZMA,
30
+ CompressorStreamFactory.PACK200,
31
+ CompressorStreamFactory.SNAPPY_FRAMED,
32
+ CompressorStreamFactory.SNAPPY_RAW,
33
+ CompressorStreamFactory.XZ,
34
+ CompressorStreamFactory.Z,
35
+ };
36
+
37
+ // This table is even indexes have short extensions and odd indexes has
38
+ // split formats for each short extensions.
39
+ private static final String[] solidCompressionFormats = {
40
+ "tgz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.GZIP,
41
+ "tar.gz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.GZIP,
42
+ "tbz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
43
+ "tbz2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
44
+ "tb2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
45
+ "tar.bz2", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.BZIP2,
46
+ "taz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
47
+ "tz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
48
+ "tar.Z", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.Z,
49
+ "tlz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
50
+ "tar.lz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
51
+ "tar.lzma", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.LZMA,
52
+ "txz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.XZ,
53
+ "tar.xz", ArchiveStreamFactory.TAR + " " + CompressorStreamFactory.XZ
54
+ };
55
+
56
+ static boolean isArchiveFormat(String format) {
57
+ for (String fmt : archiveFormats) {
58
+ if (fmt.equalsIgnoreCase(format)) {
59
+ return true;
60
+ }
61
+ }
62
+ return false;
63
+ }
64
+
65
+ static boolean isCompressorFormat(String format) {
66
+ for (String fmt : compressorFormats) {
67
+ if (fmt.equalsIgnoreCase(format)) {
68
+ return true;
69
+ }
70
+ }
71
+ return false;
72
+ }
73
+
74
+ static boolean isAutoDetect(String format) {
75
+ return format == null || format.length() == 0;
76
+ }
77
+
78
+ /**
79
+ * Split solid compresson formats and reorder to decode the formats
80
+ * based on this order.
81
+ *
82
+ * If format is a single format like "tar", then return
83
+ * new String[]{"tar"}.
84
+ * If format is a solid compresson format like "tgz", then return
85
+ * new String[]{"gzip", "tar"}.
86
+ * If format is "tar bzip2", then return
87
+ * new String[]{"bzip2", "tar"}.
88
+ *
89
+ * @param format contains a file format or some file formats.
90
+ * @return a single format or multi format values.
91
+ * Otherwise, returns null.
92
+ */
93
+ static String[] toFormats(String format) {
94
+ if (isAutoDetect(format)) {
95
+ return null;
96
+ } else if (isArchiveFormat(format) || isCompressorFormat(format)) {
97
+ return splitAndReverse(format);
98
+ }
99
+
100
+ String[] formats = toSolidCompressionFormats(format);
101
+ if (formats != null) {
102
+ return formats;
103
+ }
104
+
105
+ formats = splitAndReverse(format);
106
+
107
+ for (String s : formats) {
108
+ if (!(isArchiveFormat(s) || isCompressorFormat(s))) {
109
+ return null;
110
+ }
111
+ }
112
+
113
+ return formats;
114
+ }
115
+
116
+ private static String[] toSolidCompressionFormats(String format) {
117
+ for (int i = 0;i < solidCompressionFormats.length; i+= 2) {
118
+ if (solidCompressionFormats[i].equalsIgnoreCase(format)) {
119
+ return splitAndReverse(solidCompressionFormats[i + 1]);
120
+ }
121
+ }
122
+ return null;
123
+ }
124
+
125
+ private static String[] splitAndReverse(String format) {
126
+ List<String> result = new ArrayList<>();
127
+ for (String s : format.split(" ")) {
128
+ if (s.length() > 0) {
129
+ result.add(s);
130
+ }
131
+ }
132
+ Collections.reverse(result);
133
+ return result.toArray(new String[result.size()]);
134
+ }
135
+ }