embulk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +13 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +33 -0
- data/README.md +117 -0
- data/Rakefile +58 -0
- data/bin/embulk +63 -0
- data/build.gradle +149 -0
- data/embulk-cli/build.gradle +6 -0
- data/embulk-cli/pom.xml +94 -0
- data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
- data/embulk-core/build.gradle +6 -0
- data/embulk-core/pom.xml +143 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
- data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
- data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
- data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
- data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
- data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
- data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
- data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
- data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
- data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
- data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
- data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
- data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
- data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
- data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
- data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
- data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
- data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
- data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
- data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
- data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
- data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
- data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
- data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
- data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
- data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
- data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
- data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
- data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
- data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
- data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
- data/embulk-standards/build.gradle +6 -0
- data/embulk-standards/pom.xml +68 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
- data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
- data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
- data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
- data/embulk.gemspec +27 -0
- data/examples/config.yml +34 -0
- data/examples/csv/sample.csv.gz +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk.rb +16 -0
- data/lib/embulk/buffer.rb +17 -0
- data/lib/embulk/column.rb +47 -0
- data/lib/embulk/command/embulk.rb +39 -0
- data/lib/embulk/command/embulk_example.rb +32 -0
- data/lib/embulk/command/embulk_generate_bin.rb +62 -0
- data/lib/embulk/command/embulk_run.rb +243 -0
- data/lib/embulk/data/bundle/.bundle/config +3 -0
- data/lib/embulk/data/bundle/Gemfile +31 -0
- data/lib/embulk/data/bundle/Gemfile.lock +8 -0
- data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
- data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
- data/lib/embulk/data_source.rb +66 -0
- data/lib/embulk/error.rb +5 -0
- data/lib/embulk/guess_charset.rb +26 -0
- data/lib/embulk/guess_csv.rb +195 -0
- data/lib/embulk/guess_gzip.rb +18 -0
- data/lib/embulk/guess_newline.rb +20 -0
- data/lib/embulk/guess_plugin.rb +113 -0
- data/lib/embulk/input_plugin.rb +53 -0
- data/lib/embulk/java/bootstrap.rb +12 -0
- data/lib/embulk/java/imports.rb +26 -0
- data/lib/embulk/java/time_helper.rb +77 -0
- data/lib/embulk/output_plugin.rb +104 -0
- data/lib/embulk/page.rb +28 -0
- data/lib/embulk/page_builder.rb +22 -0
- data/lib/embulk/plugin.rb +152 -0
- data/lib/embulk/plugin_registry.rb +70 -0
- data/lib/embulk/schema.rb +85 -0
- data/lib/embulk/time_format_guess.rb +331 -0
- data/lib/embulk/version.rb +3 -0
- data/pom.xml +533 -0
- data/settings.gradle +5 -0
- metadata +370 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
package org.embulk.standards;
|
|
2
|
+
|
|
3
|
+
import com.google.common.base.Preconditions;
|
|
4
|
+
import java.util.List;
|
|
5
|
+
import java.util.ArrayList;
|
|
6
|
+
import java.util.Deque;
|
|
7
|
+
import java.util.ArrayDeque;
|
|
8
|
+
import java.util.Iterator;
|
|
9
|
+
import org.embulk.spi.util.LineDecoder;
|
|
10
|
+
|
|
11
|
+
public class CsvTokenizer
|
|
12
|
+
{
|
|
13
|
+
static enum RecordState
|
|
14
|
+
{
|
|
15
|
+
NOT_END, END,
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
static enum ColumnState
|
|
19
|
+
{
|
|
20
|
+
BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
private static final char END_OF_LINE = '\0';
|
|
24
|
+
private static final boolean TRACE = false;
|
|
25
|
+
|
|
26
|
+
private final char delimiter;
|
|
27
|
+
private final char quote;
|
|
28
|
+
private final char escape;
|
|
29
|
+
private final String newline;
|
|
30
|
+
private final boolean trimIfNotQuoted;
|
|
31
|
+
private final long maxQuotedSizeLimit; // TODO not used yet
|
|
32
|
+
private final LineDecoder input;
|
|
33
|
+
|
|
34
|
+
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
|
35
|
+
private long lineNumber = 0;
|
|
36
|
+
|
|
37
|
+
private String line = null;
|
|
38
|
+
private int linePos = 0;
|
|
39
|
+
private boolean wasQuotedColumn = false;
|
|
40
|
+
private List<String> quotedValueLines = new ArrayList<>();
|
|
41
|
+
private Deque<String> unreadLines = new ArrayDeque<>();
|
|
42
|
+
|
|
43
|
+
public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
|
|
44
|
+
{
|
|
45
|
+
delimiter = task.getDelimiterChar();
|
|
46
|
+
quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
|
|
47
|
+
escape = task.getEscapeChar();
|
|
48
|
+
newline = task.getNewline().getString();
|
|
49
|
+
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
|
50
|
+
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
|
51
|
+
this.input = input;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
public long getCurrentLineNumber()
|
|
55
|
+
{
|
|
56
|
+
return lineNumber;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// returns skipped line
|
|
60
|
+
public String skipCurrentLine()
|
|
61
|
+
{
|
|
62
|
+
String skippedLine;
|
|
63
|
+
if (quotedValueLines.isEmpty()) {
|
|
64
|
+
skippedLine = line;
|
|
65
|
+
} else {
|
|
66
|
+
// recover lines of quoted value
|
|
67
|
+
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
|
68
|
+
unreadLines.addAll(quotedValueLines);
|
|
69
|
+
unreadLines.add(line);
|
|
70
|
+
lineNumber -= quotedValueLines.size();
|
|
71
|
+
quotedValueLines.clear();
|
|
72
|
+
}
|
|
73
|
+
recordState = RecordState.END;
|
|
74
|
+
return line;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
public boolean nextFile()
|
|
78
|
+
{
|
|
79
|
+
return input.nextFile();
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
public boolean nextRecord()
|
|
83
|
+
{
|
|
84
|
+
// If at the end of record, read the next line and initialize the state
|
|
85
|
+
Preconditions.checkState(recordState == RecordState.END, "too many columns"); // TODO exception class
|
|
86
|
+
boolean hasNext = nextLine(true);
|
|
87
|
+
if (hasNext) {
|
|
88
|
+
recordState = RecordState.NOT_END;
|
|
89
|
+
return true;
|
|
90
|
+
} else {
|
|
91
|
+
return false;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
private boolean nextLine(boolean ignoreEmptyLine)
|
|
96
|
+
{
|
|
97
|
+
while (true) {
|
|
98
|
+
if (!unreadLines.isEmpty()) {
|
|
99
|
+
line = unreadLines.removeFirst();
|
|
100
|
+
} else {
|
|
101
|
+
line = input.poll();
|
|
102
|
+
if (line == null) {
|
|
103
|
+
return false;
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
linePos = 0;
|
|
107
|
+
lineNumber++;
|
|
108
|
+
|
|
109
|
+
if (TRACE) {
|
|
110
|
+
System.out.println("#MN line: " + line + " (" + lineNumber + ")");
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
if (!line.isEmpty() || !ignoreEmptyLine) {
|
|
114
|
+
return true;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
public String nextColumn()
|
|
120
|
+
{
|
|
121
|
+
Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
|
|
122
|
+
|
|
123
|
+
// reset last state
|
|
124
|
+
wasQuotedColumn = false;
|
|
125
|
+
quotedValueLines.clear();
|
|
126
|
+
|
|
127
|
+
// local state
|
|
128
|
+
int valueStartPos = linePos;
|
|
129
|
+
int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
|
|
130
|
+
StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
|
|
131
|
+
ColumnState columnState = ColumnState.BEGIN;
|
|
132
|
+
|
|
133
|
+
while (true) {
|
|
134
|
+
final char c = nextChar();
|
|
135
|
+
if (TRACE) {
|
|
136
|
+
System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
|
|
137
|
+
try { Thread.sleep(100); } catch (Exception e) {}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
switch (columnState) {
|
|
141
|
+
case BEGIN:
|
|
142
|
+
// TODO optimization: state is BEGIN only at the first character of a column.
|
|
143
|
+
// this block can be out of the looop.
|
|
144
|
+
if (isDelimiter(c)) {
|
|
145
|
+
// empty value
|
|
146
|
+
return "";
|
|
147
|
+
|
|
148
|
+
} else if (isEndOfLine(c)) {
|
|
149
|
+
// empty value
|
|
150
|
+
recordState = RecordState.END;
|
|
151
|
+
return "";
|
|
152
|
+
|
|
153
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
|
154
|
+
columnState = ColumnState.FIRST_TRIM;
|
|
155
|
+
|
|
156
|
+
} else if (isQuote(c)) {
|
|
157
|
+
valueStartPos = linePos; // == 1
|
|
158
|
+
wasQuotedColumn = true;
|
|
159
|
+
quotedValue = new StringBuilder();
|
|
160
|
+
columnState = ColumnState.QUOTED_VALUE;
|
|
161
|
+
|
|
162
|
+
} else {
|
|
163
|
+
columnState = ColumnState.VALUE;
|
|
164
|
+
}
|
|
165
|
+
break;
|
|
166
|
+
|
|
167
|
+
case FIRST_TRIM:
|
|
168
|
+
if (isDelimiter(c)) {
|
|
169
|
+
// empty value
|
|
170
|
+
return "";
|
|
171
|
+
|
|
172
|
+
} else if (isEndOfLine(c)) {
|
|
173
|
+
// empty value
|
|
174
|
+
recordState = RecordState.END;
|
|
175
|
+
return "";
|
|
176
|
+
|
|
177
|
+
} else if (isQuote(c)) {
|
|
178
|
+
// column has heading spaces and quoted. TODO should this be rejected?
|
|
179
|
+
valueStartPos = linePos;
|
|
180
|
+
wasQuotedColumn = true;
|
|
181
|
+
quotedValue = new StringBuilder();
|
|
182
|
+
columnState = ColumnState.QUOTED_VALUE;
|
|
183
|
+
|
|
184
|
+
} else if (isSpace(c)) {
|
|
185
|
+
// skip this character
|
|
186
|
+
|
|
187
|
+
} else {
|
|
188
|
+
valueStartPos = linePos - 1;
|
|
189
|
+
columnState = ColumnState.VALUE;
|
|
190
|
+
}
|
|
191
|
+
break;
|
|
192
|
+
|
|
193
|
+
case VALUE:
|
|
194
|
+
if (isDelimiter(c)) {
|
|
195
|
+
return line.substring(valueStartPos, linePos - 1);
|
|
196
|
+
|
|
197
|
+
} else if (isEndOfLine(c)) {
|
|
198
|
+
recordState = RecordState.END;
|
|
199
|
+
return line.substring(valueStartPos, linePos);
|
|
200
|
+
|
|
201
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
|
202
|
+
valueEndPos = linePos - 1; // this is possibly end of value
|
|
203
|
+
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
|
204
|
+
|
|
205
|
+
// TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
|
|
206
|
+
//} else if (isQuote(c)) {
|
|
207
|
+
// // In RFC4180, If fields are not enclosed with double quotes, then
|
|
208
|
+
// // double quotes may not appear inside the fields. But they are often
|
|
209
|
+
// // included in the fields. We should care about them later.
|
|
210
|
+
|
|
211
|
+
} else {
|
|
212
|
+
// keep VALUE state
|
|
213
|
+
}
|
|
214
|
+
break;
|
|
215
|
+
|
|
216
|
+
case LAST_TRIM_OR_VALUE:
|
|
217
|
+
if (isDelimiter(c)) {
|
|
218
|
+
return line.substring(valueStartPos, valueEndPos);
|
|
219
|
+
|
|
220
|
+
} else if (isEndOfLine(c)) {
|
|
221
|
+
recordState = RecordState.END;
|
|
222
|
+
return line.substring(valueStartPos, valueEndPos);
|
|
223
|
+
|
|
224
|
+
} else if (isSpace(c)) {
|
|
225
|
+
// keep LAST_TRIM_OR_VALUE state
|
|
226
|
+
|
|
227
|
+
} else {
|
|
228
|
+
// this spaces are not trailing spaces. go back to VALUE state
|
|
229
|
+
columnState = ColumnState.BEGIN;
|
|
230
|
+
}
|
|
231
|
+
break;
|
|
232
|
+
|
|
233
|
+
case QUOTED_VALUE:
|
|
234
|
+
if (isEndOfLine(c)) {
|
|
235
|
+
// multi-line quoted value
|
|
236
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
|
237
|
+
quotedValue.append(newline);
|
|
238
|
+
quotedValueLines.add(line);
|
|
239
|
+
if (!nextLine(false)) {
|
|
240
|
+
throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
|
|
241
|
+
}
|
|
242
|
+
valueStartPos = 0;
|
|
243
|
+
|
|
244
|
+
} else if (isQuote(c)) {
|
|
245
|
+
char next = peekNextChar();
|
|
246
|
+
if (TRACE) {
|
|
247
|
+
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
|
248
|
+
}
|
|
249
|
+
if (isQuote(next)) { // escaped quote
|
|
250
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
|
251
|
+
valueStartPos = ++linePos;
|
|
252
|
+
} else {
|
|
253
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
|
254
|
+
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
} else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
|
258
|
+
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
|
259
|
+
char next = peekNextChar();
|
|
260
|
+
if (TRACE) {
|
|
261
|
+
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
|
262
|
+
}
|
|
263
|
+
if (isEndOfLine(c)) {
|
|
264
|
+
// escape end of line. TODO assuming multi-line quoted value without newline?
|
|
265
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
|
266
|
+
quotedValueLines.add(line);
|
|
267
|
+
if (!nextLine(false)) {
|
|
268
|
+
throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
|
|
269
|
+
}
|
|
270
|
+
valueStartPos = 0;
|
|
271
|
+
} else if (isQuote(next) || isEscape(next)) { // escaped quote
|
|
272
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
|
273
|
+
quotedValue.append(next);
|
|
274
|
+
valueStartPos = ++linePos;
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
} else {
|
|
278
|
+
// keep QUOTED_VALUE state
|
|
279
|
+
}
|
|
280
|
+
break;
|
|
281
|
+
|
|
282
|
+
case AFTER_QUOTED_VALUE:
|
|
283
|
+
if (isDelimiter(c)) {
|
|
284
|
+
return quotedValue.toString();
|
|
285
|
+
|
|
286
|
+
} else if (isEndOfLine(c)) {
|
|
287
|
+
recordState = RecordState.END;
|
|
288
|
+
return quotedValue.toString();
|
|
289
|
+
|
|
290
|
+
} else if (isSpace(c)) {
|
|
291
|
+
// column has trailing spaces and quoted. TODO should this be rejected?
|
|
292
|
+
|
|
293
|
+
} else {
|
|
294
|
+
throw new RuntimeException("Unexpected extra character after quoted value"); // TODO exception class
|
|
295
|
+
}
|
|
296
|
+
break;
|
|
297
|
+
|
|
298
|
+
default:
|
|
299
|
+
assert false;
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
public boolean wasQuotedColumn()
|
|
305
|
+
{
|
|
306
|
+
return wasQuotedColumn;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
private char nextChar()
|
|
310
|
+
{
|
|
311
|
+
Preconditions.checkState(line != null, "nextColumn is called after end of file");
|
|
312
|
+
|
|
313
|
+
if (linePos >= line.length()) {
|
|
314
|
+
return END_OF_LINE;
|
|
315
|
+
} else {
|
|
316
|
+
return line.charAt(linePos++);
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
private char peekNextChar()
|
|
321
|
+
{
|
|
322
|
+
Preconditions.checkState(line != null, "peekNextChar is called after end of file");
|
|
323
|
+
|
|
324
|
+
if (linePos >= line.length()) {
|
|
325
|
+
return END_OF_LINE;
|
|
326
|
+
} else {
|
|
327
|
+
return line.charAt(linePos);
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
private boolean isSpace(char c)
|
|
332
|
+
{
|
|
333
|
+
return c == ' ';
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
private boolean isDelimiter(char c)
|
|
337
|
+
{
|
|
338
|
+
return c == delimiter;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
private boolean isEndOfLine(char c)
|
|
342
|
+
{
|
|
343
|
+
return c == END_OF_LINE;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
private boolean isQuote(char c)
|
|
347
|
+
{
|
|
348
|
+
return c == quote;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
private boolean isEscape(char c)
|
|
352
|
+
{
|
|
353
|
+
return c == escape;
|
|
354
|
+
}
|
|
355
|
+
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
package org.embulk.standards;
|
|
2
|
+
|
|
3
|
+
import java.io.InputStream;
|
|
4
|
+
import java.io.IOException;
|
|
5
|
+
import java.util.zip.GZIPInputStream;
|
|
6
|
+
import com.fasterxml.jackson.annotation.JacksonInject;
|
|
7
|
+
import org.embulk.config.Task;
|
|
8
|
+
import org.embulk.config.TaskSource;
|
|
9
|
+
import org.embulk.config.ConfigSource;
|
|
10
|
+
import org.embulk.spi.DecoderPlugin;
|
|
11
|
+
import org.embulk.spi.BufferAllocator;
|
|
12
|
+
import org.embulk.spi.FileInput;
|
|
13
|
+
import org.embulk.spi.util.FileInputInputStream;
|
|
14
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
|
15
|
+
|
|
16
|
+
public class GzipFileDecoderPlugin
|
|
17
|
+
implements DecoderPlugin
|
|
18
|
+
{
|
|
19
|
+
public interface PluginTask
|
|
20
|
+
extends Task
|
|
21
|
+
{
|
|
22
|
+
@JacksonInject
|
|
23
|
+
public BufferAllocator getBufferAllocator();
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
@Override
|
|
27
|
+
public void transaction(ConfigSource config, DecoderPlugin.Control control)
|
|
28
|
+
{
|
|
29
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
|
30
|
+
control.run(task.dump());
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
@Override
|
|
34
|
+
public FileInput open(TaskSource taskSource, FileInput input)
|
|
35
|
+
{
|
|
36
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
|
37
|
+
final FileInputInputStream files = new FileInputInputStream(input);
|
|
38
|
+
return new InputStreamFileInput(
|
|
39
|
+
task.getBufferAllocator(),
|
|
40
|
+
new InputStreamFileInput.Provider() {
|
|
41
|
+
public InputStream openNext() throws IOException
|
|
42
|
+
{
|
|
43
|
+
if (!files.nextFile()) {
|
|
44
|
+
return null;
|
|
45
|
+
}
|
|
46
|
+
return new GZIPInputStream(files);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
public void close() throws IOException
|
|
50
|
+
{
|
|
51
|
+
files.close();
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
}
|
|
55
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
package org.embulk.standards;
|
|
2
|
+
|
|
3
|
+
import java.io.OutputStream;
|
|
4
|
+
import java.io.IOException;
|
|
5
|
+
import java.util.zip.GZIPOutputStream;
|
|
6
|
+
import org.embulk.config.Task;
|
|
7
|
+
import org.embulk.config.Config;
|
|
8
|
+
import org.embulk.config.ConfigDefault;
|
|
9
|
+
import org.embulk.config.TaskSource;
|
|
10
|
+
import org.embulk.config.ConfigSource;
|
|
11
|
+
import org.embulk.spi.EncoderPlugin;
|
|
12
|
+
import org.embulk.spi.FileOutput;
|
|
13
|
+
import org.embulk.spi.util.FileOutputOutputStream;
|
|
14
|
+
|
|
15
|
+
public class GzipFileEncoderPlugin
|
|
16
|
+
implements EncoderPlugin
|
|
17
|
+
{
|
|
18
|
+
public interface PluginTask
|
|
19
|
+
extends Task
|
|
20
|
+
{
|
|
21
|
+
@Config("level")
|
|
22
|
+
@ConfigDefault("6")
|
|
23
|
+
public int getLevel();
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
public void transaction(ConfigSource config, EncoderPlugin.Control control)
|
|
27
|
+
{
|
|
28
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
|
29
|
+
control.run(task.dump());
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
@Override
|
|
33
|
+
public FileOutput open(TaskSource taskSource, FileOutput fileOutput)
|
|
34
|
+
{
|
|
35
|
+
throw new AssertionError("OutputStreamFileOutput is not implemented yet");
|
|
36
|
+
// TODO GZIPOutputStream doesn't support level option?
|
|
37
|
+
//return new OutputStreamFileOutput(new GZIPOutputStream(new FileOutputOutputStream(fileOutput)));
|
|
38
|
+
}
|
|
39
|
+
}
|