embulk 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +13 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +33 -0
- data/README.md +117 -0
- data/Rakefile +58 -0
- data/bin/embulk +63 -0
- data/build.gradle +149 -0
- data/embulk-cli/build.gradle +6 -0
- data/embulk-cli/pom.xml +94 -0
- data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
- data/embulk-core/build.gradle +6 -0
- data/embulk-core/pom.xml +143 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
- data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
- data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
- data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
- data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
- data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
- data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
- data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
- data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
- data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
- data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
- data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
- data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
- data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
- data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
- data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
- data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
- data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
- data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
- data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
- data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
- data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
- data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
- data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
- data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
- data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
- data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
- data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
- data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
- data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
- data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
- data/embulk-standards/build.gradle +6 -0
- data/embulk-standards/pom.xml +68 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
- data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
- data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
- data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
- data/embulk.gemspec +27 -0
- data/examples/config.yml +34 -0
- data/examples/csv/sample.csv.gz +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk.rb +16 -0
- data/lib/embulk/buffer.rb +17 -0
- data/lib/embulk/column.rb +47 -0
- data/lib/embulk/command/embulk.rb +39 -0
- data/lib/embulk/command/embulk_example.rb +32 -0
- data/lib/embulk/command/embulk_generate_bin.rb +62 -0
- data/lib/embulk/command/embulk_run.rb +243 -0
- data/lib/embulk/data/bundle/.bundle/config +3 -0
- data/lib/embulk/data/bundle/Gemfile +31 -0
- data/lib/embulk/data/bundle/Gemfile.lock +8 -0
- data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
- data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
- data/lib/embulk/data_source.rb +66 -0
- data/lib/embulk/error.rb +5 -0
- data/lib/embulk/guess_charset.rb +26 -0
- data/lib/embulk/guess_csv.rb +195 -0
- data/lib/embulk/guess_gzip.rb +18 -0
- data/lib/embulk/guess_newline.rb +20 -0
- data/lib/embulk/guess_plugin.rb +113 -0
- data/lib/embulk/input_plugin.rb +53 -0
- data/lib/embulk/java/bootstrap.rb +12 -0
- data/lib/embulk/java/imports.rb +26 -0
- data/lib/embulk/java/time_helper.rb +77 -0
- data/lib/embulk/output_plugin.rb +104 -0
- data/lib/embulk/page.rb +28 -0
- data/lib/embulk/page_builder.rb +22 -0
- data/lib/embulk/plugin.rb +152 -0
- data/lib/embulk/plugin_registry.rb +70 -0
- data/lib/embulk/schema.rb +85 -0
- data/lib/embulk/time_format_guess.rb +331 -0
- data/lib/embulk/version.rb +3 -0
- data/pom.xml +533 -0
- data/settings.gradle +5 -0
- metadata +370 -0
@@ -0,0 +1,355 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.base.Preconditions;
|
4
|
+
import java.util.List;
|
5
|
+
import java.util.ArrayList;
|
6
|
+
import java.util.Deque;
|
7
|
+
import java.util.ArrayDeque;
|
8
|
+
import java.util.Iterator;
|
9
|
+
import org.embulk.spi.util.LineDecoder;
|
10
|
+
|
11
|
+
public class CsvTokenizer
|
12
|
+
{
|
13
|
+
static enum RecordState
|
14
|
+
{
|
15
|
+
NOT_END, END,
|
16
|
+
}
|
17
|
+
|
18
|
+
static enum ColumnState
|
19
|
+
{
|
20
|
+
BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
|
21
|
+
}
|
22
|
+
|
23
|
+
private static final char END_OF_LINE = '\0';
|
24
|
+
private static final boolean TRACE = false;
|
25
|
+
|
26
|
+
private final char delimiter;
|
27
|
+
private final char quote;
|
28
|
+
private final char escape;
|
29
|
+
private final String newline;
|
30
|
+
private final boolean trimIfNotQuoted;
|
31
|
+
private final long maxQuotedSizeLimit; // TODO not used yet
|
32
|
+
private final LineDecoder input;
|
33
|
+
|
34
|
+
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
35
|
+
private long lineNumber = 0;
|
36
|
+
|
37
|
+
private String line = null;
|
38
|
+
private int linePos = 0;
|
39
|
+
private boolean wasQuotedColumn = false;
|
40
|
+
private List<String> quotedValueLines = new ArrayList<>();
|
41
|
+
private Deque<String> unreadLines = new ArrayDeque<>();
|
42
|
+
|
43
|
+
public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
|
44
|
+
{
|
45
|
+
delimiter = task.getDelimiterChar();
|
46
|
+
quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
|
47
|
+
escape = task.getEscapeChar();
|
48
|
+
newline = task.getNewline().getString();
|
49
|
+
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
50
|
+
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
51
|
+
this.input = input;
|
52
|
+
}
|
53
|
+
|
54
|
+
public long getCurrentLineNumber()
|
55
|
+
{
|
56
|
+
return lineNumber;
|
57
|
+
}
|
58
|
+
|
59
|
+
// returns skipped line
|
60
|
+
public String skipCurrentLine()
|
61
|
+
{
|
62
|
+
String skippedLine;
|
63
|
+
if (quotedValueLines.isEmpty()) {
|
64
|
+
skippedLine = line;
|
65
|
+
} else {
|
66
|
+
// recover lines of quoted value
|
67
|
+
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
68
|
+
unreadLines.addAll(quotedValueLines);
|
69
|
+
unreadLines.add(line);
|
70
|
+
lineNumber -= quotedValueLines.size();
|
71
|
+
quotedValueLines.clear();
|
72
|
+
}
|
73
|
+
recordState = RecordState.END;
|
74
|
+
return line;
|
75
|
+
}
|
76
|
+
|
77
|
+
public boolean nextFile()
|
78
|
+
{
|
79
|
+
return input.nextFile();
|
80
|
+
}
|
81
|
+
|
82
|
+
public boolean nextRecord()
|
83
|
+
{
|
84
|
+
// If at the end of record, read the next line and initialize the state
|
85
|
+
Preconditions.checkState(recordState == RecordState.END, "too many columns"); // TODO exception class
|
86
|
+
boolean hasNext = nextLine(true);
|
87
|
+
if (hasNext) {
|
88
|
+
recordState = RecordState.NOT_END;
|
89
|
+
return true;
|
90
|
+
} else {
|
91
|
+
return false;
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
private boolean nextLine(boolean ignoreEmptyLine)
|
96
|
+
{
|
97
|
+
while (true) {
|
98
|
+
if (!unreadLines.isEmpty()) {
|
99
|
+
line = unreadLines.removeFirst();
|
100
|
+
} else {
|
101
|
+
line = input.poll();
|
102
|
+
if (line == null) {
|
103
|
+
return false;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
linePos = 0;
|
107
|
+
lineNumber++;
|
108
|
+
|
109
|
+
if (TRACE) {
|
110
|
+
System.out.println("#MN line: " + line + " (" + lineNumber + ")");
|
111
|
+
}
|
112
|
+
|
113
|
+
if (!line.isEmpty() || !ignoreEmptyLine) {
|
114
|
+
return true;
|
115
|
+
}
|
116
|
+
}
|
117
|
+
}
|
118
|
+
|
119
|
+
public String nextColumn()
|
120
|
+
{
|
121
|
+
Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
|
122
|
+
|
123
|
+
// reset last state
|
124
|
+
wasQuotedColumn = false;
|
125
|
+
quotedValueLines.clear();
|
126
|
+
|
127
|
+
// local state
|
128
|
+
int valueStartPos = linePos;
|
129
|
+
int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
|
130
|
+
StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
|
131
|
+
ColumnState columnState = ColumnState.BEGIN;
|
132
|
+
|
133
|
+
while (true) {
|
134
|
+
final char c = nextChar();
|
135
|
+
if (TRACE) {
|
136
|
+
System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
|
137
|
+
try { Thread.sleep(100); } catch (Exception e) {}
|
138
|
+
}
|
139
|
+
|
140
|
+
switch (columnState) {
|
141
|
+
case BEGIN:
|
142
|
+
// TODO optimization: state is BEGIN only at the first character of a column.
|
143
|
+
// this block can be out of the looop.
|
144
|
+
if (isDelimiter(c)) {
|
145
|
+
// empty value
|
146
|
+
return "";
|
147
|
+
|
148
|
+
} else if (isEndOfLine(c)) {
|
149
|
+
// empty value
|
150
|
+
recordState = RecordState.END;
|
151
|
+
return "";
|
152
|
+
|
153
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
154
|
+
columnState = ColumnState.FIRST_TRIM;
|
155
|
+
|
156
|
+
} else if (isQuote(c)) {
|
157
|
+
valueStartPos = linePos; // == 1
|
158
|
+
wasQuotedColumn = true;
|
159
|
+
quotedValue = new StringBuilder();
|
160
|
+
columnState = ColumnState.QUOTED_VALUE;
|
161
|
+
|
162
|
+
} else {
|
163
|
+
columnState = ColumnState.VALUE;
|
164
|
+
}
|
165
|
+
break;
|
166
|
+
|
167
|
+
case FIRST_TRIM:
|
168
|
+
if (isDelimiter(c)) {
|
169
|
+
// empty value
|
170
|
+
return "";
|
171
|
+
|
172
|
+
} else if (isEndOfLine(c)) {
|
173
|
+
// empty value
|
174
|
+
recordState = RecordState.END;
|
175
|
+
return "";
|
176
|
+
|
177
|
+
} else if (isQuote(c)) {
|
178
|
+
// column has heading spaces and quoted. TODO should this be rejected?
|
179
|
+
valueStartPos = linePos;
|
180
|
+
wasQuotedColumn = true;
|
181
|
+
quotedValue = new StringBuilder();
|
182
|
+
columnState = ColumnState.QUOTED_VALUE;
|
183
|
+
|
184
|
+
} else if (isSpace(c)) {
|
185
|
+
// skip this character
|
186
|
+
|
187
|
+
} else {
|
188
|
+
valueStartPos = linePos - 1;
|
189
|
+
columnState = ColumnState.VALUE;
|
190
|
+
}
|
191
|
+
break;
|
192
|
+
|
193
|
+
case VALUE:
|
194
|
+
if (isDelimiter(c)) {
|
195
|
+
return line.substring(valueStartPos, linePos - 1);
|
196
|
+
|
197
|
+
} else if (isEndOfLine(c)) {
|
198
|
+
recordState = RecordState.END;
|
199
|
+
return line.substring(valueStartPos, linePos);
|
200
|
+
|
201
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
202
|
+
valueEndPos = linePos - 1; // this is possibly end of value
|
203
|
+
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
204
|
+
|
205
|
+
// TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
|
206
|
+
//} else if (isQuote(c)) {
|
207
|
+
// // In RFC4180, If fields are not enclosed with double quotes, then
|
208
|
+
// // double quotes may not appear inside the fields. But they are often
|
209
|
+
// // included in the fields. We should care about them later.
|
210
|
+
|
211
|
+
} else {
|
212
|
+
// keep VALUE state
|
213
|
+
}
|
214
|
+
break;
|
215
|
+
|
216
|
+
case LAST_TRIM_OR_VALUE:
|
217
|
+
if (isDelimiter(c)) {
|
218
|
+
return line.substring(valueStartPos, valueEndPos);
|
219
|
+
|
220
|
+
} else if (isEndOfLine(c)) {
|
221
|
+
recordState = RecordState.END;
|
222
|
+
return line.substring(valueStartPos, valueEndPos);
|
223
|
+
|
224
|
+
} else if (isSpace(c)) {
|
225
|
+
// keep LAST_TRIM_OR_VALUE state
|
226
|
+
|
227
|
+
} else {
|
228
|
+
// this spaces are not trailing spaces. go back to VALUE state
|
229
|
+
columnState = ColumnState.BEGIN;
|
230
|
+
}
|
231
|
+
break;
|
232
|
+
|
233
|
+
case QUOTED_VALUE:
|
234
|
+
if (isEndOfLine(c)) {
|
235
|
+
// multi-line quoted value
|
236
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
237
|
+
quotedValue.append(newline);
|
238
|
+
quotedValueLines.add(line);
|
239
|
+
if (!nextLine(false)) {
|
240
|
+
throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
|
241
|
+
}
|
242
|
+
valueStartPos = 0;
|
243
|
+
|
244
|
+
} else if (isQuote(c)) {
|
245
|
+
char next = peekNextChar();
|
246
|
+
if (TRACE) {
|
247
|
+
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
248
|
+
}
|
249
|
+
if (isQuote(next)) { // escaped quote
|
250
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
251
|
+
valueStartPos = ++linePos;
|
252
|
+
} else {
|
253
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
254
|
+
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
255
|
+
}
|
256
|
+
|
257
|
+
} else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
258
|
+
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
259
|
+
char next = peekNextChar();
|
260
|
+
if (TRACE) {
|
261
|
+
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
262
|
+
}
|
263
|
+
if (isEndOfLine(c)) {
|
264
|
+
// escape end of line. TODO assuming multi-line quoted value without newline?
|
265
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
266
|
+
quotedValueLines.add(line);
|
267
|
+
if (!nextLine(false)) {
|
268
|
+
throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
|
269
|
+
}
|
270
|
+
valueStartPos = 0;
|
271
|
+
} else if (isQuote(next) || isEscape(next)) { // escaped quote
|
272
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
273
|
+
quotedValue.append(next);
|
274
|
+
valueStartPos = ++linePos;
|
275
|
+
}
|
276
|
+
|
277
|
+
} else {
|
278
|
+
// keep QUOTED_VALUE state
|
279
|
+
}
|
280
|
+
break;
|
281
|
+
|
282
|
+
case AFTER_QUOTED_VALUE:
|
283
|
+
if (isDelimiter(c)) {
|
284
|
+
return quotedValue.toString();
|
285
|
+
|
286
|
+
} else if (isEndOfLine(c)) {
|
287
|
+
recordState = RecordState.END;
|
288
|
+
return quotedValue.toString();
|
289
|
+
|
290
|
+
} else if (isSpace(c)) {
|
291
|
+
// column has trailing spaces and quoted. TODO should this be rejected?
|
292
|
+
|
293
|
+
} else {
|
294
|
+
throw new RuntimeException("Unexpected extra character after quoted value"); // TODO exception class
|
295
|
+
}
|
296
|
+
break;
|
297
|
+
|
298
|
+
default:
|
299
|
+
assert false;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
304
|
+
public boolean wasQuotedColumn()
|
305
|
+
{
|
306
|
+
return wasQuotedColumn;
|
307
|
+
}
|
308
|
+
|
309
|
+
private char nextChar()
|
310
|
+
{
|
311
|
+
Preconditions.checkState(line != null, "nextColumn is called after end of file");
|
312
|
+
|
313
|
+
if (linePos >= line.length()) {
|
314
|
+
return END_OF_LINE;
|
315
|
+
} else {
|
316
|
+
return line.charAt(linePos++);
|
317
|
+
}
|
318
|
+
}
|
319
|
+
|
320
|
+
private char peekNextChar()
|
321
|
+
{
|
322
|
+
Preconditions.checkState(line != null, "peekNextChar is called after end of file");
|
323
|
+
|
324
|
+
if (linePos >= line.length()) {
|
325
|
+
return END_OF_LINE;
|
326
|
+
} else {
|
327
|
+
return line.charAt(linePos);
|
328
|
+
}
|
329
|
+
}
|
330
|
+
|
331
|
+
private boolean isSpace(char c)
|
332
|
+
{
|
333
|
+
return c == ' ';
|
334
|
+
}
|
335
|
+
|
336
|
+
private boolean isDelimiter(char c)
|
337
|
+
{
|
338
|
+
return c == delimiter;
|
339
|
+
}
|
340
|
+
|
341
|
+
private boolean isEndOfLine(char c)
|
342
|
+
{
|
343
|
+
return c == END_OF_LINE;
|
344
|
+
}
|
345
|
+
|
346
|
+
private boolean isQuote(char c)
|
347
|
+
{
|
348
|
+
return c == quote;
|
349
|
+
}
|
350
|
+
|
351
|
+
private boolean isEscape(char c)
|
352
|
+
{
|
353
|
+
return c == escape;
|
354
|
+
}
|
355
|
+
}
|
@@ -0,0 +1,55 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import java.io.InputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.util.zip.GZIPInputStream;
|
6
|
+
import com.fasterxml.jackson.annotation.JacksonInject;
|
7
|
+
import org.embulk.config.Task;
|
8
|
+
import org.embulk.config.TaskSource;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.spi.DecoderPlugin;
|
11
|
+
import org.embulk.spi.BufferAllocator;
|
12
|
+
import org.embulk.spi.FileInput;
|
13
|
+
import org.embulk.spi.util.FileInputInputStream;
|
14
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
15
|
+
|
16
|
+
public class GzipFileDecoderPlugin
|
17
|
+
implements DecoderPlugin
|
18
|
+
{
|
19
|
+
public interface PluginTask
|
20
|
+
extends Task
|
21
|
+
{
|
22
|
+
@JacksonInject
|
23
|
+
public BufferAllocator getBufferAllocator();
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
public void transaction(ConfigSource config, DecoderPlugin.Control control)
|
28
|
+
{
|
29
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
30
|
+
control.run(task.dump());
|
31
|
+
}
|
32
|
+
|
33
|
+
@Override
|
34
|
+
public FileInput open(TaskSource taskSource, FileInput input)
|
35
|
+
{
|
36
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
37
|
+
final FileInputInputStream files = new FileInputInputStream(input);
|
38
|
+
return new InputStreamFileInput(
|
39
|
+
task.getBufferAllocator(),
|
40
|
+
new InputStreamFileInput.Provider() {
|
41
|
+
public InputStream openNext() throws IOException
|
42
|
+
{
|
43
|
+
if (!files.nextFile()) {
|
44
|
+
return null;
|
45
|
+
}
|
46
|
+
return new GZIPInputStream(files);
|
47
|
+
}
|
48
|
+
|
49
|
+
public void close() throws IOException
|
50
|
+
{
|
51
|
+
files.close();
|
52
|
+
}
|
53
|
+
});
|
54
|
+
}
|
55
|
+
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import java.io.OutputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.util.zip.GZIPOutputStream;
|
6
|
+
import org.embulk.config.Task;
|
7
|
+
import org.embulk.config.Config;
|
8
|
+
import org.embulk.config.ConfigDefault;
|
9
|
+
import org.embulk.config.TaskSource;
|
10
|
+
import org.embulk.config.ConfigSource;
|
11
|
+
import org.embulk.spi.EncoderPlugin;
|
12
|
+
import org.embulk.spi.FileOutput;
|
13
|
+
import org.embulk.spi.util.FileOutputOutputStream;
|
14
|
+
|
15
|
+
public class GzipFileEncoderPlugin
|
16
|
+
implements EncoderPlugin
|
17
|
+
{
|
18
|
+
public interface PluginTask
|
19
|
+
extends Task
|
20
|
+
{
|
21
|
+
@Config("level")
|
22
|
+
@ConfigDefault("6")
|
23
|
+
public int getLevel();
|
24
|
+
}
|
25
|
+
|
26
|
+
public void transaction(ConfigSource config, EncoderPlugin.Control control)
|
27
|
+
{
|
28
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
29
|
+
control.run(task.dump());
|
30
|
+
}
|
31
|
+
|
32
|
+
@Override
|
33
|
+
public FileOutput open(TaskSource taskSource, FileOutput fileOutput)
|
34
|
+
{
|
35
|
+
throw new AssertionError("OutputStreamFileOutput is not implemented yet");
|
36
|
+
// TODO GZIPOutputStream doesn't support level option?
|
37
|
+
//return new OutputStreamFileOutput(new GZIPOutputStream(new FileOutputOutputStream(fileOutput)));
|
38
|
+
}
|
39
|
+
}
|