embulk-parser-csv_with_default_value 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/LICENSE.txt +21 -0
- data/README.md +54 -0
- data/build.gradle +96 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/gradlew +160 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/guess/csv_with_default_value.rb +61 -0
- data/lib/embulk/parser/csv_with_default_value.rb +3 -0
- data/src/main/java/org/embulk/parser/csv_with_default_value/ColumnDefaultValue.java +123 -0
- data/src/main/java/org/embulk/parser/csv_with_default_value/ColumnDefaultValueImpl.java +68 -0
- data/src/main/java/org/embulk/parser/csv_with_default_value/CsvRecordValidateException.java +13 -0
- data/src/main/java/org/embulk/parser/csv_with_default_value/CsvTokenizer.java +512 -0
- data/src/main/java/org/embulk/parser/csv_with_default_value/CsvWithDefaultValueParserPlugin.java +447 -0
- data/src/test/java/org/embulk/EmbulkTestRuntime.java +113 -0
- data/src/test/java/org/embulk/GuiceBinder.java +72 -0
- data/src/test/java/org/embulk/RandomManager.java +53 -0
- data/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
- data/src/test/java/org/embulk/TestUtilityModule.java +17 -0
- data/src/test/java/org/embulk/parser/csv_with_default_value/TestCsvWithDefaultValueParserPlugin.java +97 -0
- data/src/test/java/org/embulk/parser/csv_with_default_value/ValueTypeTest.java +47 -0
- data/src/test/java/org/embulk/spi/MockFormatterPlugin.java +108 -0
- data/src/test/java/org/embulk/spi/MockParserPlugin.java +80 -0
- metadata +97 -0
data/gradlew.bat
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
@if "%DEBUG%" == "" @echo off
|
2
|
+
@rem ##########################################################################
|
3
|
+
@rem
|
4
|
+
@rem Gradle startup script for Windows
|
5
|
+
@rem
|
6
|
+
@rem ##########################################################################
|
7
|
+
|
8
|
+
@rem Set local scope for the variables with windows NT shell
|
9
|
+
if "%OS%"=="Windows_NT" setlocal
|
10
|
+
|
11
|
+
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
12
|
+
set DEFAULT_JVM_OPTS=
|
13
|
+
|
14
|
+
set DIRNAME=%~dp0
|
15
|
+
if "%DIRNAME%" == "" set DIRNAME=.
|
16
|
+
set APP_BASE_NAME=%~n0
|
17
|
+
set APP_HOME=%DIRNAME%
|
18
|
+
|
19
|
+
@rem Find java.exe
|
20
|
+
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
|
+
|
22
|
+
set JAVA_EXE=java.exe
|
23
|
+
%JAVA_EXE% -version >NUL 2>&1
|
24
|
+
if "%ERRORLEVEL%" == "0" goto init
|
25
|
+
|
26
|
+
echo.
|
27
|
+
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
28
|
+
echo.
|
29
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
30
|
+
echo location of your Java installation.
|
31
|
+
|
32
|
+
goto fail
|
33
|
+
|
34
|
+
:findJavaFromJavaHome
|
35
|
+
set JAVA_HOME=%JAVA_HOME:"=%
|
36
|
+
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
37
|
+
|
38
|
+
if exist "%JAVA_EXE%" goto init
|
39
|
+
|
40
|
+
echo.
|
41
|
+
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
42
|
+
echo.
|
43
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
44
|
+
echo location of your Java installation.
|
45
|
+
|
46
|
+
goto fail
|
47
|
+
|
48
|
+
:init
|
49
|
+
@rem Get command-line arguments, handling Windowz variants
|
50
|
+
|
51
|
+
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
+
if "%@eval[2+2]" == "4" goto 4NT_args
|
53
|
+
|
54
|
+
:win9xME_args
|
55
|
+
@rem Slurp the command line arguments.
|
56
|
+
set CMD_LINE_ARGS=
|
57
|
+
set _SKIP=2
|
58
|
+
|
59
|
+
:win9xME_args_slurp
|
60
|
+
if "x%~1" == "x" goto execute
|
61
|
+
|
62
|
+
set CMD_LINE_ARGS=%*
|
63
|
+
goto execute
|
64
|
+
|
65
|
+
:4NT_args
|
66
|
+
@rem Get arguments from the 4NT Shell from JP Software
|
67
|
+
set CMD_LINE_ARGS=%$
|
68
|
+
|
69
|
+
:execute
|
70
|
+
@rem Setup the command line
|
71
|
+
|
72
|
+
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
73
|
+
|
74
|
+
@rem Execute Gradle
|
75
|
+
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
76
|
+
|
77
|
+
:end
|
78
|
+
@rem End local scope for the variables with windows NT shell
|
79
|
+
if "%ERRORLEVEL%"=="0" goto mainEnd
|
80
|
+
|
81
|
+
:fail
|
82
|
+
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
83
|
+
rem the _cmd.exe /c_ return code!
|
84
|
+
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
85
|
+
exit /b 1
|
86
|
+
|
87
|
+
:mainEnd
|
88
|
+
if "%OS%"=="Windows_NT" endlocal
|
89
|
+
|
90
|
+
:omega
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
# TODO implement guess plugin to make this command work:
|
5
|
+
# $ embulk guess -g "csv_with_default_value" partial-config.yml
|
6
|
+
#
|
7
|
+
# Depending on the file format the plugin uses, you can choose
|
8
|
+
# one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
|
9
|
+
# or line guess (LineGuessPlugin).
|
10
|
+
|
11
|
+
#class CsvWithDefaultValue < GuessPlugin
|
12
|
+
# Plugin.register_guess("csv_with_default_value", self)
|
13
|
+
#
|
14
|
+
# def guess(config, sample_buffer)
|
15
|
+
# if sample_buffer[0,2] == GZIP_HEADER
|
16
|
+
# guessed = {}
|
17
|
+
# guessed["type"] = "csv_with_default_value"
|
18
|
+
# guessed["property1"] = "guessed-value"
|
19
|
+
# return {"parser" => guessed}
|
20
|
+
# else
|
21
|
+
# return {}
|
22
|
+
# end
|
23
|
+
# end
|
24
|
+
#end
|
25
|
+
|
26
|
+
#class CsvWithDefaultValue < TextGuessPlugin
|
27
|
+
# Plugin.register_guess("csv_with_default_value", self)
|
28
|
+
#
|
29
|
+
# def guess_text(config, sample_text)
|
30
|
+
# js = JSON.parse(sample_text) rescue nil
|
31
|
+
# if js && js["mykeyword"] == "keyword"
|
32
|
+
# guessed = {}
|
33
|
+
# guessed["type"] = "csv_with_default_value"
|
34
|
+
# guessed["property1"] = "guessed-value"
|
35
|
+
# return {"parser" => guessed}
|
36
|
+
# else
|
37
|
+
# return {}
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
#end
|
41
|
+
|
42
|
+
#class CsvWithDefaultValue < LineGuessPlugin
|
43
|
+
# Plugin.register_guess("csv_with_default_value", self)
|
44
|
+
#
|
45
|
+
# def guess_lines(config, sample_lines)
|
46
|
+
# all_line_matched = sample_lines.all? do |line|
|
47
|
+
# line =~ /mypattern/
|
48
|
+
# end
|
49
|
+
# if all_line_matched
|
50
|
+
# guessed = {}
|
51
|
+
# guessed["type"] = "csv_with_default_value"
|
52
|
+
# guessed["property1"] = "guessed-value"
|
53
|
+
# return {"parser" => guessed}
|
54
|
+
# else
|
55
|
+
# return {}
|
56
|
+
# end
|
57
|
+
# end
|
58
|
+
#end
|
59
|
+
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
package org.embulk.parser.csv_with_default_value;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonValue;
|
5
|
+
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
6
|
+
import com.google.common.base.Joiner;
|
7
|
+
import com.google.common.base.Optional;
|
8
|
+
import com.google.common.collect.ImmutableMap;
|
9
|
+
import com.google.common.collect.ImmutableSet;
|
10
|
+
import org.embulk.config.Config;
|
11
|
+
import org.embulk.config.ConfigDefault;
|
12
|
+
import org.embulk.config.ConfigException;
|
13
|
+
import org.embulk.spi.Column;
|
14
|
+
import org.embulk.spi.PageBuilder;
|
15
|
+
import org.embulk.spi.time.TimestampParseException;
|
16
|
+
import org.embulk.spi.time.TimestampParser;
|
17
|
+
import org.embulk.spi.type.Type;
|
18
|
+
import org.embulk.spi.type.Types;
|
19
|
+
|
20
|
+
import java.util.Locale;
|
21
|
+
import java.util.Map;
|
22
|
+
import java.util.Set;
|
23
|
+
|
24
|
+
@JsonDeserialize(as = ColumnDefaultValueImpl.class)
|
25
|
+
public interface ColumnDefaultValue {
|
26
|
+
|
27
|
+
Set<Type> ALLOWED_TYPES = ImmutableSet.<Type>of(Types.LONG, Types.DOUBLE, Types.DOUBLE, Types.TIMESTAMP);
|
28
|
+
String ALLOWED_TYPES_NAME = Joiner.on(",").join(ALLOWED_TYPES);
|
29
|
+
|
30
|
+
@Config("default_value")
|
31
|
+
Optional<String> getDefaultValue();
|
32
|
+
|
33
|
+
@Config("type")
|
34
|
+
@ConfigDefault("immediate")
|
35
|
+
ColumnDefaultValue.ValueType getType();
|
36
|
+
|
37
|
+
interface DefaultValueSetter {
|
38
|
+
|
39
|
+
/**
|
40
|
+
* @throws CsvRecordValidateException
|
41
|
+
*/
|
42
|
+
void longValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column);
|
43
|
+
|
44
|
+
/**
|
45
|
+
* @throws CsvRecordValidateException
|
46
|
+
*/
|
47
|
+
void doubleValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column);
|
48
|
+
|
49
|
+
/**
|
50
|
+
* @throws CsvRecordValidateException
|
51
|
+
*/
|
52
|
+
void timestampValue(ColumnDefaultValue value, TimestampParser parser, PageBuilder pageBuilder, Column column);
|
53
|
+
|
54
|
+
}
|
55
|
+
|
56
|
+
enum ValueType implements DefaultValueSetter {
|
57
|
+
IMMEDIATE {
|
58
|
+
@Override
|
59
|
+
public void longValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column) {
|
60
|
+
try {
|
61
|
+
pageBuilder.setLong(column, Long.parseLong(value.getDefaultValue().get()));
|
62
|
+
} catch (NumberFormatException e) {
|
63
|
+
throw new CsvRecordValidateException(e);
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
@Override
|
68
|
+
public void doubleValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column) {
|
69
|
+
try {
|
70
|
+
pageBuilder.setDouble(column, Double.parseDouble(value.getDefaultValue().get()));
|
71
|
+
} catch (NumberFormatException e) {
|
72
|
+
throw new CsvRecordValidateException(e);
|
73
|
+
}
|
74
|
+
|
75
|
+
}
|
76
|
+
|
77
|
+
@Override
|
78
|
+
public void timestampValue(ColumnDefaultValue value, TimestampParser parser, PageBuilder pageBuilder, Column column) {
|
79
|
+
try {
|
80
|
+
pageBuilder.setTimestamp(column, parser.parse(value.getDefaultValue().get()));
|
81
|
+
} catch (TimestampParseException e) {
|
82
|
+
throw new CsvRecordValidateException(e);
|
83
|
+
}
|
84
|
+
}
|
85
|
+
},
|
86
|
+
NULL {
|
87
|
+
@Override
|
88
|
+
public void longValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column) {
|
89
|
+
throw new ConfigException("null value is not allowed for long");
|
90
|
+
}
|
91
|
+
|
92
|
+
@Override
|
93
|
+
public void doubleValue(ColumnDefaultValue value, PageBuilder pageBuilder, Column column) {
|
94
|
+
throw new ConfigException("null value is not allowed for double");
|
95
|
+
}
|
96
|
+
|
97
|
+
@Override
|
98
|
+
public void timestampValue(ColumnDefaultValue value, TimestampParser parser, PageBuilder pageBuilder, Column column) {
|
99
|
+
pageBuilder.setNull(column);
|
100
|
+
}
|
101
|
+
};
|
102
|
+
|
103
|
+
@JsonValue
|
104
|
+
@Override
|
105
|
+
public String toString() {
|
106
|
+
return name().toLowerCase(Locale.ENGLISH);
|
107
|
+
}
|
108
|
+
|
109
|
+
|
110
|
+
@JsonCreator
|
111
|
+
public static ValueType fromString(String value) {
|
112
|
+
Map<String, ValueType> types = ImmutableMap.of(IMMEDIATE.toString().toLowerCase(), IMMEDIATE, NULL.toString().toLowerCase(), NULL);
|
113
|
+
ValueType type = types.get(value);
|
114
|
+
if (type != null) {
|
115
|
+
return type;
|
116
|
+
} else {
|
117
|
+
throw new ConfigException(String.format("Unknown value_type '%s', Supported getType are immediate, null.", value));
|
118
|
+
}
|
119
|
+
}
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
package org.embulk.parser.csv_with_default_value;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
4
|
+
import com.google.common.base.Optional;
|
5
|
+
|
6
|
+
public class ColumnDefaultValueImpl implements ColumnDefaultValue {
|
7
|
+
|
8
|
+
@JsonProperty("default_value")
|
9
|
+
private Optional<String> defaultValue;
|
10
|
+
|
11
|
+
@JsonProperty("type")
|
12
|
+
private ValueType type;
|
13
|
+
|
14
|
+
public ColumnDefaultValueImpl(){
|
15
|
+
this(Optional.<String>absent(), ValueType.IMMEDIATE);
|
16
|
+
}
|
17
|
+
|
18
|
+
public ColumnDefaultValueImpl(String defaultValue, ValueType type) {
|
19
|
+
this(Optional.of(defaultValue), type);
|
20
|
+
}
|
21
|
+
|
22
|
+
public ColumnDefaultValueImpl(Optional<String> defaultValue, ValueType type){
|
23
|
+
this.defaultValue = defaultValue;
|
24
|
+
this.type = type;
|
25
|
+
}
|
26
|
+
|
27
|
+
@Override
|
28
|
+
public Optional<String> getDefaultValue() {
|
29
|
+
return defaultValue;
|
30
|
+
}
|
31
|
+
|
32
|
+
@Override
|
33
|
+
public ValueType getType() {
|
34
|
+
return type;
|
35
|
+
}
|
36
|
+
|
37
|
+
@Override
|
38
|
+
public boolean equals(Object o) {
|
39
|
+
if (this == o) return true;
|
40
|
+
if (o == null || ! (o instanceof ColumnDefaultValue)) return false;
|
41
|
+
|
42
|
+
ColumnDefaultValue that = (ColumnDefaultValue) o;
|
43
|
+
|
44
|
+
if(getDefaultValue().isPresent() != that.getDefaultValue().isPresent()){
|
45
|
+
return false;
|
46
|
+
}else if(getDefaultValue().isPresent() && that.getDefaultValue().isPresent()){
|
47
|
+
if(!getDefaultValue().get().equals(that.getDefaultValue().get())){
|
48
|
+
return false;
|
49
|
+
}
|
50
|
+
return getType() == that.getType();
|
51
|
+
}else{
|
52
|
+
return getType() == that.getType();
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
@Override
|
57
|
+
public int hashCode() {
|
58
|
+
int result = getDefaultValue() != null ? getDefaultValue().hashCode() : 0;
|
59
|
+
result = 31 * result + (getType() != null ? getType().hashCode() : 0);
|
60
|
+
return result;
|
61
|
+
}
|
62
|
+
|
63
|
+
@Override
|
64
|
+
public String toString(){
|
65
|
+
return String.format("ColumnDefaultValueImpl(type=%s,value=%s)", getType(), getDefaultValue().or("null"));
|
66
|
+
}
|
67
|
+
|
68
|
+
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
package org.embulk.parser.csv_with_default_value;
|
2
|
+
|
3
|
+
import org.embulk.spi.DataException;
|
4
|
+
|
5
|
+
/**
|
6
|
+
* Created by k.sasaki on 2016/10/25.
|
7
|
+
*/
|
8
|
+
class CsvRecordValidateException
|
9
|
+
extends DataException {
|
10
|
+
CsvRecordValidateException(Throwable cause) {
|
11
|
+
super(cause);
|
12
|
+
}
|
13
|
+
}
|
@@ -0,0 +1,512 @@
|
|
1
|
+
package org.embulk.parser.csv_with_default_value;
|
2
|
+
|
3
|
+
import com.google.common.base.Preconditions;
|
4
|
+
import java.util.List;
|
5
|
+
import java.util.ArrayList;
|
6
|
+
import java.util.Deque;
|
7
|
+
import java.util.ArrayDeque;
|
8
|
+
import org.embulk.spi.DataException;
|
9
|
+
import org.embulk.spi.util.LineDecoder;
|
10
|
+
import org.embulk.config.ConfigException;
|
11
|
+
|
12
|
+
public class CsvTokenizer
|
13
|
+
{
|
14
|
+
enum RecordState
|
15
|
+
{
|
16
|
+
NOT_END, END,
|
17
|
+
}
|
18
|
+
|
19
|
+
enum ColumnState
|
20
|
+
{
|
21
|
+
BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
|
22
|
+
}
|
23
|
+
|
24
|
+
private static final char END_OF_LINE = '\0';
|
25
|
+
static final char NO_QUOTE = '\0';
|
26
|
+
static final char NO_ESCAPE = '\0';
|
27
|
+
|
28
|
+
private final char delimiterChar;
|
29
|
+
private final String delimiterFollowingString;
|
30
|
+
private final char quote;
|
31
|
+
private final char escape;
|
32
|
+
private final String newline;
|
33
|
+
private final boolean trimIfNotQuoted;
|
34
|
+
private final long maxQuotedSizeLimit;
|
35
|
+
private final String commentLineMarker;
|
36
|
+
private final LineDecoder input;
|
37
|
+
private final String nullStringOrNull;
|
38
|
+
|
39
|
+
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
40
|
+
private long lineNumber = 0;
|
41
|
+
|
42
|
+
private String line = null;
|
43
|
+
private int linePos = 0;
|
44
|
+
private boolean wasQuotedColumn = false;
|
45
|
+
private List<String> quotedValueLines = new ArrayList<>();
|
46
|
+
private Deque<String> unreadLines = new ArrayDeque<>();
|
47
|
+
|
48
|
+
public CsvTokenizer(LineDecoder input, CsvWithDefaultValueParserPlugin.PluginTask task)
|
49
|
+
{
|
50
|
+
String delimiter = task.getDelimiter();
|
51
|
+
if (delimiter.length() == 0) {
|
52
|
+
throw new ConfigException("Empty delimiter is not allowed");
|
53
|
+
} else {
|
54
|
+
this.delimiterChar = delimiter.charAt(0);
|
55
|
+
if (delimiter.length() > 1) {
|
56
|
+
delimiterFollowingString = delimiter.substring(1);
|
57
|
+
} else {
|
58
|
+
delimiterFollowingString = null;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
quote = task.getQuoteChar().or(CsvWithDefaultValueParserPlugin.QuoteCharacter.noQuote()).getCharacter();
|
62
|
+
escape = task.getEscapeChar().or(CsvWithDefaultValueParserPlugin.EscapeCharacter.noEscape()).getCharacter();
|
63
|
+
newline = task.getNewline().getString();
|
64
|
+
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
65
|
+
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
66
|
+
commentLineMarker = task.getCommentLineMarker().orNull();
|
67
|
+
nullStringOrNull = task.getNullString().orNull();
|
68
|
+
this.input = input;
|
69
|
+
}
|
70
|
+
|
71
|
+
public long getCurrentLineNumber()
|
72
|
+
{
|
73
|
+
return lineNumber;
|
74
|
+
}
|
75
|
+
|
76
|
+
public boolean skipHeaderLine()
|
77
|
+
{
|
78
|
+
boolean skipped = input.poll() != null;
|
79
|
+
if (skipped) {
|
80
|
+
lineNumber++;
|
81
|
+
}
|
82
|
+
return skipped;
|
83
|
+
}
|
84
|
+
|
85
|
+
// returns skipped line
|
86
|
+
public String skipCurrentLine()
|
87
|
+
{
|
88
|
+
String skippedLine;
|
89
|
+
if (quotedValueLines.isEmpty()) {
|
90
|
+
skippedLine = line;
|
91
|
+
} else {
|
92
|
+
// recover lines of quoted value
|
93
|
+
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
94
|
+
unreadLines.addAll(quotedValueLines);
|
95
|
+
lineNumber -= quotedValueLines.size();
|
96
|
+
if (line != null) {
|
97
|
+
unreadLines.add(line);
|
98
|
+
lineNumber -= 1;
|
99
|
+
}
|
100
|
+
quotedValueLines.clear();
|
101
|
+
}
|
102
|
+
recordState = RecordState.END;
|
103
|
+
return skippedLine;
|
104
|
+
}
|
105
|
+
|
106
|
+
public boolean nextFile()
|
107
|
+
{
|
108
|
+
boolean next = input.nextFile();
|
109
|
+
if (next) {
|
110
|
+
lineNumber = 0;
|
111
|
+
}
|
112
|
+
return next;
|
113
|
+
}
|
114
|
+
|
115
|
+
// used by guess-csv
|
116
|
+
public boolean nextRecord()
|
117
|
+
{
|
118
|
+
return nextRecord(true);
|
119
|
+
}
|
120
|
+
|
121
|
+
public boolean nextRecord(boolean skipEmptyLine)
|
122
|
+
{
|
123
|
+
// If at the end of record, read the next line and initialize the state
|
124
|
+
if (recordState != RecordState.END) {
|
125
|
+
throw new TooManyColumnsException("Too many columns");
|
126
|
+
}
|
127
|
+
|
128
|
+
boolean hasNext = nextLine(skipEmptyLine);
|
129
|
+
if (hasNext) {
|
130
|
+
recordState = RecordState.NOT_END;
|
131
|
+
return true;
|
132
|
+
} else {
|
133
|
+
return false;
|
134
|
+
}
|
135
|
+
}
|
136
|
+
|
137
|
+
private boolean nextLine(boolean skipEmptyLine)
|
138
|
+
{
|
139
|
+
while (true) {
|
140
|
+
if (!unreadLines.isEmpty()) {
|
141
|
+
line = unreadLines.removeFirst();
|
142
|
+
} else {
|
143
|
+
line = input.poll();
|
144
|
+
if (line == null) {
|
145
|
+
return false;
|
146
|
+
}
|
147
|
+
}
|
148
|
+
linePos = 0;
|
149
|
+
lineNumber++;
|
150
|
+
|
151
|
+
boolean skip = skipEmptyLine && (
|
152
|
+
line.isEmpty() ||
|
153
|
+
(commentLineMarker != null && line.startsWith(commentLineMarker)));
|
154
|
+
if (!skip) {
|
155
|
+
return true;
|
156
|
+
}
|
157
|
+
}
|
158
|
+
}
|
159
|
+
|
160
|
+
public boolean hasNextColumn()
|
161
|
+
{
|
162
|
+
return recordState == RecordState.NOT_END;
|
163
|
+
}
|
164
|
+
|
165
|
+
public String nextColumn()
|
166
|
+
{
|
167
|
+
if (!hasNextColumn()) {
|
168
|
+
throw new TooFewColumnsException("Too few columns");
|
169
|
+
}
|
170
|
+
|
171
|
+
// reset last state
|
172
|
+
wasQuotedColumn = false;
|
173
|
+
quotedValueLines.clear();
|
174
|
+
|
175
|
+
// local state
|
176
|
+
int valueStartPos = linePos;
|
177
|
+
int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
|
178
|
+
StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
|
179
|
+
ColumnState columnState = ColumnState.BEGIN;
|
180
|
+
|
181
|
+
while (true) {
|
182
|
+
final char c = nextChar();
|
183
|
+
|
184
|
+
switch (columnState) {
|
185
|
+
case BEGIN:
|
186
|
+
// TODO optimization: state is BEGIN only at the first character of a column.
|
187
|
+
// this block can be out of the looop.
|
188
|
+
if (isDelimiter(c)) {
|
189
|
+
// empty value
|
190
|
+
if (delimiterFollowingString == null) {
|
191
|
+
return "";
|
192
|
+
} else if (isDelimiterFollowingFrom(linePos)) {
|
193
|
+
linePos += delimiterFollowingString.length();
|
194
|
+
return "";
|
195
|
+
}
|
196
|
+
// not a delimiter
|
197
|
+
}
|
198
|
+
if (isEndOfLine(c)) {
|
199
|
+
// empty value
|
200
|
+
recordState = RecordState.END;
|
201
|
+
return "";
|
202
|
+
|
203
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
204
|
+
columnState = ColumnState.FIRST_TRIM;
|
205
|
+
|
206
|
+
} else if (isQuote(c)) {
|
207
|
+
valueStartPos = linePos; // == 1
|
208
|
+
wasQuotedColumn = true;
|
209
|
+
quotedValue = new StringBuilder();
|
210
|
+
columnState = ColumnState.QUOTED_VALUE;
|
211
|
+
|
212
|
+
} else {
|
213
|
+
columnState = ColumnState.VALUE;
|
214
|
+
}
|
215
|
+
break;
|
216
|
+
|
217
|
+
case FIRST_TRIM:
|
218
|
+
if (isDelimiter(c)) {
|
219
|
+
// empty value
|
220
|
+
if (delimiterFollowingString == null) {
|
221
|
+
return "";
|
222
|
+
} else if (isDelimiterFollowingFrom(linePos)) {
|
223
|
+
linePos += delimiterFollowingString.length();
|
224
|
+
return "";
|
225
|
+
}
|
226
|
+
// not a delimiter
|
227
|
+
}
|
228
|
+
if (isEndOfLine(c)) {
|
229
|
+
// empty value
|
230
|
+
recordState = RecordState.END;
|
231
|
+
return "";
|
232
|
+
|
233
|
+
} else if (isQuote(c)) {
|
234
|
+
// column has heading spaces and quoted. TODO should this be rejected?
|
235
|
+
valueStartPos = linePos;
|
236
|
+
wasQuotedColumn = true;
|
237
|
+
quotedValue = new StringBuilder();
|
238
|
+
columnState = ColumnState.QUOTED_VALUE;
|
239
|
+
|
240
|
+
} else if (isSpace(c)) {
|
241
|
+
// skip this character
|
242
|
+
|
243
|
+
} else {
|
244
|
+
valueStartPos = linePos - 1;
|
245
|
+
columnState = ColumnState.VALUE;
|
246
|
+
}
|
247
|
+
break;
|
248
|
+
|
249
|
+
case VALUE:
|
250
|
+
if (isDelimiter(c)) {
|
251
|
+
if (delimiterFollowingString == null) {
|
252
|
+
return line.substring(valueStartPos, linePos - 1);
|
253
|
+
} else if (isDelimiterFollowingFrom(linePos)) {
|
254
|
+
String value = line.substring(valueStartPos, linePos - 1);
|
255
|
+
linePos += delimiterFollowingString.length();
|
256
|
+
return value;
|
257
|
+
}
|
258
|
+
// not a delimiter
|
259
|
+
}
|
260
|
+
if (isEndOfLine(c)) {
|
261
|
+
recordState = RecordState.END;
|
262
|
+
return line.substring(valueStartPos, linePos);
|
263
|
+
|
264
|
+
} else if (isSpace(c) && trimIfNotQuoted) {
|
265
|
+
valueEndPos = linePos - 1; // this is possibly end of value
|
266
|
+
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
267
|
+
|
268
|
+
// TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
|
269
|
+
//} else if (isQuote(c)) {
|
270
|
+
// // In RFC4180, If fields are not enclosed with double quotes, then
|
271
|
+
// // double quotes may not appear inside the fields. But they are often
|
272
|
+
// // included in the fields. We should care about them later.
|
273
|
+
|
274
|
+
} else {
|
275
|
+
// keep VALUE state
|
276
|
+
}
|
277
|
+
break;
|
278
|
+
|
279
|
+
case LAST_TRIM_OR_VALUE:
|
280
|
+
if (isDelimiter(c)) {
|
281
|
+
if (delimiterFollowingString == null) {
|
282
|
+
return line.substring(valueStartPos, valueEndPos);
|
283
|
+
} else if (isDelimiterFollowingFrom(linePos)) {
|
284
|
+
linePos += delimiterFollowingString.length();
|
285
|
+
return line.substring(valueStartPos, valueEndPos);
|
286
|
+
} else {
|
287
|
+
// not a delimiter
|
288
|
+
}
|
289
|
+
}
|
290
|
+
if (isEndOfLine(c)) {
|
291
|
+
recordState = RecordState.END;
|
292
|
+
return line.substring(valueStartPos, valueEndPos);
|
293
|
+
|
294
|
+
} else if (isSpace(c)) {
|
295
|
+
// keep LAST_TRIM_OR_VALUE state
|
296
|
+
|
297
|
+
} else {
|
298
|
+
// this spaces are not trailing spaces. go back to VALUE state
|
299
|
+
columnState = ColumnState.VALUE;
|
300
|
+
}
|
301
|
+
break;
|
302
|
+
|
303
|
+
case QUOTED_VALUE:
|
304
|
+
if (isEndOfLine(c)) {
|
305
|
+
// multi-line quoted value
|
306
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
307
|
+
quotedValue.append(newline);
|
308
|
+
quotedValueLines.add(line);
|
309
|
+
if (!nextLine(false)) {
|
310
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
311
|
+
}
|
312
|
+
valueStartPos = 0;
|
313
|
+
|
314
|
+
} else if (isQuote(c)) {
|
315
|
+
char next = peekNextChar();
|
316
|
+
if (isQuote(next)) { // escaped quote
|
317
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
318
|
+
valueStartPos = ++linePos;
|
319
|
+
} else {
|
320
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
321
|
+
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
322
|
+
}
|
323
|
+
|
324
|
+
} else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
325
|
+
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
326
|
+
char next = peekNextChar();
|
327
|
+
if (isEndOfLine(c)) {
|
328
|
+
// escape end of line. TODO assuming multi-line quoted value without newline?
|
329
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
330
|
+
quotedValueLines.add(line);
|
331
|
+
if (!nextLine(false)) {
|
332
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
333
|
+
}
|
334
|
+
valueStartPos = 0;
|
335
|
+
} else if (isQuote(next) || isEscape(next)) { // escaped quote
|
336
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
337
|
+
quotedValue.append(next);
|
338
|
+
valueStartPos = ++linePos;
|
339
|
+
}
|
340
|
+
|
341
|
+
} else {
|
342
|
+
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
343
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
|
344
|
+
}
|
345
|
+
// keep QUOTED_VALUE state
|
346
|
+
}
|
347
|
+
break;
|
348
|
+
|
349
|
+
case AFTER_QUOTED_VALUE:
|
350
|
+
if (isDelimiter(c)) {
|
351
|
+
if (delimiterFollowingString == null) {
|
352
|
+
return quotedValue.toString();
|
353
|
+
} else if (isDelimiterFollowingFrom(linePos)) {
|
354
|
+
linePos += delimiterFollowingString.length();
|
355
|
+
return quotedValue.toString();
|
356
|
+
}
|
357
|
+
// not a delimiter
|
358
|
+
}
|
359
|
+
if (isEndOfLine(c)) {
|
360
|
+
recordState = RecordState.END;
|
361
|
+
return quotedValue.toString();
|
362
|
+
|
363
|
+
} else if (isSpace(c)) {
|
364
|
+
// column has trailing spaces and quoted. TODO should this be rejected?
|
365
|
+
|
366
|
+
} else {
|
367
|
+
throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
|
368
|
+
}
|
369
|
+
break;
|
370
|
+
|
371
|
+
default:
|
372
|
+
assert false;
|
373
|
+
}
|
374
|
+
}
|
375
|
+
}
|
376
|
+
|
377
|
+
public String nextColumnOrNull()
|
378
|
+
{
|
379
|
+
String v = nextColumn();
|
380
|
+
if (nullStringOrNull == null) {
|
381
|
+
if (v.isEmpty()) {
|
382
|
+
if (wasQuotedColumn) {
|
383
|
+
return "";
|
384
|
+
}
|
385
|
+
else {
|
386
|
+
return null;
|
387
|
+
}
|
388
|
+
}
|
389
|
+
else {
|
390
|
+
return v;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
else {
|
394
|
+
if (v.equals(nullStringOrNull)) {
|
395
|
+
return null;
|
396
|
+
}
|
397
|
+
else {
|
398
|
+
return v;
|
399
|
+
}
|
400
|
+
}
|
401
|
+
}
|
402
|
+
|
403
|
+
public boolean wasQuotedColumn()
|
404
|
+
{
|
405
|
+
return wasQuotedColumn;
|
406
|
+
}
|
407
|
+
|
408
|
+
private char nextChar()
|
409
|
+
{
|
410
|
+
Preconditions.checkState(line != null, "nextColumn is called after end of file");
|
411
|
+
|
412
|
+
if (linePos >= line.length()) {
|
413
|
+
return END_OF_LINE;
|
414
|
+
} else {
|
415
|
+
return line.charAt(linePos++);
|
416
|
+
}
|
417
|
+
}
|
418
|
+
|
419
|
+
private char peekNextChar()
|
420
|
+
{
|
421
|
+
Preconditions.checkState(line != null, "peekNextChar is called after end of file");
|
422
|
+
|
423
|
+
if (linePos >= line.length()) {
|
424
|
+
return END_OF_LINE;
|
425
|
+
} else {
|
426
|
+
return line.charAt(linePos);
|
427
|
+
}
|
428
|
+
}
|
429
|
+
|
430
|
+
private boolean isSpace(char c)
|
431
|
+
{
|
432
|
+
return c == ' ';
|
433
|
+
}
|
434
|
+
|
435
|
+
private boolean isDelimiterFollowingFrom(int pos)
|
436
|
+
{
|
437
|
+
if (line.length() < pos + delimiterFollowingString.length()) {
|
438
|
+
return false;
|
439
|
+
}
|
440
|
+
for (int i = 0; i < delimiterFollowingString.length(); i++) {
|
441
|
+
if (delimiterFollowingString.charAt(i) != line.charAt(pos + i)) {
|
442
|
+
return false;
|
443
|
+
}
|
444
|
+
}
|
445
|
+
return true;
|
446
|
+
}
|
447
|
+
|
448
|
+
private boolean isDelimiter(char c)
|
449
|
+
{
|
450
|
+
return c == delimiterChar;
|
451
|
+
}
|
452
|
+
|
453
|
+
private boolean isEndOfLine(char c)
|
454
|
+
{
|
455
|
+
return c == END_OF_LINE;
|
456
|
+
}
|
457
|
+
|
458
|
+
private boolean isQuote(char c)
|
459
|
+
{
|
460
|
+
return quote != NO_QUOTE && c == quote;
|
461
|
+
}
|
462
|
+
|
463
|
+
private boolean isEscape(char c)
|
464
|
+
{
|
465
|
+
return escape != NO_ESCAPE && c == escape;
|
466
|
+
}
|
467
|
+
|
468
|
+
public static class InvalidFormatException
|
469
|
+
extends DataException
|
470
|
+
{
|
471
|
+
public InvalidFormatException(String message)
|
472
|
+
{
|
473
|
+
super(message);
|
474
|
+
}
|
475
|
+
}
|
476
|
+
|
477
|
+
public static class InvalidValueException
|
478
|
+
extends DataException
|
479
|
+
{
|
480
|
+
public InvalidValueException(String message)
|
481
|
+
{
|
482
|
+
super(message);
|
483
|
+
}
|
484
|
+
}
|
485
|
+
|
486
|
+
public static class QuotedSizeLimitExceededException
|
487
|
+
extends InvalidValueException
|
488
|
+
{
|
489
|
+
public QuotedSizeLimitExceededException(String message)
|
490
|
+
{
|
491
|
+
super(message);
|
492
|
+
}
|
493
|
+
}
|
494
|
+
|
495
|
+
public class TooManyColumnsException
|
496
|
+
extends InvalidFormatException
|
497
|
+
{
|
498
|
+
public TooManyColumnsException(String message)
|
499
|
+
{
|
500
|
+
super(message);
|
501
|
+
}
|
502
|
+
}
|
503
|
+
|
504
|
+
public class TooFewColumnsException
|
505
|
+
extends InvalidFormatException
|
506
|
+
{
|
507
|
+
public TooFewColumnsException(String message)
|
508
|
+
{
|
509
|
+
super(message);
|
510
|
+
}
|
511
|
+
}
|
512
|
+
}
|