embulk-input-marketo-extended 0.6.18
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +37 -0
- data/.gitignore +14 -0
- data/.travis.yml +6 -0
- data/CHANGELOG.md +170 -0
- data/LICENSE.txt +21 -0
- data/README.md +213 -0
- data/build.gradle +103 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +169 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/input/marketo.rb +3 -0
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/input/marketo/CsvTokenizer.java +700 -0
- data/src/main/java/org/embulk/input/marketo/MarketoInputPlugin.java +15 -0
- data/src/main/java/org/embulk/input/marketo/MarketoInputPluginDelegate.java +100 -0
- data/src/main/java/org/embulk/input/marketo/MarketoService.java +38 -0
- data/src/main/java/org/embulk/input/marketo/MarketoServiceImpl.java +245 -0
- data/src/main/java/org/embulk/input/marketo/MarketoUtils.java +212 -0
- data/src/main/java/org/embulk/input/marketo/delegate/ActivityBulkExtractInputPlugin.java +167 -0
- data/src/main/java/org/embulk/input/marketo/delegate/CampaignInputPlugin.java +48 -0
- data/src/main/java/org/embulk/input/marketo/delegate/CustomObjectInputPlugin.java +75 -0
- data/src/main/java/org/embulk/input/marketo/delegate/CustomObjectResponseMapperBuilder.java +81 -0
- data/src/main/java/org/embulk/input/marketo/delegate/LeadBulkExtractInputPlugin.java +66 -0
- data/src/main/java/org/embulk/input/marketo/delegate/LeadServiceResponseMapperBuilder.java +85 -0
- data/src/main/java/org/embulk/input/marketo/delegate/LeadWithListInputPlugin.java +64 -0
- data/src/main/java/org/embulk/input/marketo/delegate/LeadWithProgramInputPlugin.java +60 -0
- data/src/main/java/org/embulk/input/marketo/delegate/MarketoBaseBulkExtractInputPlugin.java +441 -0
- data/src/main/java/org/embulk/input/marketo/delegate/MarketoBaseInputPluginDelegate.java +92 -0
- data/src/main/java/org/embulk/input/marketo/delegate/ProgramInputPlugin.java +228 -0
- data/src/main/java/org/embulk/input/marketo/exception/MarketoAPIException.java +30 -0
- data/src/main/java/org/embulk/input/marketo/model/BulkExtractRangeHeader.java +26 -0
- data/src/main/java/org/embulk/input/marketo/model/MarketoAccessTokenResponse.java +92 -0
- data/src/main/java/org/embulk/input/marketo/model/MarketoBulkExtractRequest.java +68 -0
- data/src/main/java/org/embulk/input/marketo/model/MarketoError.java +40 -0
- data/src/main/java/org/embulk/input/marketo/model/MarketoField.java +126 -0
- data/src/main/java/org/embulk/input/marketo/model/MarketoResponse.java +82 -0
- data/src/main/java/org/embulk/input/marketo/model/filter/DateRangeFilter.java +40 -0
- data/src/main/java/org/embulk/input/marketo/rest/MarketoBaseRestClient.java +306 -0
- data/src/main/java/org/embulk/input/marketo/rest/MarketoInputStreamResponseEntityReader.java +69 -0
- data/src/main/java/org/embulk/input/marketo/rest/MarketoRESTEndpoint.java +47 -0
- data/src/main/java/org/embulk/input/marketo/rest/MarketoResponseJetty92EntityReader.java +89 -0
- data/src/main/java/org/embulk/input/marketo/rest/MarketoRestClient.java +569 -0
- data/src/main/java/org/embulk/input/marketo/rest/RecordPagingIterable.java +180 -0
- data/src/test/java/org/embulk/input/marketo/MarketoServiceImplTest.java +140 -0
- data/src/test/java/org/embulk/input/marketo/MarketoUtilsTest.java +87 -0
- data/src/test/java/org/embulk/input/marketo/delegate/ActivityBulkExtractInputPluginTest.java +128 -0
- data/src/test/java/org/embulk/input/marketo/delegate/CampaignInputPluginTest.java +73 -0
- data/src/test/java/org/embulk/input/marketo/delegate/CustomObjectInputPluginTest.java +102 -0
- data/src/test/java/org/embulk/input/marketo/delegate/LeadBulkExtractInputPluginTest.java +99 -0
- data/src/test/java/org/embulk/input/marketo/delegate/LeadServiceResponseMapperBuilderTest.java +119 -0
- data/src/test/java/org/embulk/input/marketo/delegate/LeadWithListInputPluginTest.java +101 -0
- data/src/test/java/org/embulk/input/marketo/delegate/LeadWithProgramInputPluginTest.java +103 -0
- data/src/test/java/org/embulk/input/marketo/delegate/MarketoBaseBulkExtractInputPluginTest.java +169 -0
- data/src/test/java/org/embulk/input/marketo/delegate/ProgramInputPluginTest.java +343 -0
- data/src/test/java/org/embulk/input/marketo/rest/MarketoBaseRestClientTest.java +368 -0
- data/src/test/java/org/embulk/input/marketo/rest/MarketoRestClientTest.java +584 -0
- data/src/test/resources/config/activity_bulk_extract_config.yaml +7 -0
- data/src/test/resources/config/custom_object_config.yaml +8 -0
- data/src/test/resources/config/lead_bulk_extract_config.yaml +8 -0
- data/src/test/resources/config/rest_config.yaml +3 -0
- data/src/test/resources/fixtures/activity_extract1.csv +35 -0
- data/src/test/resources/fixtures/activity_extract2.csv +22 -0
- data/src/test/resources/fixtures/activity_types.json +22 -0
- data/src/test/resources/fixtures/all_program_full.json +53 -0
- data/src/test/resources/fixtures/campaign_response.json +38 -0
- data/src/test/resources/fixtures/campaign_response_full.json +102 -0
- data/src/test/resources/fixtures/custom_object_describe.json +124 -0
- data/src/test/resources/fixtures/custom_object_describe_marketo_fields_full.json +22 -0
- data/src/test/resources/fixtures/custom_object_expected.json +66 -0
- data/src/test/resources/fixtures/custom_object_response.json +24 -0
- data/src/test/resources/fixtures/custom_object_response_full.json +23 -0
- data/src/test/resources/fixtures/lead_by_list.json +33 -0
- data/src/test/resources/fixtures/lead_by_program_response.json +47 -0
- data/src/test/resources/fixtures/lead_describe.json +221 -0
- data/src/test/resources/fixtures/lead_describe_expected.json +66 -0
- data/src/test/resources/fixtures/lead_describe_marketo_fields_full.json +518 -0
- data/src/test/resources/fixtures/lead_extract1.csv +11 -0
- data/src/test/resources/fixtures/lead_response_full.json +2402 -0
- data/src/test/resources/fixtures/lead_with_program_full.json +17 -0
- data/src/test/resources/fixtures/leads_extract2.csv +10 -0
- data/src/test/resources/fixtures/list_reponse_full.json +191 -0
- data/src/test/resources/fixtures/lists_response.json +31 -0
- data/src/test/resources/fixtures/program_response.json +71 -0
- metadata +171 -0
data/gradlew.bat
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem  Locates a java.exe (JAVA_HOME first, then PATH) and launches
@rem  org.gradle.wrapper.GradleWrapperMain from gradle\wrapper\gradle-wrapper.jar,
@rem  forwarding all command-line arguments. Set DEBUG to any value to keep
@rem  command echoing on.
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

@rem %~dp0 is the drive and directory of this script; fall back to "." if empty.
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
@rem %~n0 is this script's file name without extension.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

@rem No JAVA_HOME: probe for a java.exe on PATH by running "java -version".
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
@rem Strip any double quotes from JAVA_HOME before building the executable path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windows variants

@rem NOTE: the label below immediately follows, so both outcomes of this test
@rem continue at :win9xME_args.
if not "%OS%" == "Windows_NT" goto win9xME_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
@rem NOTE: _SKIP is not referenced anywhere else in this script.
set _SKIP=2

:win9xME_args_slurp
@rem With no arguments CMD_LINE_ARGS stays empty; otherwise take them all verbatim.
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
@rem A zero exit code from the JVM skips the failure handling below.
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
|
data/settings.gradle
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
// Name of the root Gradle project for this build.
rootProject.name = "embulk-input-marketo-extended"
|
@@ -0,0 +1,700 @@
|
|
1
|
+
package org.embulk.input.marketo;
|
2
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
3
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonValue;
|
5
|
+
import com.google.common.base.Optional;
|
6
|
+
import com.google.common.base.Preconditions;
|
7
|
+
import org.embulk.config.Config;
|
8
|
+
import org.embulk.config.ConfigDefault;
|
9
|
+
import org.embulk.config.ConfigException;
|
10
|
+
import org.embulk.spi.DataException;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.embulk.spi.util.LineDecoder;
|
13
|
+
|
14
|
+
import java.util.ArrayDeque;
|
15
|
+
import java.util.ArrayList;
|
16
|
+
import java.util.Deque;
|
17
|
+
import java.util.List;
|
18
|
+
|
19
|
+
/**
|
20
|
+
* Created by tai.khuu on 9/15/17.
|
21
|
+
*/
|
22
|
+
public class CsvTokenizer
|
23
|
+
{
|
24
|
+
static enum RecordState
|
25
|
+
{
|
26
|
+
NOT_END, END,
|
27
|
+
}
|
28
|
+
|
29
|
+
static enum ColumnState
|
30
|
+
{
|
31
|
+
BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
|
32
|
+
}
|
33
|
+
|
34
|
+
private static final char END_OF_LINE = '\0';
|
35
|
+
static final char NO_QUOTE = '\0';
|
36
|
+
static final char NO_ESCAPE = '\0';
|
37
|
+
|
38
|
+
public interface PluginTask extends LineDecoder.DecoderTask
|
39
|
+
{
|
40
|
+
@Config("delimiter")
|
41
|
+
@ConfigDefault("\",\"")
|
42
|
+
String getDelimiter();
|
43
|
+
|
44
|
+
@Config("quote")
|
45
|
+
@ConfigDefault("\"\\\"\"")
|
46
|
+
Optional<QuoteCharacter> getQuoteChar();
|
47
|
+
|
48
|
+
@Config("escape")
|
49
|
+
@ConfigDefault("\"\\\\\"")
|
50
|
+
Optional<EscapeCharacter> getEscapeChar();
|
51
|
+
|
52
|
+
// Null value handling: if the CsvParser found 'non-quoted empty string's,
|
53
|
+
// it replaces them to string that users specified like "\N", "NULL".
|
54
|
+
@Config("null_string")
|
55
|
+
@ConfigDefault("\"null\"")
|
56
|
+
Optional<String> getNullString();
|
57
|
+
|
58
|
+
@Config("trim_if_not_quoted")
|
59
|
+
@ConfigDefault("false")
|
60
|
+
boolean getTrimIfNotQuoted();
|
61
|
+
|
62
|
+
@Config("max_quoted_size_limit")
|
63
|
+
@ConfigDefault("131072") //128kB
|
64
|
+
long getMaxQuotedSizeLimit();
|
65
|
+
|
66
|
+
@Config("comment_line_marker")
|
67
|
+
@ConfigDefault("null")
|
68
|
+
Optional<String> getCommentLineMarker();
|
69
|
+
}
|
70
|
+
|
71
|
+
private final char delimiterChar;
|
72
|
+
private final String delimiterFollowingString;
|
73
|
+
private final char quote;
|
74
|
+
private final char escape;
|
75
|
+
private final String newline;
|
76
|
+
private final boolean trimIfNotQuoted;
|
77
|
+
private final long maxQuotedSizeLimit;
|
78
|
+
private final String commentLineMarker;
|
79
|
+
private final LineDecoder input;
|
80
|
+
private final String nullStringOrNull;
|
81
|
+
|
82
|
+
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
83
|
+
private long lineNumber = 0;
|
84
|
+
|
85
|
+
private String line = null;
|
86
|
+
private int linePos = 0;
|
87
|
+
private boolean wasQuotedColumn = false;
|
88
|
+
private List<String> quotedValueLines = new ArrayList<>();
|
89
|
+
private Deque<String> unreadLines = new ArrayDeque<>();
|
90
|
+
|
91
|
+
public CsvTokenizer(LineDecoder input, PluginTask task)
|
92
|
+
{
|
93
|
+
this(task.getDelimiter(), task.getQuoteChar().or(QuoteCharacter.noQuote()).getCharacter(),
|
94
|
+
task.getEscapeChar().or(EscapeCharacter.noEscape()).getCharacter(), task.getNewline().getString(),
|
95
|
+
task.getTrimIfNotQuoted(), task.getMaxQuotedSizeLimit(), task.getCommentLineMarker().orNull(), input, task.getNullString().orNull());
|
96
|
+
}
|
97
|
+
|
98
|
+
public CsvTokenizer(String delimiter, char quote, char escape, String newline, boolean trimIfNotQuoted, long maxQuotedSizeLimit, String commentLineMarker, LineDecoder input, String nullStringOrNull)
|
99
|
+
{
|
100
|
+
if (delimiter.length() == 0) {
|
101
|
+
throw new ConfigException("Empty delimiter is not allowed");
|
102
|
+
}
|
103
|
+
else {
|
104
|
+
this.delimiterChar = delimiter.charAt(0);
|
105
|
+
if (delimiter.length() > 1) {
|
106
|
+
delimiterFollowingString = delimiter.substring(1);
|
107
|
+
}
|
108
|
+
else {
|
109
|
+
delimiterFollowingString = null;
|
110
|
+
}
|
111
|
+
}
|
112
|
+
this.quote = quote;
|
113
|
+
this.escape = escape;
|
114
|
+
this.newline = newline;
|
115
|
+
this.trimIfNotQuoted = trimIfNotQuoted;
|
116
|
+
this.maxQuotedSizeLimit = maxQuotedSizeLimit;
|
117
|
+
this.commentLineMarker = commentLineMarker;
|
118
|
+
this.input = input;
|
119
|
+
this.nullStringOrNull = nullStringOrNull;
|
120
|
+
}
|
121
|
+
|
122
|
+
public long getCurrentLineNumber()
|
123
|
+
{
|
124
|
+
return lineNumber;
|
125
|
+
}
|
126
|
+
|
127
|
+
public boolean skipHeaderLine()
|
128
|
+
{
|
129
|
+
boolean skipped = input.poll() != null;
|
130
|
+
if (skipped) {
|
131
|
+
lineNumber++;
|
132
|
+
}
|
133
|
+
return skipped;
|
134
|
+
}
|
135
|
+
|
136
|
+
// returns skipped line
|
137
|
+
public String skipCurrentLine()
|
138
|
+
{
|
139
|
+
String skippedLine;
|
140
|
+
if (quotedValueLines.isEmpty()) {
|
141
|
+
skippedLine = line;
|
142
|
+
}
|
143
|
+
else {
|
144
|
+
// recover lines of quoted value
|
145
|
+
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
146
|
+
unreadLines.addAll(quotedValueLines);
|
147
|
+
lineNumber -= quotedValueLines.size();
|
148
|
+
if (line != null) {
|
149
|
+
unreadLines.add(line);
|
150
|
+
lineNumber -= 1;
|
151
|
+
}
|
152
|
+
quotedValueLines.clear();
|
153
|
+
}
|
154
|
+
recordState = RecordState.END;
|
155
|
+
return skippedLine;
|
156
|
+
}
|
157
|
+
|
158
|
+
public boolean nextFile()
|
159
|
+
{
|
160
|
+
boolean next = input.nextFile();
|
161
|
+
if (next) {
|
162
|
+
lineNumber = 0;
|
163
|
+
}
|
164
|
+
return next;
|
165
|
+
}
|
166
|
+
|
167
|
+
// used by guess-csv
|
168
|
+
public boolean nextRecord()
|
169
|
+
{
|
170
|
+
return nextRecord(true);
|
171
|
+
}
|
172
|
+
|
173
|
+
public boolean nextRecord(boolean skipEmptyLine)
|
174
|
+
{
|
175
|
+
// If at the end of record, read the next line and initialize the state
|
176
|
+
if (recordState != RecordState.END) {
|
177
|
+
throw new TooManyColumnsException("Too many columns");
|
178
|
+
}
|
179
|
+
|
180
|
+
boolean hasNext = nextLine(skipEmptyLine);
|
181
|
+
if (hasNext) {
|
182
|
+
recordState = RecordState.NOT_END;
|
183
|
+
return true;
|
184
|
+
}
|
185
|
+
else {
|
186
|
+
return false;
|
187
|
+
}
|
188
|
+
}
|
189
|
+
|
190
|
+
private boolean nextLine(boolean skipEmptyLine)
|
191
|
+
{
|
192
|
+
while (true) {
|
193
|
+
if (!unreadLines.isEmpty()) {
|
194
|
+
line = unreadLines.removeFirst();
|
195
|
+
}
|
196
|
+
else {
|
197
|
+
line = input.poll();
|
198
|
+
if (line == null) {
|
199
|
+
return false;
|
200
|
+
}
|
201
|
+
}
|
202
|
+
linePos = 0;
|
203
|
+
lineNumber++;
|
204
|
+
|
205
|
+
boolean skip = skipEmptyLine && (
|
206
|
+
line.isEmpty() ||
|
207
|
+
(commentLineMarker != null && line.startsWith(commentLineMarker)));
|
208
|
+
if (!skip) {
|
209
|
+
return true;
|
210
|
+
}
|
211
|
+
}
|
212
|
+
}
|
213
|
+
|
214
|
+
public boolean hasNextColumn()
|
215
|
+
{
|
216
|
+
return recordState == RecordState.NOT_END;
|
217
|
+
}
|
218
|
+
|
219
|
+
public String nextColumn()
|
220
|
+
{
|
221
|
+
if (!hasNextColumn()) {
|
222
|
+
throw new TooFewColumnsException("Too few columns");
|
223
|
+
}
|
224
|
+
|
225
|
+
// reset last state
|
226
|
+
wasQuotedColumn = false;
|
227
|
+
quotedValueLines.clear();
|
228
|
+
|
229
|
+
// local state
|
230
|
+
int valueStartPos = linePos;
|
231
|
+
int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
|
232
|
+
StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
|
233
|
+
ColumnState columnState = ColumnState.BEGIN;
|
234
|
+
|
235
|
+
while (true) {
|
236
|
+
final char c = nextChar();
|
237
|
+
|
238
|
+
switch (columnState) {
|
239
|
+
case BEGIN:
|
240
|
+
// TODO optimization: state is BEGIN only at the first character of a column.
|
241
|
+
// this block can be out of the looop.
|
242
|
+
if (isDelimiter(c)) {
|
243
|
+
// empty value
|
244
|
+
if (delimiterFollowingString == null) {
|
245
|
+
return "";
|
246
|
+
}
|
247
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
248
|
+
linePos += delimiterFollowingString.length();
|
249
|
+
return "";
|
250
|
+
}
|
251
|
+
// not a delimiter
|
252
|
+
}
|
253
|
+
if (isEndOfLine(c)) {
|
254
|
+
// empty value
|
255
|
+
recordState = RecordState.END;
|
256
|
+
return "";
|
257
|
+
}
|
258
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
259
|
+
columnState = ColumnState.FIRST_TRIM;
|
260
|
+
}
|
261
|
+
else if (isQuote(c)) {
|
262
|
+
valueStartPos = linePos; // == 1
|
263
|
+
wasQuotedColumn = true;
|
264
|
+
quotedValue = new StringBuilder();
|
265
|
+
columnState = ColumnState.QUOTED_VALUE;
|
266
|
+
}
|
267
|
+
else {
|
268
|
+
columnState = ColumnState.VALUE;
|
269
|
+
}
|
270
|
+
break;
|
271
|
+
|
272
|
+
case FIRST_TRIM:
|
273
|
+
if (isDelimiter(c)) {
|
274
|
+
// empty value
|
275
|
+
if (delimiterFollowingString == null) {
|
276
|
+
return "";
|
277
|
+
}
|
278
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
279
|
+
linePos += delimiterFollowingString.length();
|
280
|
+
return "";
|
281
|
+
}
|
282
|
+
// not a delimiter
|
283
|
+
}
|
284
|
+
if (isEndOfLine(c)) {
|
285
|
+
// empty value
|
286
|
+
recordState = RecordState.END;
|
287
|
+
return "";
|
288
|
+
}
|
289
|
+
else if (isQuote(c)) {
|
290
|
+
// column has heading spaces and quoted. TODO should this be rejected?
|
291
|
+
valueStartPos = linePos;
|
292
|
+
wasQuotedColumn = true;
|
293
|
+
quotedValue = new StringBuilder();
|
294
|
+
columnState = ColumnState.QUOTED_VALUE;
|
295
|
+
}
|
296
|
+
else if (isSpace(c)) {
|
297
|
+
// skip this character
|
298
|
+
} else {
|
299
|
+
valueStartPos = linePos - 1;
|
300
|
+
columnState = ColumnState.VALUE;
|
301
|
+
}
|
302
|
+
break;
|
303
|
+
|
304
|
+
case VALUE:
|
305
|
+
if (isDelimiter(c)) {
|
306
|
+
if (delimiterFollowingString == null) {
|
307
|
+
return line.substring(valueStartPos, linePos - 1);
|
308
|
+
}
|
309
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
310
|
+
String value = line.substring(valueStartPos, linePos - 1);
|
311
|
+
linePos += delimiterFollowingString.length();
|
312
|
+
return value;
|
313
|
+
}
|
314
|
+
// not a delimiter
|
315
|
+
}
|
316
|
+
if (isEndOfLine(c)) {
|
317
|
+
recordState = RecordState.END;
|
318
|
+
return line.substring(valueStartPos, linePos);
|
319
|
+
}
|
320
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
321
|
+
valueEndPos = linePos - 1; // this is possibly end of value
|
322
|
+
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
323
|
+
|
324
|
+
// TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
|
325
|
+
//} else if (isQuote(c)) {
|
326
|
+
// // In RFC4180, If fields are not enclosed with double quotes, then
|
327
|
+
// // double quotes may not appear inside the fields. But they are often
|
328
|
+
// // included in the fields. We should care about them later.
|
329
|
+
}
|
330
|
+
else {
|
331
|
+
// keep VALUE state
|
332
|
+
}
|
333
|
+
break;
|
334
|
+
|
335
|
+
case LAST_TRIM_OR_VALUE:
|
336
|
+
if (isDelimiter(c)) {
|
337
|
+
if (delimiterFollowingString == null) {
|
338
|
+
return line.substring(valueStartPos, valueEndPos);
|
339
|
+
}
|
340
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
341
|
+
linePos += delimiterFollowingString.length();
|
342
|
+
return line.substring(valueStartPos, valueEndPos);
|
343
|
+
}
|
344
|
+
else {
|
345
|
+
// not a delimiter
|
346
|
+
}
|
347
|
+
}
|
348
|
+
if (isEndOfLine(c)) {
|
349
|
+
recordState = RecordState.END;
|
350
|
+
return line.substring(valueStartPos, valueEndPos);
|
351
|
+
}
|
352
|
+
else if (isSpace(c)) {
|
353
|
+
// keep LAST_TRIM_OR_VALUE state
|
354
|
+
} else {
|
355
|
+
// this spaces are not trailing spaces. go back to VALUE state
|
356
|
+
columnState = ColumnState.VALUE;
|
357
|
+
}
|
358
|
+
break;
|
359
|
+
|
360
|
+
case QUOTED_VALUE:
|
361
|
+
if (isEndOfLine(c)) {
|
362
|
+
// multi-line quoted value
|
363
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
364
|
+
quotedValue.append(newline);
|
365
|
+
quotedValueLines.add(line);
|
366
|
+
if (!nextLine(false)) {
|
367
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
368
|
+
}
|
369
|
+
valueStartPos = 0;
|
370
|
+
}
|
371
|
+
else if (isQuote(c)) {
|
372
|
+
char next = peekNextChar();
|
373
|
+
if (isQuote(next)) { // escaped quote
|
374
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
375
|
+
valueStartPos = ++linePos;
|
376
|
+
}
|
377
|
+
else {
|
378
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
379
|
+
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
380
|
+
}
|
381
|
+
}
|
382
|
+
else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
383
|
+
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
384
|
+
char next = peekNextChar();
|
385
|
+
if (isEndOfLine(c)) {
|
386
|
+
// escape end of line. TODO assuming multi-line quoted value without newline?
|
387
|
+
quotedValue.append(line.substring(valueStartPos, linePos));
|
388
|
+
quotedValueLines.add(line);
|
389
|
+
if (!nextLine(false)) {
|
390
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
391
|
+
}
|
392
|
+
valueStartPos = 0;
|
393
|
+
}
|
394
|
+
else if (isQuote(next) || isEscape(next)) { // escaped quote
|
395
|
+
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
396
|
+
quotedValue.append(next);
|
397
|
+
valueStartPos = ++linePos;
|
398
|
+
}
|
399
|
+
}
|
400
|
+
else {
|
401
|
+
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
402
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
|
403
|
+
}
|
404
|
+
// keep QUOTED_VALUE state
|
405
|
+
}
|
406
|
+
break;
|
407
|
+
|
408
|
+
case AFTER_QUOTED_VALUE:
|
409
|
+
if (isDelimiter(c)) {
|
410
|
+
if (delimiterFollowingString == null) {
|
411
|
+
return quotedValue.toString();
|
412
|
+
}
|
413
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
414
|
+
linePos += delimiterFollowingString.length();
|
415
|
+
return quotedValue.toString();
|
416
|
+
}
|
417
|
+
// not a delimiter
|
418
|
+
}
|
419
|
+
if (isEndOfLine(c)) {
|
420
|
+
recordState = RecordState.END;
|
421
|
+
return quotedValue.toString();
|
422
|
+
}
|
423
|
+
else if (isSpace(c)) {
|
424
|
+
// column has trailing spaces and quoted. TODO should this be rejected?
|
425
|
+
} else {
|
426
|
+
// I do not see a reason to reject record if stray quotes happen:
|
427
|
+
// ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS Accept stray quotes as-is in the field. Instead, it behaves undefined if delimiters are in fields. "a"b" goes a"b. "a""b" goes a"b.
|
428
|
+
// https://www.embulk.org/docs/built-in.html#csv-parser-plugin
|
429
|
+
// throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
|
430
|
+
Exec.getLogger(CsvTokenizer.class).warn(String.format("Unexpected extra character '%c' after a value quoted by '%c', Record= %s", c, quote, line));
|
431
|
+
|
432
|
+
}
|
433
|
+
break;
|
434
|
+
|
435
|
+
default:
|
436
|
+
assert false;
|
437
|
+
}
|
438
|
+
}
|
439
|
+
}
|
440
|
+
|
441
|
+
public String nextColumnOrNull()
|
442
|
+
{
|
443
|
+
String v = nextColumn();
|
444
|
+
if (nullStringOrNull == null) {
|
445
|
+
if (v.isEmpty()) {
|
446
|
+
if (wasQuotedColumn) {
|
447
|
+
return "";
|
448
|
+
}
|
449
|
+
else {
|
450
|
+
return null;
|
451
|
+
}
|
452
|
+
}
|
453
|
+
else {
|
454
|
+
return v;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
else {
|
458
|
+
if (v.equals(nullStringOrNull)) {
|
459
|
+
return null;
|
460
|
+
}
|
461
|
+
else {
|
462
|
+
return v;
|
463
|
+
}
|
464
|
+
}
|
465
|
+
}
|
466
|
+
|
467
|
+
public boolean wasQuotedColumn()
|
468
|
+
{
|
469
|
+
return wasQuotedColumn;
|
470
|
+
}
|
471
|
+
|
472
|
+
private char nextChar()
|
473
|
+
{
|
474
|
+
Preconditions.checkState(line != null, "nextColumn is called after end of file");
|
475
|
+
|
476
|
+
if (linePos >= line.length()) {
|
477
|
+
return END_OF_LINE;
|
478
|
+
}
|
479
|
+
else {
|
480
|
+
return line.charAt(linePos++);
|
481
|
+
}
|
482
|
+
}
|
483
|
+
|
484
|
+
private char peekNextChar()
|
485
|
+
{
|
486
|
+
Preconditions.checkState(line != null, "peekNextChar is called after end of file");
|
487
|
+
|
488
|
+
if (linePos >= line.length()) {
|
489
|
+
return END_OF_LINE;
|
490
|
+
}
|
491
|
+
else {
|
492
|
+
return line.charAt(linePos);
|
493
|
+
}
|
494
|
+
}
|
495
|
+
|
496
|
+
private boolean isSpace(char c)
|
497
|
+
{
|
498
|
+
return c == ' ';
|
499
|
+
}
|
500
|
+
|
501
|
+
private boolean isDelimiterFollowingFrom(int pos)
|
502
|
+
{
|
503
|
+
if (line.length() < pos + delimiterFollowingString.length()) {
|
504
|
+
return false;
|
505
|
+
}
|
506
|
+
for (int i = 0; i < delimiterFollowingString.length(); i++) {
|
507
|
+
if (delimiterFollowingString.charAt(i) != line.charAt(pos + i)) {
|
508
|
+
return false;
|
509
|
+
}
|
510
|
+
}
|
511
|
+
return true;
|
512
|
+
}
|
513
|
+
|
514
|
+
private boolean isDelimiter(char c)
|
515
|
+
{
|
516
|
+
return c == delimiterChar;
|
517
|
+
}
|
518
|
+
|
519
|
+
private boolean isEndOfLine(char c)
|
520
|
+
{
|
521
|
+
return c == END_OF_LINE;
|
522
|
+
}
|
523
|
+
|
524
|
+
private boolean isQuote(char c)
|
525
|
+
{
|
526
|
+
return quote != NO_QUOTE && c == quote;
|
527
|
+
}
|
528
|
+
|
529
|
+
private boolean isEscape(char c)
|
530
|
+
{
|
531
|
+
return escape != NO_ESCAPE && c == escape;
|
532
|
+
}
|
533
|
+
|
534
|
+
public static class InvalidFormatException
|
535
|
+
extends DataException
|
536
|
+
{
|
537
|
+
public InvalidFormatException(String message)
|
538
|
+
{
|
539
|
+
super(message);
|
540
|
+
}
|
541
|
+
}
|
542
|
+
|
543
|
+
public static class InvalidValueException
|
544
|
+
extends DataException
|
545
|
+
{
|
546
|
+
public InvalidValueException(String message)
|
547
|
+
{
|
548
|
+
super(message);
|
549
|
+
}
|
550
|
+
}
|
551
|
+
|
552
|
+
public static class QuotedSizeLimitExceededException
|
553
|
+
extends InvalidValueException
|
554
|
+
{
|
555
|
+
public QuotedSizeLimitExceededException(String message)
|
556
|
+
{
|
557
|
+
super(message);
|
558
|
+
}
|
559
|
+
}
|
560
|
+
|
561
|
+
public class TooManyColumnsException
|
562
|
+
extends InvalidFormatException
|
563
|
+
{
|
564
|
+
public TooManyColumnsException(String message)
|
565
|
+
{
|
566
|
+
super(message);
|
567
|
+
}
|
568
|
+
}
|
569
|
+
|
570
|
+
public class TooFewColumnsException
|
571
|
+
extends InvalidFormatException
|
572
|
+
{
|
573
|
+
public TooFewColumnsException(String message)
|
574
|
+
{
|
575
|
+
super(message);
|
576
|
+
}
|
577
|
+
}
|
578
|
+
|
579
|
+
public static class QuoteCharacter
|
580
|
+
{
|
581
|
+
private final char character;
|
582
|
+
|
583
|
+
public QuoteCharacter(char character)
|
584
|
+
{
|
585
|
+
this.character = character;
|
586
|
+
}
|
587
|
+
|
588
|
+
public static QuoteCharacter noQuote()
|
589
|
+
{
|
590
|
+
return new QuoteCharacter(CsvTokenizer.NO_QUOTE);
|
591
|
+
}
|
592
|
+
|
593
|
+
@JsonCreator
|
594
|
+
public static QuoteCharacter ofString(String str)
|
595
|
+
{
|
596
|
+
if (str.length() >= 2) {
|
597
|
+
throw new ConfigException("\"quote\" option accepts only 1 character.");
|
598
|
+
}
|
599
|
+
else if (str.isEmpty()) {
|
600
|
+
Exec.getLogger(CsvTokenizer.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly.");
|
601
|
+
return new QuoteCharacter('"');
|
602
|
+
}
|
603
|
+
else {
|
604
|
+
return new QuoteCharacter(str.charAt(0));
|
605
|
+
}
|
606
|
+
}
|
607
|
+
|
608
|
+
@JsonIgnore
|
609
|
+
public char getCharacter()
|
610
|
+
{
|
611
|
+
return character;
|
612
|
+
}
|
613
|
+
|
614
|
+
@JsonValue
|
615
|
+
public String getOptionalString()
|
616
|
+
{
|
617
|
+
return new String(new char[] { character });
|
618
|
+
}
|
619
|
+
|
620
|
+
@Override
|
621
|
+
public int hashCode()
|
622
|
+
{
|
623
|
+
final int prime = 31;
|
624
|
+
int result = 1;
|
625
|
+
result = prime * result + character;
|
626
|
+
return result;
|
627
|
+
}
|
628
|
+
|
629
|
+
@Override
|
630
|
+
public boolean equals(Object obj)
|
631
|
+
{
|
632
|
+
if (!(obj instanceof QuoteCharacter)) {
|
633
|
+
return false;
|
634
|
+
}
|
635
|
+
QuoteCharacter o = (QuoteCharacter) obj;
|
636
|
+
return character == o.character;
|
637
|
+
}
|
638
|
+
}
|
639
|
+
|
640
|
+
public static class EscapeCharacter
|
641
|
+
{
|
642
|
+
private final char character;
|
643
|
+
|
644
|
+
public EscapeCharacter(char character)
|
645
|
+
{
|
646
|
+
this.character = character;
|
647
|
+
}
|
648
|
+
|
649
|
+
public static EscapeCharacter noEscape()
|
650
|
+
{
|
651
|
+
return new EscapeCharacter(CsvTokenizer.NO_ESCAPE);
|
652
|
+
}
|
653
|
+
|
654
|
+
@JsonCreator
|
655
|
+
public static EscapeCharacter ofString(String str)
|
656
|
+
{
|
657
|
+
if (str.length() >= 2) {
|
658
|
+
throw new ConfigException("\"escape\" option accepts only 1 character.");
|
659
|
+
}
|
660
|
+
else if (str.isEmpty()) {
|
661
|
+
Exec.getLogger(CsvTokenizer.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly.");
|
662
|
+
return noEscape();
|
663
|
+
}
|
664
|
+
else {
|
665
|
+
return new EscapeCharacter(str.charAt(0));
|
666
|
+
}
|
667
|
+
}
|
668
|
+
|
669
|
+
@JsonIgnore
|
670
|
+
public char getCharacter()
|
671
|
+
{
|
672
|
+
return character;
|
673
|
+
}
|
674
|
+
|
675
|
+
@JsonValue
|
676
|
+
public String getOptionalString()
|
677
|
+
{
|
678
|
+
return new String(new char[] { character });
|
679
|
+
}
|
680
|
+
|
681
|
+
@Override
|
682
|
+
public boolean equals(Object obj)
|
683
|
+
{
|
684
|
+
if (!(obj instanceof EscapeCharacter)) {
|
685
|
+
return false;
|
686
|
+
}
|
687
|
+
EscapeCharacter o = (EscapeCharacter) obj;
|
688
|
+
return character == o.character;
|
689
|
+
}
|
690
|
+
|
691
|
+
@Override
|
692
|
+
public int hashCode()
|
693
|
+
{
|
694
|
+
final int prime = 31;
|
695
|
+
int result = 1;
|
696
|
+
result = prime * result + character;
|
697
|
+
return result;
|
698
|
+
}
|
699
|
+
}
|
700
|
+
}
|