embulk-parser-csv_guessable 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bf39985065e771f33ffee0a47f7866f4377ac933
4
- data.tar.gz: 521303beddd8bb591d4b8f619f4561846eb8c1ab
3
+ metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
4
+ data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
5
5
  SHA512:
6
- metadata.gz: 0cf15b12d61bafe9c8742797b88caf0a88f6982acf635643eb6baee109cc63d226992856efe3e8df21bc4579a54d1d419789c0c63c45ca53cda36bd03557d84c
7
- data.tar.gz: a3ab77844c86c449f322a67d5de57a47315c56c67f94a524feb6195a15b6c197335d9676ddb3a597a6cffd748fc3c7fb125cc789bbe8750b09ae5641f5cd5093
6
+ metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
7
+ data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Csv Guessable parser plugin for Embulk
2
- **embulk-parser-csv_gussable** guesses and parses csv which has schema in header.
2
+ **embulk-parser-csv_guessable** (runtime) guesses and parses a CSV file that has its schema in the header.
3
+ This plugin is useful when the target CSV schema changes frequently.
3
4
 
4
5
  It can also behave like the original csv parser when no **embulk-parser-csv_guessable**-specific configs are given.
5
6
 
@@ -12,11 +13,11 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
12
13
 
13
14
  - **schema_file**: filename which has schema. (string, default: `null`)
14
15
  - **schema_line**: schema line in header. (integer, default: `"1"`)
15
- - **columns**: Columns hint for guess (hash, default: `null`)
16
+ - **(TODO)columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is the same as the original csv parser's `"columns"`. (hash, default: `null`)
16
17
  - any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
17
18
 
18
19
  ## Example
19
- data/test.csv
20
+ test.csv
20
21
 
21
22
  ```csv
22
23
  id, title, description
@@ -35,16 +36,31 @@ in:
35
36
  schema_line: 1
36
37
  ```
37
38
 
39
+ (To explain)
40
+ In the case of the original csv parser:
41
+ config.yml
42
+ ```yaml
43
+ in:
44
+ type: any file input plugin type
45
+ parser:
46
+ type: csv
47
+ skip_header_lines: 1
48
+ columns:
49
+ - {name: id, type: string}
50
+ - {name: title, type: string}
51
+ - {name: description, type: string}
52
+ ```
53
+
38
54
  <!--
39
55
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
40
56
  -->
41
57
 
42
58
  ```
43
59
  $ embulk gem install embulk-parser-csv_guessable
60
+ ```
44
61
  <!--
45
62
  $ embulk guess -g csv_guessable config.yml -o guessed.yml
46
63
  -->
47
- ```
48
64
 
49
65
  ## Build
50
66
 
data/build.gradle CHANGED
@@ -13,14 +13,14 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.1"
16
+ version = "0.1.2"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
20
20
 
21
21
  dependencies {
22
- compile fileTree(dir: 'libs', include: '*.jar')
23
22
  compile "org.embulk:embulk-core:0.8.22"
23
+ compile "org.embulk:embulk-standards:0.8.22"
24
24
  compile "com.opencsv:opencsv:3.9"
25
25
  provided "org.embulk:embulk-core:0.8.22"
26
26
  testCompile "junit:junit:4.+"
data/gradlew.bat CHANGED
@@ -1,84 +1,84 @@
1
- @if "%DEBUG%" == "" @echo off
2
- @rem ##########################################################################
3
- @rem
4
- @rem Gradle startup script for Windows
5
- @rem
6
- @rem ##########################################################################
7
-
8
- @rem Set local scope for the variables with windows NT shell
9
- if "%OS%"=="Windows_NT" setlocal
10
-
11
- set DIRNAME=%~dp0
12
- if "%DIRNAME%" == "" set DIRNAME=.
13
- set APP_BASE_NAME=%~n0
14
- set APP_HOME=%DIRNAME%
15
-
16
- @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
- set DEFAULT_JVM_OPTS=
18
-
19
- @rem Find java.exe
20
- if defined JAVA_HOME goto findJavaFromJavaHome
21
-
22
- set JAVA_EXE=java.exe
23
- %JAVA_EXE% -version >NUL 2>&1
24
- if "%ERRORLEVEL%" == "0" goto init
25
-
26
- echo.
27
- echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
- echo.
29
- echo Please set the JAVA_HOME variable in your environment to match the
30
- echo location of your Java installation.
31
-
32
- goto fail
33
-
34
- :findJavaFromJavaHome
35
- set JAVA_HOME=%JAVA_HOME:"=%
36
- set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
-
38
- if exist "%JAVA_EXE%" goto init
39
-
40
- echo.
41
- echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
- echo.
43
- echo Please set the JAVA_HOME variable in your environment to match the
44
- echo location of your Java installation.
45
-
46
- goto fail
47
-
48
- :init
49
- @rem Get command-line arguments, handling Windows variants
50
-
51
- if not "%OS%" == "Windows_NT" goto win9xME_args
52
-
53
- :win9xME_args
54
- @rem Slurp the command line arguments.
55
- set CMD_LINE_ARGS=
56
- set _SKIP=2
57
-
58
- :win9xME_args_slurp
59
- if "x%~1" == "x" goto execute
60
-
61
- set CMD_LINE_ARGS=%*
62
-
63
- :execute
64
- @rem Setup the command line
65
-
66
- set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
-
68
- @rem Execute Gradle
69
- "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
-
71
- :end
72
- @rem End local scope for the variables with windows NT shell
73
- if "%ERRORLEVEL%"=="0" goto mainEnd
74
-
75
- :fail
76
- rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
- rem the _cmd.exe /c_ return code!
78
- if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
- exit /b 1
80
-
81
- :mainEnd
82
- if "%OS%"=="Windows_NT" endlocal
83
-
84
- :omega
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ set DIRNAME=%~dp0
12
+ if "%DIRNAME%" == "" set DIRNAME=.
13
+ set APP_BASE_NAME=%~n0
14
+ set APP_HOME=%DIRNAME%
15
+
16
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
+ set DEFAULT_JVM_OPTS=
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windows variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+
53
+ :win9xME_args
54
+ @rem Slurp the command line arguments.
55
+ set CMD_LINE_ARGS=
56
+ set _SKIP=2
57
+
58
+ :win9xME_args_slurp
59
+ if "x%~1" == "x" goto execute
60
+
61
+ set CMD_LINE_ARGS=%*
62
+
63
+ :execute
64
+ @rem Setup the command line
65
+
66
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
+
68
+ @rem Execute Gradle
69
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
+
71
+ :end
72
+ @rem End local scope for the variables with windows NT shell
73
+ if "%ERRORLEVEL%"=="0" goto mainEnd
74
+
75
+ :fail
76
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
+ rem the _cmd.exe /c_ return code!
78
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
+ exit /b 1
80
+
81
+ :mainEnd
82
+ if "%OS%"=="Windows_NT" endlocal
83
+
84
+ :omega
@@ -2,19 +2,10 @@ package org.embulk.parser.csv_guessable;
2
2
 
3
3
  import com.google.common.base.Optional;
4
4
  import com.google.common.collect.ImmutableSet;
5
- import java.io.BufferedReader;
6
5
  import com.opencsv.CSVReader; // TODO: use embulk's parser
7
- import java.io.IOException;
8
- import java.io.StringReader;
9
- import java.nio.charset.StandardCharsets;
10
- import java.nio.file.Files;
11
- import java.nio.file.Path;
12
- import java.util.ArrayList;
13
- import org.slf4j.Logger;
14
6
 
15
7
  import org.embulk.config.Config;
16
8
  import org.embulk.config.ConfigDefault;
17
- import org.embulk.config.ConfigDiff;
18
9
  import org.embulk.config.ConfigException;
19
10
  import org.embulk.config.ConfigSource;
20
11
  import org.embulk.config.Task;
@@ -25,22 +16,31 @@ import org.embulk.spi.ColumnVisitor;
25
16
  import org.embulk.spi.DataException;
26
17
  import org.embulk.spi.Exec;
27
18
  import org.embulk.spi.FileInput;
28
- import org.embulk.spi.json.JsonParser;
29
- import org.embulk.spi.json.JsonParseException;
30
19
  import org.embulk.spi.PageBuilder;
31
20
  import org.embulk.spi.PageOutput;
32
21
  import org.embulk.spi.ParserPlugin;
33
22
  import org.embulk.spi.Schema;
34
23
  import org.embulk.spi.SchemaConfig;
35
- import org.embulk.spi.time.TimestampParser;
24
+ import org.embulk.spi.json.JsonParseException;
25
+ import org.embulk.spi.json.JsonParser;
36
26
  import org.embulk.spi.time.TimestampParseException;
27
+ import org.embulk.spi.time.TimestampParser;
37
28
  import org.embulk.spi.type.Types;
38
29
  import org.embulk.spi.unit.LocalFile;
39
30
  import org.embulk.spi.util.LineDecoder;
40
31
  import org.embulk.spi.util.Timestamps;
41
-
42
32
  import org.embulk.standards.CsvParserPlugin;
43
33
 
34
+ import org.slf4j.Logger;
35
+
36
+ import java.io.BufferedReader;
37
+ import java.io.IOException;
38
+ import java.io.StringReader;
39
+ import java.nio.charset.StandardCharsets;
40
+ import java.nio.file.Files;
41
+ import java.nio.file.Path;
42
+ import java.util.ArrayList;
43
+
44
44
  public class CsvGuessableParserPlugin
45
45
  extends CsvParserPlugin
46
46
  {
@@ -136,7 +136,8 @@ public class CsvGuessableParserPlugin
136
136
  if (task.getHeaderLine().isPresent()) {
137
137
  // TODO: use 'columns' as hints for guess
138
138
  throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
139
- } else { /* guess from header */
139
+ }
140
+ else { /* guess from header */
140
141
  int schemaLine = task.getSchemaLine();
141
142
  task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
142
143
 
@@ -146,7 +147,8 @@ public class CsvGuessableParserPlugin
146
147
  log.debug(columns.toString());
147
148
  schemaConfig = new SchemaConfig(columns);
148
149
  }
149
- } else { /* embulk-parser-csv embulk */
150
+ }
151
+ else { /* embulk-parser-csv embulk */
150
152
  // backward compatibility
151
153
  if (task.getHeaderLine().isPresent()) {
152
154
  if (task.getSkipHeaderLines() > 0) {
@@ -154,7 +156,8 @@ public class CsvGuessableParserPlugin
154
156
  }
155
157
  if (task.getHeaderLine().get()) {
156
158
  task.setSkipHeaderLines(1);
157
- } else {
159
+ }
160
+ else {
158
161
  task.setSkipHeaderLines(0);
159
162
  }
160
163
  }
@@ -199,80 +202,96 @@ public class CsvGuessableParserPlugin
199
202
 
200
203
  try {
201
204
  schema.visitColumns(new ColumnVisitor() {
205
+ @Override
202
206
  public void booleanColumn(Column column)
203
207
  {
204
208
  String v = nextColumn();
205
209
  if (v == null) {
206
210
  pageBuilder.setNull(column);
207
- } else {
211
+ }
212
+ else {
208
213
  pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
209
214
  }
210
215
  }
211
216
 
217
+ @Override
212
218
  public void longColumn(Column column)
213
219
  {
214
220
  String v = nextColumn();
215
221
  if (v == null) {
216
222
  pageBuilder.setNull(column);
217
- } else {
223
+ }
224
+ else {
218
225
  try {
219
226
  pageBuilder.setLong(column, Long.parseLong(v));
220
- } catch (NumberFormatException e) {
227
+ }
228
+ catch (NumberFormatException e) {
221
229
  // TODO support default value
222
230
  throw new CsvRecordValidateException(e);
223
231
  }
224
232
  }
225
233
  }
226
234
 
235
+ @Override
227
236
  public void doubleColumn(Column column)
228
237
  {
229
238
  String v = nextColumn();
230
239
  if (v == null) {
231
240
  pageBuilder.setNull(column);
232
- } else {
241
+ }
242
+ else {
233
243
  try {
234
244
  pageBuilder.setDouble(column, Double.parseDouble(v));
235
- } catch (NumberFormatException e) {
245
+ }
246
+ catch (NumberFormatException e) {
236
247
  // TODO support default value
237
248
  throw new CsvRecordValidateException(e);
238
249
  }
239
250
  }
240
251
  }
241
252
 
253
+ @Override
242
254
  public void stringColumn(Column column)
243
255
  {
244
256
  String v = nextColumn();
245
257
  if (v == null) {
246
258
  pageBuilder.setNull(column);
247
- } else {
259
+ }
260
+ else {
248
261
  pageBuilder.setString(column, v);
249
262
  }
250
263
  }
251
264
 
265
+ @Override
252
266
  public void timestampColumn(Column column)
253
267
  {
254
268
  String v = nextColumn();
255
269
  if (v == null) {
256
270
  pageBuilder.setNull(column);
257
- } else {
271
+ }
272
+ else {
258
273
  try {
259
274
  // pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
260
- } catch (TimestampParseException e) {
275
+ }
276
+ catch (TimestampParseException e) {
261
277
  // TODO support default value
262
278
  throw new CsvRecordValidateException(e);
263
279
  }
264
280
  }
265
281
  }
266
282
 
283
+ @Override
267
284
  public void jsonColumn(Column column)
268
285
  {
269
286
  String v = nextColumn();
270
287
  if (v == null) {
271
288
  pageBuilder.setNull(column);
272
- } else {
289
+ }
290
+ else {
273
291
  try {
274
292
  pageBuilder.setJson(column, jsonParser.parse(v));
275
- } catch (JsonParseException e) {
293
+ }
294
+ catch (JsonParseException e) {
276
295
  // TODO support default value
277
296
  throw new CsvRecordValidateException(e);
278
297
  }
@@ -291,19 +310,21 @@ public class CsvGuessableParserPlugin
291
310
 
292
311
  try {
293
312
  hasNextRecord = tokenizer.nextRecord();
294
- } catch (CsvTokenizer.TooManyColumnsException ex) {
313
+ }
314
+ catch (CsvTokenizer.TooManyColumnsException ex) {
295
315
  if (allowExtraColumns) {
296
316
  String tooManyColumnsLine = tokenizer.skipCurrentLine();
297
317
  // TODO warning
298
318
  hasNextRecord = tokenizer.nextRecord();
299
- } else {
319
+ }
320
+ else {
300
321
  // this line will be skipped at the following catch section
301
322
  throw ex;
302
323
  }
303
324
  }
304
325
  pageBuilder.addRecord();
305
-
306
- } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
326
+ }
327
+ catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
307
328
  String skippedLine = tokenizer.skipCurrentLine();
308
329
  long lineNumber = tokenizer.getCurrentLineNumber();
309
330
  if (stopOnInvalidRecord) {
@@ -334,14 +355,15 @@ public class CsvGuessableParserPlugin
334
355
  }
335
356
  }
336
357
 
337
- private String readHeader(Path path, int schemaLine) {
358
+ private String readHeader(Path path, int schemaLine)
359
+ {
338
360
  if (schemaLine <= 0) {
339
361
  throw new ConfigException("'schemaLine' must be set '> 0'");
340
362
  }
341
363
 
342
364
  String line = null;
343
365
  try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
344
- for (int i=1; i <= schemaLine; ++i) {
366
+ for (int i = 1; i <= schemaLine; ++i) {
345
367
  line = br.readLine();
346
368
  if (line == null) {
347
369
  throw new ConfigException("not found 'schema_line' in 'schema_file'");
@@ -353,13 +375,14 @@ public class CsvGuessableParserPlugin
353
375
  return line;
354
376
  }
355
377
 
356
- private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config) {
357
- ArrayList columns = new ArrayList<ArrayList>();
378
+ private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
379
+ {
380
+ ArrayList columns = new ArrayList<ArrayList>();
358
381
  PluginTask task = config.loadConfig(PluginTask.class);
359
382
 
360
383
  try (CSVReader reader = new CSVReader(new StringReader(header))) {
361
384
  String[] csv = reader.readNext();
362
- for (String column: csv) {
385
+ for (String column : csv) {
363
386
  columns.add(new ColumnConfig(column, Types.STRING, config));
364
387
  }
365
388
  } catch (IOException e) {
@@ -1,13 +1,14 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
3
  import com.google.common.base.Preconditions;
4
- import java.util.List;
5
- import java.util.ArrayList;
6
- import java.util.Deque;
7
- import java.util.ArrayDeque;
4
+ import org.embulk.config.ConfigException;
8
5
  import org.embulk.spi.DataException;
9
6
  import org.embulk.spi.util.LineDecoder;
10
- import org.embulk.config.ConfigException;
7
+
8
+ import java.util.ArrayDeque;
9
+ import java.util.ArrayList;
10
+ import java.util.Deque;
11
+ import java.util.List;
11
12
 
12
13
  public class CsvTokenizer
13
14
  {
@@ -50,11 +51,13 @@ public class CsvTokenizer
50
51
  String delimiter = task.getDelimiter();
51
52
  if (delimiter.length() == 0) {
52
53
  throw new ConfigException("Empty delimiter is not allowed");
53
- } else {
54
+ }
55
+ else {
54
56
  this.delimiterChar = delimiter.charAt(0);
55
57
  if (delimiter.length() > 1) {
56
58
  delimiterFollowingString = delimiter.substring(1);
57
- } else {
59
+ }
60
+ else {
58
61
  delimiterFollowingString = null;
59
62
  }
60
63
  }
@@ -88,7 +91,8 @@ public class CsvTokenizer
88
91
  String skippedLine;
89
92
  if (quotedValueLines.isEmpty()) {
90
93
  skippedLine = line;
91
- } else {
94
+ }
95
+ else {
92
96
  // recover lines of quoted value
93
97
  skippedLine = quotedValueLines.remove(0); // TODO optimize performance
94
98
  unreadLines.addAll(quotedValueLines);
@@ -129,7 +133,8 @@ public class CsvTokenizer
129
133
  if (hasNext) {
130
134
  recordState = RecordState.NOT_END;
131
135
  return true;
132
- } else {
136
+ }
137
+ else {
133
138
  return false;
134
139
  }
135
140
  }
@@ -139,7 +144,8 @@ public class CsvTokenizer
139
144
  while (true) {
140
145
  if (!unreadLines.isEmpty()) {
141
146
  line = unreadLines.removeFirst();
142
- } else {
147
+ }
148
+ else {
143
149
  line = input.poll();
144
150
  if (line == null) {
145
151
  return false;
@@ -189,7 +195,8 @@ public class CsvTokenizer
189
195
  // empty value
190
196
  if (delimiterFollowingString == null) {
191
197
  return "";
192
- } else if (isDelimiterFollowingFrom(linePos)) {
198
+ }
199
+ else if (isDelimiterFollowingFrom(linePos)) {
193
200
  linePos += delimiterFollowingString.length();
194
201
  return "";
195
202
  }
@@ -199,17 +206,17 @@ public class CsvTokenizer
199
206
  // empty value
200
207
  recordState = RecordState.END;
201
208
  return "";
202
-
203
- } else if (isSpace(c) && trimIfNotQuoted) {
209
+ }
210
+ else if (isSpace(c) && trimIfNotQuoted) {
204
211
  columnState = ColumnState.FIRST_TRIM;
205
-
206
- } else if (isQuote(c)) {
212
+ }
213
+ else if (isQuote(c)) {
207
214
  valueStartPos = linePos; // == 1
208
215
  wasQuotedColumn = true;
209
216
  quotedValue = new StringBuilder();
210
217
  columnState = ColumnState.QUOTED_VALUE;
211
-
212
- } else {
218
+ }
219
+ else {
213
220
  columnState = ColumnState.VALUE;
214
221
  }
215
222
  break;
@@ -219,7 +226,8 @@ public class CsvTokenizer
219
226
  // empty value
220
227
  if (delimiterFollowingString == null) {
221
228
  return "";
222
- } else if (isDelimiterFollowingFrom(linePos)) {
229
+ }
230
+ else if (isDelimiterFollowingFrom(linePos)) {
223
231
  linePos += delimiterFollowingString.length();
224
232
  return "";
225
233
  }
@@ -229,18 +237,18 @@ public class CsvTokenizer
229
237
  // empty value
230
238
  recordState = RecordState.END;
231
239
  return "";
232
-
233
- } else if (isQuote(c)) {
240
+ }
241
+ else if (isQuote(c)) {
234
242
  // column has heading spaces and quoted. TODO should this be rejected?
235
243
  valueStartPos = linePos;
236
244
  wasQuotedColumn = true;
237
245
  quotedValue = new StringBuilder();
238
246
  columnState = ColumnState.QUOTED_VALUE;
239
-
240
- } else if (isSpace(c)) {
247
+ }
248
+ else if (isSpace(c)) {
241
249
  // skip this character
242
-
243
- } else {
250
+ }
251
+ else {
244
252
  valueStartPos = linePos - 1;
245
253
  columnState = ColumnState.VALUE;
246
254
  }
@@ -250,7 +258,8 @@ public class CsvTokenizer
250
258
  if (isDelimiter(c)) {
251
259
  if (delimiterFollowingString == null) {
252
260
  return line.substring(valueStartPos, linePos - 1);
253
- } else if (isDelimiterFollowingFrom(linePos)) {
261
+ }
262
+ else if (isDelimiterFollowingFrom(linePos)) {
254
263
  String value = line.substring(valueStartPos, linePos - 1);
255
264
  linePos += delimiterFollowingString.length();
256
265
  return value;
@@ -260,8 +269,8 @@ public class CsvTokenizer
260
269
  if (isEndOfLine(c)) {
261
270
  recordState = RecordState.END;
262
271
  return line.substring(valueStartPos, linePos);
263
-
264
- } else if (isSpace(c) && trimIfNotQuoted) {
272
+ }
273
+ else if (isSpace(c) && trimIfNotQuoted) {
265
274
  valueEndPos = linePos - 1; // this is possibly end of value
266
275
  columnState = ColumnState.LAST_TRIM_OR_VALUE;
267
276
 
@@ -270,8 +279,8 @@ public class CsvTokenizer
270
279
  // // In RFC4180, If fields are not enclosed with double quotes, then
271
280
  // // double quotes may not appear inside the fields. But they are often
272
281
  // // included in the fields. We should care about them later.
273
-
274
- } else {
282
+ }
283
+ else {
275
284
  // keep VALUE state
276
285
  }
277
286
  break;
@@ -280,21 +289,23 @@ public class CsvTokenizer
280
289
  if (isDelimiter(c)) {
281
290
  if (delimiterFollowingString == null) {
282
291
  return line.substring(valueStartPos, valueEndPos);
283
- } else if (isDelimiterFollowingFrom(linePos)) {
292
+ }
293
+ else if (isDelimiterFollowingFrom(linePos)) {
284
294
  linePos += delimiterFollowingString.length();
285
295
  return line.substring(valueStartPos, valueEndPos);
286
- } else {
296
+ }
297
+ else {
287
298
  // not a delimiter
288
299
  }
289
300
  }
290
301
  if (isEndOfLine(c)) {
291
302
  recordState = RecordState.END;
292
303
  return line.substring(valueStartPos, valueEndPos);
293
-
294
- } else if (isSpace(c)) {
304
+ }
305
+ else if (isSpace(c)) {
295
306
  // keep LAST_TRIM_OR_VALUE state
296
-
297
- } else {
307
+ }
308
+ else {
298
309
  // this spaces are not trailing spaces. go back to VALUE state
299
310
  columnState = ColumnState.VALUE;
300
311
  }
@@ -310,18 +321,19 @@ public class CsvTokenizer
310
321
  throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
311
322
  }
312
323
  valueStartPos = 0;
313
-
314
- } else if (isQuote(c)) {
324
+ }
325
+ else if (isQuote(c)) {
315
326
  char next = peekNextChar();
316
327
  if (isQuote(next)) { // escaped quote
317
328
  quotedValue.append(line.substring(valueStartPos, linePos));
318
329
  valueStartPos = ++linePos;
319
- } else {
330
+ }
331
+ else {
320
332
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
321
333
  columnState = ColumnState.AFTER_QUOTED_VALUE;
322
334
  }
323
-
324
- } else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
335
+ }
336
+ else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
325
337
  // In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
326
338
  char next = peekNextChar();
327
339
  if (isEndOfLine(c)) {
@@ -332,15 +344,16 @@ public class CsvTokenizer
332
344
  throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
333
345
  }
334
346
  valueStartPos = 0;
335
- } else if (isQuote(next) || isEscape(next)) { // escaped quote
347
+ }
348
+ else if (isQuote(next) || isEscape(next)) { // escaped quote
336
349
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
337
350
  quotedValue.append(next);
338
351
  valueStartPos = ++linePos;
339
352
  }
340
-
341
- } else {
353
+ }
354
+ else {
342
355
  if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
343
- throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
356
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
344
357
  }
345
358
  // keep QUOTED_VALUE state
346
359
  }
@@ -350,7 +363,8 @@ public class CsvTokenizer
350
363
  if (isDelimiter(c)) {
351
364
  if (delimiterFollowingString == null) {
352
365
  return quotedValue.toString();
353
- } else if (isDelimiterFollowingFrom(linePos)) {
366
+ }
367
+ else if (isDelimiterFollowingFrom(linePos)) {
354
368
  linePos += delimiterFollowingString.length();
355
369
  return quotedValue.toString();
356
370
  }
@@ -359,11 +373,11 @@ public class CsvTokenizer
359
373
  if (isEndOfLine(c)) {
360
374
  recordState = RecordState.END;
361
375
  return quotedValue.toString();
362
-
363
- } else if (isSpace(c)) {
376
+ }
377
+ else if (isSpace(c)) {
364
378
  // column has trailing spaces and quoted. TODO should this be rejected?
365
-
366
- } else {
379
+ }
380
+ else {
367
381
  throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
368
382
  }
369
383
  break;
@@ -411,7 +425,8 @@ public class CsvTokenizer
411
425
 
412
426
  if (linePos >= line.length()) {
413
427
  return END_OF_LINE;
414
- } else {
428
+ }
429
+ else {
415
430
  return line.charAt(linePos++);
416
431
  }
417
432
  }
@@ -422,7 +437,8 @@ public class CsvTokenizer
422
437
 
423
438
  if (linePos >= line.length()) {
424
439
  return END_OF_LINE;
425
- } else {
440
+ }
441
+ else {
426
442
  return line.charAt(linePos);
427
443
  }
428
444
  }
@@ -1,17 +1,16 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
+ import org.embulk.EmbulkTestRuntime;
3
4
  import org.embulk.config.ConfigException;
4
5
  import org.embulk.config.ConfigLoader;
5
6
  import org.embulk.config.ConfigSource;
6
- import org.embulk.EmbulkTestRuntime;
7
7
  import org.embulk.spi.Exec;
8
8
  import org.junit.Rule;
9
- import org.junit.rules.ExpectedException;
10
9
  import org.junit.Test;
11
-
12
- import static org.junit.Assert.assertFalse;
10
+ import org.junit.rules.ExpectedException;
13
11
 
14
12
  import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
13
+ import static org.junit.Assert.assertFalse;
15
14
 
16
15
  public class TestCsvGuessableParserPlugin
17
16
  {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-csv_guessable
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - koooge
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-01 00:00:00.000000000 Z
11
+ date: 2017-06-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -57,7 +57,6 @@ files:
57
57
  - gradlew.bat
58
58
  - lib/embulk/guess/csv_guessable.rb
59
59
  - lib/embulk/parser/csv_guessable.rb
60
- - libs/embulk-standards-0.8.22.jar
61
60
  - src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
62
61
  - src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
63
62
  - src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
@@ -67,9 +66,10 @@ files:
67
66
  - src/test/resources/yml/original-csv.yml
68
67
  - src/test/resources/yml/replace_column_name.yml
69
68
  - classpath/commons-lang3-3.5.jar
70
- - classpath/embulk-parser-csv_guessable-0.1.1.jar
71
69
  - classpath/opencsv-3.9.jar
72
70
  - classpath/commons-beanutils-1.9.3.jar
71
+ - classpath/commons-compress-1.10.jar
72
+ - classpath/embulk-parser-csv_guessable-0.1.2.jar
73
73
  - classpath/embulk-standards-0.8.22.jar
74
74
  - classpath/commons-collections-3.2.2.jar
75
75
  - classpath/commons-logging-1.2.jar
Binary file