embulk-parser-csv_guessable 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bf39985065e771f33ffee0a47f7866f4377ac933
4
- data.tar.gz: 521303beddd8bb591d4b8f619f4561846eb8c1ab
3
+ metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
4
+ data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
5
5
  SHA512:
6
- metadata.gz: 0cf15b12d61bafe9c8742797b88caf0a88f6982acf635643eb6baee109cc63d226992856efe3e8df21bc4579a54d1d419789c0c63c45ca53cda36bd03557d84c
7
- data.tar.gz: a3ab77844c86c449f322a67d5de57a47315c56c67f94a524feb6195a15b6c197335d9676ddb3a597a6cffd748fc3c7fb125cc789bbe8750b09ae5641f5cd5093
6
+ metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
7
+ data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Csv Guessable parser plugin for Embulk
2
- **embulk-parser-csv_gussable** guesses and parses csv which has schema in header.
2
+ **embulk-parser-csv_guessable** guesses the schema at runtime and parses CSV files that carry their schema in a header line.
3
+ This plugin is useful when the schema of the target CSV changes frequently.
3
4
 
4
5
  Also it can behave as original csv parser without **embulk-parser-csv_guessable** specified configs.
5
6
 
@@ -12,11 +13,11 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
12
13
 
13
14
  - **schema_file**: filename which has schema.(string, default: `null`)
14
15
  - **schema_line**: schema line in header. (integer default: `"1"`)
15
- - **columns**: Columns hint for guess (hash, default: `null`)
16
+ - **(TODO)columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is the same as the original csv parser's `"columns"`. (hash, default: `null`)
16
17
  - any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
17
18
 
18
19
  ## Example
19
- data/test.csv
20
+ test.csv
20
21
 
21
22
  ```csv
22
23
  id, title, description
@@ -35,16 +36,31 @@ in:
35
36
  schema_line: 1
36
37
  ```
37
38
 
39
+ (To explain)
40
+ For comparison, with the original csv parser the configuration would look like this:
41
+ config.yml
42
+ ```yaml
43
+ in:
44
+ type: any file input plugin type
45
+ parser:
46
+ type: csv
47
+ skip_header_lines: 1
48
+ columns:
49
+ - {name: id, type: string}
50
+ - {name: title, type: string}
51
+ - {name: description, type: string}
52
+ ```
53
+
38
54
  <!--
39
55
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
40
56
  -->
41
57
 
42
58
  ```
43
59
  $ embulk gem install embulk-parser-csv_guessable
60
+ ```
44
61
  <!--
45
62
  $ embulk guess -g csv_guessable config.yml -o guessed.yml
46
63
  -->
47
- ```
48
64
 
49
65
  ## Build
50
66
 
data/build.gradle CHANGED
@@ -13,14 +13,14 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.1"
16
+ version = "0.1.2"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
20
20
 
21
21
  dependencies {
22
- compile fileTree(dir: 'libs', include: '*.jar')
23
22
  compile "org.embulk:embulk-core:0.8.22"
23
+ compile "org.embulk:embulk-standards:0.8.22"
24
24
  compile "com.opencsv:opencsv:3.9"
25
25
  provided "org.embulk:embulk-core:0.8.22"
26
26
  testCompile "junit:junit:4.+"
data/gradlew.bat CHANGED
@@ -1,84 +1,84 @@
1
- @if "%DEBUG%" == "" @echo off
2
- @rem ##########################################################################
3
- @rem
4
- @rem Gradle startup script for Windows
5
- @rem
6
- @rem ##########################################################################
7
-
8
- @rem Set local scope for the variables with windows NT shell
9
- if "%OS%"=="Windows_NT" setlocal
10
-
11
- set DIRNAME=%~dp0
12
- if "%DIRNAME%" == "" set DIRNAME=.
13
- set APP_BASE_NAME=%~n0
14
- set APP_HOME=%DIRNAME%
15
-
16
- @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
- set DEFAULT_JVM_OPTS=
18
-
19
- @rem Find java.exe
20
- if defined JAVA_HOME goto findJavaFromJavaHome
21
-
22
- set JAVA_EXE=java.exe
23
- %JAVA_EXE% -version >NUL 2>&1
24
- if "%ERRORLEVEL%" == "0" goto init
25
-
26
- echo.
27
- echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
- echo.
29
- echo Please set the JAVA_HOME variable in your environment to match the
30
- echo location of your Java installation.
31
-
32
- goto fail
33
-
34
- :findJavaFromJavaHome
35
- set JAVA_HOME=%JAVA_HOME:"=%
36
- set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
-
38
- if exist "%JAVA_EXE%" goto init
39
-
40
- echo.
41
- echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
- echo.
43
- echo Please set the JAVA_HOME variable in your environment to match the
44
- echo location of your Java installation.
45
-
46
- goto fail
47
-
48
- :init
49
- @rem Get command-line arguments, handling Windows variants
50
-
51
- if not "%OS%" == "Windows_NT" goto win9xME_args
52
-
53
- :win9xME_args
54
- @rem Slurp the command line arguments.
55
- set CMD_LINE_ARGS=
56
- set _SKIP=2
57
-
58
- :win9xME_args_slurp
59
- if "x%~1" == "x" goto execute
60
-
61
- set CMD_LINE_ARGS=%*
62
-
63
- :execute
64
- @rem Setup the command line
65
-
66
- set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
-
68
- @rem Execute Gradle
69
- "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
-
71
- :end
72
- @rem End local scope for the variables with windows NT shell
73
- if "%ERRORLEVEL%"=="0" goto mainEnd
74
-
75
- :fail
76
- rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
- rem the _cmd.exe /c_ return code!
78
- if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
- exit /b 1
80
-
81
- :mainEnd
82
- if "%OS%"=="Windows_NT" endlocal
83
-
84
- :omega
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ set DIRNAME=%~dp0
12
+ if "%DIRNAME%" == "" set DIRNAME=.
13
+ set APP_BASE_NAME=%~n0
14
+ set APP_HOME=%DIRNAME%
15
+
16
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
+ set DEFAULT_JVM_OPTS=
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windows variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+
53
+ :win9xME_args
54
+ @rem Slurp the command line arguments.
55
+ set CMD_LINE_ARGS=
56
+ set _SKIP=2
57
+
58
+ :win9xME_args_slurp
59
+ if "x%~1" == "x" goto execute
60
+
61
+ set CMD_LINE_ARGS=%*
62
+
63
+ :execute
64
+ @rem Setup the command line
65
+
66
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
+
68
+ @rem Execute Gradle
69
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
+
71
+ :end
72
+ @rem End local scope for the variables with windows NT shell
73
+ if "%ERRORLEVEL%"=="0" goto mainEnd
74
+
75
+ :fail
76
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
+ rem the _cmd.exe /c_ return code!
78
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
+ exit /b 1
80
+
81
+ :mainEnd
82
+ if "%OS%"=="Windows_NT" endlocal
83
+
84
+ :omega
@@ -2,19 +2,10 @@ package org.embulk.parser.csv_guessable;
2
2
 
3
3
  import com.google.common.base.Optional;
4
4
  import com.google.common.collect.ImmutableSet;
5
- import java.io.BufferedReader;
6
5
  import com.opencsv.CSVReader; // TODO: use embulk's parser
7
- import java.io.IOException;
8
- import java.io.StringReader;
9
- import java.nio.charset.StandardCharsets;
10
- import java.nio.file.Files;
11
- import java.nio.file.Path;
12
- import java.util.ArrayList;
13
- import org.slf4j.Logger;
14
6
 
15
7
  import org.embulk.config.Config;
16
8
  import org.embulk.config.ConfigDefault;
17
- import org.embulk.config.ConfigDiff;
18
9
  import org.embulk.config.ConfigException;
19
10
  import org.embulk.config.ConfigSource;
20
11
  import org.embulk.config.Task;
@@ -25,22 +16,31 @@ import org.embulk.spi.ColumnVisitor;
25
16
  import org.embulk.spi.DataException;
26
17
  import org.embulk.spi.Exec;
27
18
  import org.embulk.spi.FileInput;
28
- import org.embulk.spi.json.JsonParser;
29
- import org.embulk.spi.json.JsonParseException;
30
19
  import org.embulk.spi.PageBuilder;
31
20
  import org.embulk.spi.PageOutput;
32
21
  import org.embulk.spi.ParserPlugin;
33
22
  import org.embulk.spi.Schema;
34
23
  import org.embulk.spi.SchemaConfig;
35
- import org.embulk.spi.time.TimestampParser;
24
+ import org.embulk.spi.json.JsonParseException;
25
+ import org.embulk.spi.json.JsonParser;
36
26
  import org.embulk.spi.time.TimestampParseException;
27
+ import org.embulk.spi.time.TimestampParser;
37
28
  import org.embulk.spi.type.Types;
38
29
  import org.embulk.spi.unit.LocalFile;
39
30
  import org.embulk.spi.util.LineDecoder;
40
31
  import org.embulk.spi.util.Timestamps;
41
-
42
32
  import org.embulk.standards.CsvParserPlugin;
43
33
 
34
+ import org.slf4j.Logger;
35
+
36
+ import java.io.BufferedReader;
37
+ import java.io.IOException;
38
+ import java.io.StringReader;
39
+ import java.nio.charset.StandardCharsets;
40
+ import java.nio.file.Files;
41
+ import java.nio.file.Path;
42
+ import java.util.ArrayList;
43
+
44
44
  public class CsvGuessableParserPlugin
45
45
  extends CsvParserPlugin
46
46
  {
@@ -136,7 +136,8 @@ public class CsvGuessableParserPlugin
136
136
  if (task.getHeaderLine().isPresent()) {
137
137
  // TODO: use 'columns' as hints for guess
138
138
  throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
139
- } else { /* guess from header */
139
+ }
140
+ else { /* guess from header */
140
141
  int schemaLine = task.getSchemaLine();
141
142
  task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
142
143
 
@@ -146,7 +147,8 @@ public class CsvGuessableParserPlugin
146
147
  log.debug(columns.toString());
147
148
  schemaConfig = new SchemaConfig(columns);
148
149
  }
149
- } else { /* embulk-parser-csv embulk */
150
+ }
151
+ else { /* embulk-parser-csv embulk */
150
152
  // backward compatibility
151
153
  if (task.getHeaderLine().isPresent()) {
152
154
  if (task.getSkipHeaderLines() > 0) {
@@ -154,7 +156,8 @@ public class CsvGuessableParserPlugin
154
156
  }
155
157
  if (task.getHeaderLine().get()) {
156
158
  task.setSkipHeaderLines(1);
157
- } else {
159
+ }
160
+ else {
158
161
  task.setSkipHeaderLines(0);
159
162
  }
160
163
  }
@@ -199,80 +202,96 @@ public class CsvGuessableParserPlugin
199
202
 
200
203
  try {
201
204
  schema.visitColumns(new ColumnVisitor() {
205
+ @Override
202
206
  public void booleanColumn(Column column)
203
207
  {
204
208
  String v = nextColumn();
205
209
  if (v == null) {
206
210
  pageBuilder.setNull(column);
207
- } else {
211
+ }
212
+ else {
208
213
  pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
209
214
  }
210
215
  }
211
216
 
217
+ @Override
212
218
  public void longColumn(Column column)
213
219
  {
214
220
  String v = nextColumn();
215
221
  if (v == null) {
216
222
  pageBuilder.setNull(column);
217
- } else {
223
+ }
224
+ else {
218
225
  try {
219
226
  pageBuilder.setLong(column, Long.parseLong(v));
220
- } catch (NumberFormatException e) {
227
+ }
228
+ catch (NumberFormatException e) {
221
229
  // TODO support default value
222
230
  throw new CsvRecordValidateException(e);
223
231
  }
224
232
  }
225
233
  }
226
234
 
235
+ @Override
227
236
  public void doubleColumn(Column column)
228
237
  {
229
238
  String v = nextColumn();
230
239
  if (v == null) {
231
240
  pageBuilder.setNull(column);
232
- } else {
241
+ }
242
+ else {
233
243
  try {
234
244
  pageBuilder.setDouble(column, Double.parseDouble(v));
235
- } catch (NumberFormatException e) {
245
+ }
246
+ catch (NumberFormatException e) {
236
247
  // TODO support default value
237
248
  throw new CsvRecordValidateException(e);
238
249
  }
239
250
  }
240
251
  }
241
252
 
253
+ @Override
242
254
  public void stringColumn(Column column)
243
255
  {
244
256
  String v = nextColumn();
245
257
  if (v == null) {
246
258
  pageBuilder.setNull(column);
247
- } else {
259
+ }
260
+ else {
248
261
  pageBuilder.setString(column, v);
249
262
  }
250
263
  }
251
264
 
265
+ @Override
252
266
  public void timestampColumn(Column column)
253
267
  {
254
268
  String v = nextColumn();
255
269
  if (v == null) {
256
270
  pageBuilder.setNull(column);
257
- } else {
271
+ }
272
+ else {
258
273
  try {
259
274
  // pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
260
- } catch (TimestampParseException e) {
275
+ }
276
+ catch (TimestampParseException e) {
261
277
  // TODO support default value
262
278
  throw new CsvRecordValidateException(e);
263
279
  }
264
280
  }
265
281
  }
266
282
 
283
+ @Override
267
284
  public void jsonColumn(Column column)
268
285
  {
269
286
  String v = nextColumn();
270
287
  if (v == null) {
271
288
  pageBuilder.setNull(column);
272
- } else {
289
+ }
290
+ else {
273
291
  try {
274
292
  pageBuilder.setJson(column, jsonParser.parse(v));
275
- } catch (JsonParseException e) {
293
+ }
294
+ catch (JsonParseException e) {
276
295
  // TODO support default value
277
296
  throw new CsvRecordValidateException(e);
278
297
  }
@@ -291,19 +310,21 @@ public class CsvGuessableParserPlugin
291
310
 
292
311
  try {
293
312
  hasNextRecord = tokenizer.nextRecord();
294
- } catch (CsvTokenizer.TooManyColumnsException ex) {
313
+ }
314
+ catch (CsvTokenizer.TooManyColumnsException ex) {
295
315
  if (allowExtraColumns) {
296
316
  String tooManyColumnsLine = tokenizer.skipCurrentLine();
297
317
  // TODO warning
298
318
  hasNextRecord = tokenizer.nextRecord();
299
- } else {
319
+ }
320
+ else {
300
321
  // this line will be skipped at the following catch section
301
322
  throw ex;
302
323
  }
303
324
  }
304
325
  pageBuilder.addRecord();
305
-
306
- } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
326
+ }
327
+ catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
307
328
  String skippedLine = tokenizer.skipCurrentLine();
308
329
  long lineNumber = tokenizer.getCurrentLineNumber();
309
330
  if (stopOnInvalidRecord) {
@@ -334,14 +355,15 @@ public class CsvGuessableParserPlugin
334
355
  }
335
356
  }
336
357
 
337
- private String readHeader(Path path, int schemaLine) {
358
+ private String readHeader(Path path, int schemaLine)
359
+ {
338
360
  if (schemaLine <= 0) {
339
361
  throw new ConfigException("'schemaLine' must be set '> 0'");
340
362
  }
341
363
 
342
364
  String line = null;
343
365
  try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
344
- for (int i=1; i <= schemaLine; ++i) {
366
+ for (int i = 1; i <= schemaLine; ++i) {
345
367
  line = br.readLine();
346
368
  if (line == null) {
347
369
  throw new ConfigException("not found 'schema_line' in 'schema_file'");
@@ -353,13 +375,14 @@ public class CsvGuessableParserPlugin
353
375
  return line;
354
376
  }
355
377
 
356
- private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config) {
357
- ArrayList columns = new ArrayList<ArrayList>();
378
+ private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
379
+ {
380
+ ArrayList columns = new ArrayList<ArrayList>();
358
381
  PluginTask task = config.loadConfig(PluginTask.class);
359
382
 
360
383
  try (CSVReader reader = new CSVReader(new StringReader(header))) {
361
384
  String[] csv = reader.readNext();
362
- for (String column: csv) {
385
+ for (String column : csv) {
363
386
  columns.add(new ColumnConfig(column, Types.STRING, config));
364
387
  }
365
388
  } catch (IOException e) {
@@ -1,13 +1,14 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
3
  import com.google.common.base.Preconditions;
4
- import java.util.List;
5
- import java.util.ArrayList;
6
- import java.util.Deque;
7
- import java.util.ArrayDeque;
4
+ import org.embulk.config.ConfigException;
8
5
  import org.embulk.spi.DataException;
9
6
  import org.embulk.spi.util.LineDecoder;
10
- import org.embulk.config.ConfigException;
7
+
8
+ import java.util.ArrayDeque;
9
+ import java.util.ArrayList;
10
+ import java.util.Deque;
11
+ import java.util.List;
11
12
 
12
13
  public class CsvTokenizer
13
14
  {
@@ -50,11 +51,13 @@ public class CsvTokenizer
50
51
  String delimiter = task.getDelimiter();
51
52
  if (delimiter.length() == 0) {
52
53
  throw new ConfigException("Empty delimiter is not allowed");
53
- } else {
54
+ }
55
+ else {
54
56
  this.delimiterChar = delimiter.charAt(0);
55
57
  if (delimiter.length() > 1) {
56
58
  delimiterFollowingString = delimiter.substring(1);
57
- } else {
59
+ }
60
+ else {
58
61
  delimiterFollowingString = null;
59
62
  }
60
63
  }
@@ -88,7 +91,8 @@ public class CsvTokenizer
88
91
  String skippedLine;
89
92
  if (quotedValueLines.isEmpty()) {
90
93
  skippedLine = line;
91
- } else {
94
+ }
95
+ else {
92
96
  // recover lines of quoted value
93
97
  skippedLine = quotedValueLines.remove(0); // TODO optimize performance
94
98
  unreadLines.addAll(quotedValueLines);
@@ -129,7 +133,8 @@ public class CsvTokenizer
129
133
  if (hasNext) {
130
134
  recordState = RecordState.NOT_END;
131
135
  return true;
132
- } else {
136
+ }
137
+ else {
133
138
  return false;
134
139
  }
135
140
  }
@@ -139,7 +144,8 @@ public class CsvTokenizer
139
144
  while (true) {
140
145
  if (!unreadLines.isEmpty()) {
141
146
  line = unreadLines.removeFirst();
142
- } else {
147
+ }
148
+ else {
143
149
  line = input.poll();
144
150
  if (line == null) {
145
151
  return false;
@@ -189,7 +195,8 @@ public class CsvTokenizer
189
195
  // empty value
190
196
  if (delimiterFollowingString == null) {
191
197
  return "";
192
- } else if (isDelimiterFollowingFrom(linePos)) {
198
+ }
199
+ else if (isDelimiterFollowingFrom(linePos)) {
193
200
  linePos += delimiterFollowingString.length();
194
201
  return "";
195
202
  }
@@ -199,17 +206,17 @@ public class CsvTokenizer
199
206
  // empty value
200
207
  recordState = RecordState.END;
201
208
  return "";
202
-
203
- } else if (isSpace(c) && trimIfNotQuoted) {
209
+ }
210
+ else if (isSpace(c) && trimIfNotQuoted) {
204
211
  columnState = ColumnState.FIRST_TRIM;
205
-
206
- } else if (isQuote(c)) {
212
+ }
213
+ else if (isQuote(c)) {
207
214
  valueStartPos = linePos; // == 1
208
215
  wasQuotedColumn = true;
209
216
  quotedValue = new StringBuilder();
210
217
  columnState = ColumnState.QUOTED_VALUE;
211
-
212
- } else {
218
+ }
219
+ else {
213
220
  columnState = ColumnState.VALUE;
214
221
  }
215
222
  break;
@@ -219,7 +226,8 @@ public class CsvTokenizer
219
226
  // empty value
220
227
  if (delimiterFollowingString == null) {
221
228
  return "";
222
- } else if (isDelimiterFollowingFrom(linePos)) {
229
+ }
230
+ else if (isDelimiterFollowingFrom(linePos)) {
223
231
  linePos += delimiterFollowingString.length();
224
232
  return "";
225
233
  }
@@ -229,18 +237,18 @@ public class CsvTokenizer
229
237
  // empty value
230
238
  recordState = RecordState.END;
231
239
  return "";
232
-
233
- } else if (isQuote(c)) {
240
+ }
241
+ else if (isQuote(c)) {
234
242
  // column has heading spaces and quoted. TODO should this be rejected?
235
243
  valueStartPos = linePos;
236
244
  wasQuotedColumn = true;
237
245
  quotedValue = new StringBuilder();
238
246
  columnState = ColumnState.QUOTED_VALUE;
239
-
240
- } else if (isSpace(c)) {
247
+ }
248
+ else if (isSpace(c)) {
241
249
  // skip this character
242
-
243
- } else {
250
+ }
251
+ else {
244
252
  valueStartPos = linePos - 1;
245
253
  columnState = ColumnState.VALUE;
246
254
  }
@@ -250,7 +258,8 @@ public class CsvTokenizer
250
258
  if (isDelimiter(c)) {
251
259
  if (delimiterFollowingString == null) {
252
260
  return line.substring(valueStartPos, linePos - 1);
253
- } else if (isDelimiterFollowingFrom(linePos)) {
261
+ }
262
+ else if (isDelimiterFollowingFrom(linePos)) {
254
263
  String value = line.substring(valueStartPos, linePos - 1);
255
264
  linePos += delimiterFollowingString.length();
256
265
  return value;
@@ -260,8 +269,8 @@ public class CsvTokenizer
260
269
  if (isEndOfLine(c)) {
261
270
  recordState = RecordState.END;
262
271
  return line.substring(valueStartPos, linePos);
263
-
264
- } else if (isSpace(c) && trimIfNotQuoted) {
272
+ }
273
+ else if (isSpace(c) && trimIfNotQuoted) {
265
274
  valueEndPos = linePos - 1; // this is possibly end of value
266
275
  columnState = ColumnState.LAST_TRIM_OR_VALUE;
267
276
 
@@ -270,8 +279,8 @@ public class CsvTokenizer
270
279
  // // In RFC4180, If fields are not enclosed with double quotes, then
271
280
  // // double quotes may not appear inside the fields. But they are often
272
281
  // // included in the fields. We should care about them later.
273
-
274
- } else {
282
+ }
283
+ else {
275
284
  // keep VALUE state
276
285
  }
277
286
  break;
@@ -280,21 +289,23 @@ public class CsvTokenizer
280
289
  if (isDelimiter(c)) {
281
290
  if (delimiterFollowingString == null) {
282
291
  return line.substring(valueStartPos, valueEndPos);
283
- } else if (isDelimiterFollowingFrom(linePos)) {
292
+ }
293
+ else if (isDelimiterFollowingFrom(linePos)) {
284
294
  linePos += delimiterFollowingString.length();
285
295
  return line.substring(valueStartPos, valueEndPos);
286
- } else {
296
+ }
297
+ else {
287
298
  // not a delimiter
288
299
  }
289
300
  }
290
301
  if (isEndOfLine(c)) {
291
302
  recordState = RecordState.END;
292
303
  return line.substring(valueStartPos, valueEndPos);
293
-
294
- } else if (isSpace(c)) {
304
+ }
305
+ else if (isSpace(c)) {
295
306
  // keep LAST_TRIM_OR_VALUE state
296
-
297
- } else {
307
+ }
308
+ else {
298
309
  // this spaces are not trailing spaces. go back to VALUE state
299
310
  columnState = ColumnState.VALUE;
300
311
  }
@@ -310,18 +321,19 @@ public class CsvTokenizer
310
321
  throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
311
322
  }
312
323
  valueStartPos = 0;
313
-
314
- } else if (isQuote(c)) {
324
+ }
325
+ else if (isQuote(c)) {
315
326
  char next = peekNextChar();
316
327
  if (isQuote(next)) { // escaped quote
317
328
  quotedValue.append(line.substring(valueStartPos, linePos));
318
329
  valueStartPos = ++linePos;
319
- } else {
330
+ }
331
+ else {
320
332
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
321
333
  columnState = ColumnState.AFTER_QUOTED_VALUE;
322
334
  }
323
-
324
- } else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
335
+ }
336
+ else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
325
337
  // In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
326
338
  char next = peekNextChar();
327
339
  if (isEndOfLine(c)) {
@@ -332,15 +344,16 @@ public class CsvTokenizer
332
344
  throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
333
345
  }
334
346
  valueStartPos = 0;
335
- } else if (isQuote(next) || isEscape(next)) { // escaped quote
347
+ }
348
+ else if (isQuote(next) || isEscape(next)) { // escaped quote
336
349
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
337
350
  quotedValue.append(next);
338
351
  valueStartPos = ++linePos;
339
352
  }
340
-
341
- } else {
353
+ }
354
+ else {
342
355
  if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
343
- throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
356
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
344
357
  }
345
358
  // keep QUOTED_VALUE state
346
359
  }
@@ -350,7 +363,8 @@ public class CsvTokenizer
350
363
  if (isDelimiter(c)) {
351
364
  if (delimiterFollowingString == null) {
352
365
  return quotedValue.toString();
353
- } else if (isDelimiterFollowingFrom(linePos)) {
366
+ }
367
+ else if (isDelimiterFollowingFrom(linePos)) {
354
368
  linePos += delimiterFollowingString.length();
355
369
  return quotedValue.toString();
356
370
  }
@@ -359,11 +373,11 @@ public class CsvTokenizer
359
373
  if (isEndOfLine(c)) {
360
374
  recordState = RecordState.END;
361
375
  return quotedValue.toString();
362
-
363
- } else if (isSpace(c)) {
376
+ }
377
+ else if (isSpace(c)) {
364
378
  // column has trailing spaces and quoted. TODO should this be rejected?
365
-
366
- } else {
379
+ }
380
+ else {
367
381
  throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
368
382
  }
369
383
  break;
@@ -411,7 +425,8 @@ public class CsvTokenizer
411
425
 
412
426
  if (linePos >= line.length()) {
413
427
  return END_OF_LINE;
414
- } else {
428
+ }
429
+ else {
415
430
  return line.charAt(linePos++);
416
431
  }
417
432
  }
@@ -422,7 +437,8 @@ public class CsvTokenizer
422
437
 
423
438
  if (linePos >= line.length()) {
424
439
  return END_OF_LINE;
425
- } else {
440
+ }
441
+ else {
426
442
  return line.charAt(linePos);
427
443
  }
428
444
  }
@@ -1,17 +1,16 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
+ import org.embulk.EmbulkTestRuntime;
3
4
  import org.embulk.config.ConfigException;
4
5
  import org.embulk.config.ConfigLoader;
5
6
  import org.embulk.config.ConfigSource;
6
- import org.embulk.EmbulkTestRuntime;
7
7
  import org.embulk.spi.Exec;
8
8
  import org.junit.Rule;
9
- import org.junit.rules.ExpectedException;
10
9
  import org.junit.Test;
11
-
12
- import static org.junit.Assert.assertFalse;
10
+ import org.junit.rules.ExpectedException;
13
11
 
14
12
  import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
13
+ import static org.junit.Assert.assertFalse;
15
14
 
16
15
  public class TestCsvGuessableParserPlugin
17
16
  {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-csv_guessable
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - koooge
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-01 00:00:00.000000000 Z
11
+ date: 2017-06-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -57,7 +57,6 @@ files:
57
57
  - gradlew.bat
58
58
  - lib/embulk/guess/csv_guessable.rb
59
59
  - lib/embulk/parser/csv_guessable.rb
60
- - libs/embulk-standards-0.8.22.jar
61
60
  - src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
62
61
  - src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
63
62
  - src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
@@ -67,9 +66,10 @@ files:
67
66
  - src/test/resources/yml/original-csv.yml
68
67
  - src/test/resources/yml/replace_column_name.yml
69
68
  - classpath/commons-lang3-3.5.jar
70
- - classpath/embulk-parser-csv_guessable-0.1.1.jar
71
69
  - classpath/opencsv-3.9.jar
72
70
  - classpath/commons-beanutils-1.9.3.jar
71
+ - classpath/commons-compress-1.10.jar
72
+ - classpath/embulk-parser-csv_guessable-0.1.2.jar
73
73
  - classpath/embulk-standards-0.8.22.jar
74
74
  - classpath/commons-collections-3.2.2.jar
75
75
  - classpath/commons-logging-1.2.jar
Binary file