embulk-parser-csv_guessable 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/build.gradle +2 -2
- data/gradlew.bat +84 -84
- data/src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java +58 -35
- data/src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java +67 -51
- data/src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java +3 -4
- metadata +4 -4
- data/libs/embulk-standards-0.8.22.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
|
4
|
+
data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
|
7
|
+
data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
|
data/README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Csv Guessable parser plugin for Embulk
|
2
|
-
**embulk-parser-csv_gussable** guesses and parses csv which has schema in header.
|
2
|
+
**embulk-parser-csv_guessable** (runtime) guesses and parses csv which has schema in header.
|
3
|
+
This plugin is useful when the target csv schema changes frequently.
|
3
4
|
|
4
5
|
Also it can behave as original csv parser without **embulk-parser-csv_guessable** specified configs.
|
5
6
|
|
@@ -12,11 +13,11 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
|
|
12
13
|
|
13
14
|
- **schema_file**: filename which has schema. (string, default: `null`)
|
14
15
|
- **schema_line**: schema line in header. (integer, default: `"1"`)
|
15
|
-
- **columns**: Columns
|
16
|
+
- **(TODO)columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is the same as the original csv parser's `"columns"`. (hash, default: `null`)
|
16
17
|
- any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
|
17
18
|
|
18
19
|
## Example
|
19
|
-
|
20
|
+
test.csv
|
20
21
|
|
21
22
|
```csv
|
22
23
|
id, title, description
|
@@ -35,16 +36,31 @@ in:
|
|
35
36
|
schema_line: 1
|
36
37
|
```
|
37
38
|
|
39
|
+
(To explain)
|
40
|
+
In case of the original csv parser
|
41
|
+
config.yml
|
42
|
+
```yaml
|
43
|
+
in:
|
44
|
+
type: any file input plugin type
|
45
|
+
parser:
|
46
|
+
type: csv
|
47
|
+
skip_header_lines: 1
|
48
|
+
columns:
|
49
|
+
- {name: id, type: string}
|
50
|
+
- {name: title, type: string}
|
51
|
+
- {name: description, type: string}
|
52
|
+
```
|
53
|
+
|
38
54
|
<!--
|
39
55
|
(If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
|
40
56
|
-->
|
41
57
|
|
42
58
|
```
|
43
59
|
$ embulk gem install embulk-parser-csv_guessable
|
60
|
+
```
|
44
61
|
<!--
|
45
62
|
$ embulk guess -g csv_guessable config.yml -o guessed.yml
|
46
63
|
-->
|
47
|
-
```
|
48
64
|
|
49
65
|
## Build
|
50
66
|
|
data/build.gradle
CHANGED
@@ -13,14 +13,14 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.2"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
20
20
|
|
21
21
|
dependencies {
|
22
|
-
compile fileTree(dir: 'libs', include: '*.jar')
|
23
22
|
compile "org.embulk:embulk-core:0.8.22"
|
23
|
+
compile "org.embulk:embulk-standards:0.8.22"
|
24
24
|
compile "com.opencsv:opencsv:3.9"
|
25
25
|
provided "org.embulk:embulk-core:0.8.22"
|
26
26
|
testCompile "junit:junit:4.+"
|
data/gradlew.bat
CHANGED
@@ -1,84 +1,84 @@
|
|
1
|
-
@if "%DEBUG%" == "" @echo off
|
2
|
-
@rem ##########################################################################
|
3
|
-
@rem
|
4
|
-
@rem Gradle startup script for Windows
|
5
|
-
@rem
|
6
|
-
@rem ##########################################################################
|
7
|
-
|
8
|
-
@rem Set local scope for the variables with windows NT shell
|
9
|
-
if "%OS%"=="Windows_NT" setlocal
|
10
|
-
|
11
|
-
set DIRNAME=%~dp0
|
12
|
-
if "%DIRNAME%" == "" set DIRNAME=.
|
13
|
-
set APP_BASE_NAME=%~n0
|
14
|
-
set APP_HOME=%DIRNAME%
|
15
|
-
|
16
|
-
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
-
set DEFAULT_JVM_OPTS=
|
18
|
-
|
19
|
-
@rem Find java.exe
|
20
|
-
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
|
-
|
22
|
-
set JAVA_EXE=java.exe
|
23
|
-
%JAVA_EXE% -version >NUL 2>&1
|
24
|
-
if "%ERRORLEVEL%" == "0" goto init
|
25
|
-
|
26
|
-
echo.
|
27
|
-
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
28
|
-
echo.
|
29
|
-
echo Please set the JAVA_HOME variable in your environment to match the
|
30
|
-
echo location of your Java installation.
|
31
|
-
|
32
|
-
goto fail
|
33
|
-
|
34
|
-
:findJavaFromJavaHome
|
35
|
-
set JAVA_HOME=%JAVA_HOME:"=%
|
36
|
-
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
37
|
-
|
38
|
-
if exist "%JAVA_EXE%" goto init
|
39
|
-
|
40
|
-
echo.
|
41
|
-
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
42
|
-
echo.
|
43
|
-
echo Please set the JAVA_HOME variable in your environment to match the
|
44
|
-
echo location of your Java installation.
|
45
|
-
|
46
|
-
goto fail
|
47
|
-
|
48
|
-
:init
|
49
|
-
@rem Get command-line arguments, handling Windows variants
|
50
|
-
|
51
|
-
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
-
|
53
|
-
:win9xME_args
|
54
|
-
@rem Slurp the command line arguments.
|
55
|
-
set CMD_LINE_ARGS=
|
56
|
-
set _SKIP=2
|
57
|
-
|
58
|
-
:win9xME_args_slurp
|
59
|
-
if "x%~1" == "x" goto execute
|
60
|
-
|
61
|
-
set CMD_LINE_ARGS=%*
|
62
|
-
|
63
|
-
:execute
|
64
|
-
@rem Setup the command line
|
65
|
-
|
66
|
-
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
67
|
-
|
68
|
-
@rem Execute Gradle
|
69
|
-
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
70
|
-
|
71
|
-
:end
|
72
|
-
@rem End local scope for the variables with windows NT shell
|
73
|
-
if "%ERRORLEVEL%"=="0" goto mainEnd
|
74
|
-
|
75
|
-
:fail
|
76
|
-
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
77
|
-
rem the _cmd.exe /c_ return code!
|
78
|
-
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
79
|
-
exit /b 1
|
80
|
-
|
81
|
-
:mainEnd
|
82
|
-
if "%OS%"=="Windows_NT" endlocal
|
83
|
-
|
84
|
-
:omega
|
1
|
+
@if "%DEBUG%" == "" @echo off
|
2
|
+
@rem ##########################################################################
|
3
|
+
@rem
|
4
|
+
@rem Gradle startup script for Windows
|
5
|
+
@rem
|
6
|
+
@rem ##########################################################################
|
7
|
+
|
8
|
+
@rem Set local scope for the variables with windows NT shell
|
9
|
+
if "%OS%"=="Windows_NT" setlocal
|
10
|
+
|
11
|
+
set DIRNAME=%~dp0
|
12
|
+
if "%DIRNAME%" == "" set DIRNAME=.
|
13
|
+
set APP_BASE_NAME=%~n0
|
14
|
+
set APP_HOME=%DIRNAME%
|
15
|
+
|
16
|
+
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
+
set DEFAULT_JVM_OPTS=
|
18
|
+
|
19
|
+
@rem Find java.exe
|
20
|
+
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
|
+
|
22
|
+
set JAVA_EXE=java.exe
|
23
|
+
%JAVA_EXE% -version >NUL 2>&1
|
24
|
+
if "%ERRORLEVEL%" == "0" goto init
|
25
|
+
|
26
|
+
echo.
|
27
|
+
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
28
|
+
echo.
|
29
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
30
|
+
echo location of your Java installation.
|
31
|
+
|
32
|
+
goto fail
|
33
|
+
|
34
|
+
:findJavaFromJavaHome
|
35
|
+
set JAVA_HOME=%JAVA_HOME:"=%
|
36
|
+
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
37
|
+
|
38
|
+
if exist "%JAVA_EXE%" goto init
|
39
|
+
|
40
|
+
echo.
|
41
|
+
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
42
|
+
echo.
|
43
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
44
|
+
echo location of your Java installation.
|
45
|
+
|
46
|
+
goto fail
|
47
|
+
|
48
|
+
:init
|
49
|
+
@rem Get command-line arguments, handling Windows variants
|
50
|
+
|
51
|
+
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
+
|
53
|
+
:win9xME_args
|
54
|
+
@rem Slurp the command line arguments.
|
55
|
+
set CMD_LINE_ARGS=
|
56
|
+
set _SKIP=2
|
57
|
+
|
58
|
+
:win9xME_args_slurp
|
59
|
+
if "x%~1" == "x" goto execute
|
60
|
+
|
61
|
+
set CMD_LINE_ARGS=%*
|
62
|
+
|
63
|
+
:execute
|
64
|
+
@rem Setup the command line
|
65
|
+
|
66
|
+
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
67
|
+
|
68
|
+
@rem Execute Gradle
|
69
|
+
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
70
|
+
|
71
|
+
:end
|
72
|
+
@rem End local scope for the variables with windows NT shell
|
73
|
+
if "%ERRORLEVEL%"=="0" goto mainEnd
|
74
|
+
|
75
|
+
:fail
|
76
|
+
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
77
|
+
rem the _cmd.exe /c_ return code!
|
78
|
+
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
79
|
+
exit /b 1
|
80
|
+
|
81
|
+
:mainEnd
|
82
|
+
if "%OS%"=="Windows_NT" endlocal
|
83
|
+
|
84
|
+
:omega
|
@@ -2,19 +2,10 @@ package org.embulk.parser.csv_guessable;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableSet;
|
5
|
-
import java.io.BufferedReader;
|
6
5
|
import com.opencsv.CSVReader; // TODO: use embulk's parser
|
7
|
-
import java.io.IOException;
|
8
|
-
import java.io.StringReader;
|
9
|
-
import java.nio.charset.StandardCharsets;
|
10
|
-
import java.nio.file.Files;
|
11
|
-
import java.nio.file.Path;
|
12
|
-
import java.util.ArrayList;
|
13
|
-
import org.slf4j.Logger;
|
14
6
|
|
15
7
|
import org.embulk.config.Config;
|
16
8
|
import org.embulk.config.ConfigDefault;
|
17
|
-
import org.embulk.config.ConfigDiff;
|
18
9
|
import org.embulk.config.ConfigException;
|
19
10
|
import org.embulk.config.ConfigSource;
|
20
11
|
import org.embulk.config.Task;
|
@@ -25,22 +16,31 @@ import org.embulk.spi.ColumnVisitor;
|
|
25
16
|
import org.embulk.spi.DataException;
|
26
17
|
import org.embulk.spi.Exec;
|
27
18
|
import org.embulk.spi.FileInput;
|
28
|
-
import org.embulk.spi.json.JsonParser;
|
29
|
-
import org.embulk.spi.json.JsonParseException;
|
30
19
|
import org.embulk.spi.PageBuilder;
|
31
20
|
import org.embulk.spi.PageOutput;
|
32
21
|
import org.embulk.spi.ParserPlugin;
|
33
22
|
import org.embulk.spi.Schema;
|
34
23
|
import org.embulk.spi.SchemaConfig;
|
35
|
-
import org.embulk.spi.
|
24
|
+
import org.embulk.spi.json.JsonParseException;
|
25
|
+
import org.embulk.spi.json.JsonParser;
|
36
26
|
import org.embulk.spi.time.TimestampParseException;
|
27
|
+
import org.embulk.spi.time.TimestampParser;
|
37
28
|
import org.embulk.spi.type.Types;
|
38
29
|
import org.embulk.spi.unit.LocalFile;
|
39
30
|
import org.embulk.spi.util.LineDecoder;
|
40
31
|
import org.embulk.spi.util.Timestamps;
|
41
|
-
|
42
32
|
import org.embulk.standards.CsvParserPlugin;
|
43
33
|
|
34
|
+
import org.slf4j.Logger;
|
35
|
+
|
36
|
+
import java.io.BufferedReader;
|
37
|
+
import java.io.IOException;
|
38
|
+
import java.io.StringReader;
|
39
|
+
import java.nio.charset.StandardCharsets;
|
40
|
+
import java.nio.file.Files;
|
41
|
+
import java.nio.file.Path;
|
42
|
+
import java.util.ArrayList;
|
43
|
+
|
44
44
|
public class CsvGuessableParserPlugin
|
45
45
|
extends CsvParserPlugin
|
46
46
|
{
|
@@ -136,7 +136,8 @@ public class CsvGuessableParserPlugin
|
|
136
136
|
if (task.getHeaderLine().isPresent()) {
|
137
137
|
// TODO: use 'columns' as hints for guess
|
138
138
|
throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
|
139
|
-
}
|
139
|
+
}
|
140
|
+
else { /* guess from header */
|
140
141
|
int schemaLine = task.getSchemaLine();
|
141
142
|
task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
|
142
143
|
|
@@ -146,7 +147,8 @@ public class CsvGuessableParserPlugin
|
|
146
147
|
log.debug(columns.toString());
|
147
148
|
schemaConfig = new SchemaConfig(columns);
|
148
149
|
}
|
149
|
-
}
|
150
|
+
}
|
151
|
+
else { /* embulk-parser-csv embulk */
|
150
152
|
// backward compatibility
|
151
153
|
if (task.getHeaderLine().isPresent()) {
|
152
154
|
if (task.getSkipHeaderLines() > 0) {
|
@@ -154,7 +156,8 @@ public class CsvGuessableParserPlugin
|
|
154
156
|
}
|
155
157
|
if (task.getHeaderLine().get()) {
|
156
158
|
task.setSkipHeaderLines(1);
|
157
|
-
}
|
159
|
+
}
|
160
|
+
else {
|
158
161
|
task.setSkipHeaderLines(0);
|
159
162
|
}
|
160
163
|
}
|
@@ -199,80 +202,96 @@ public class CsvGuessableParserPlugin
|
|
199
202
|
|
200
203
|
try {
|
201
204
|
schema.visitColumns(new ColumnVisitor() {
|
205
|
+
@Override
|
202
206
|
public void booleanColumn(Column column)
|
203
207
|
{
|
204
208
|
String v = nextColumn();
|
205
209
|
if (v == null) {
|
206
210
|
pageBuilder.setNull(column);
|
207
|
-
}
|
211
|
+
}
|
212
|
+
else {
|
208
213
|
pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
|
209
214
|
}
|
210
215
|
}
|
211
216
|
|
217
|
+
@Override
|
212
218
|
public void longColumn(Column column)
|
213
219
|
{
|
214
220
|
String v = nextColumn();
|
215
221
|
if (v == null) {
|
216
222
|
pageBuilder.setNull(column);
|
217
|
-
}
|
223
|
+
}
|
224
|
+
else {
|
218
225
|
try {
|
219
226
|
pageBuilder.setLong(column, Long.parseLong(v));
|
220
|
-
}
|
227
|
+
}
|
228
|
+
catch (NumberFormatException e) {
|
221
229
|
// TODO support default value
|
222
230
|
throw new CsvRecordValidateException(e);
|
223
231
|
}
|
224
232
|
}
|
225
233
|
}
|
226
234
|
|
235
|
+
@Override
|
227
236
|
public void doubleColumn(Column column)
|
228
237
|
{
|
229
238
|
String v = nextColumn();
|
230
239
|
if (v == null) {
|
231
240
|
pageBuilder.setNull(column);
|
232
|
-
}
|
241
|
+
}
|
242
|
+
else {
|
233
243
|
try {
|
234
244
|
pageBuilder.setDouble(column, Double.parseDouble(v));
|
235
|
-
}
|
245
|
+
}
|
246
|
+
catch (NumberFormatException e) {
|
236
247
|
// TODO support default value
|
237
248
|
throw new CsvRecordValidateException(e);
|
238
249
|
}
|
239
250
|
}
|
240
251
|
}
|
241
252
|
|
253
|
+
@Override
|
242
254
|
public void stringColumn(Column column)
|
243
255
|
{
|
244
256
|
String v = nextColumn();
|
245
257
|
if (v == null) {
|
246
258
|
pageBuilder.setNull(column);
|
247
|
-
}
|
259
|
+
}
|
260
|
+
else {
|
248
261
|
pageBuilder.setString(column, v);
|
249
262
|
}
|
250
263
|
}
|
251
264
|
|
265
|
+
@Override
|
252
266
|
public void timestampColumn(Column column)
|
253
267
|
{
|
254
268
|
String v = nextColumn();
|
255
269
|
if (v == null) {
|
256
270
|
pageBuilder.setNull(column);
|
257
|
-
}
|
271
|
+
}
|
272
|
+
else {
|
258
273
|
try {
|
259
274
|
// pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
|
260
|
-
}
|
275
|
+
}
|
276
|
+
catch (TimestampParseException e) {
|
261
277
|
// TODO support default value
|
262
278
|
throw new CsvRecordValidateException(e);
|
263
279
|
}
|
264
280
|
}
|
265
281
|
}
|
266
282
|
|
283
|
+
@Override
|
267
284
|
public void jsonColumn(Column column)
|
268
285
|
{
|
269
286
|
String v = nextColumn();
|
270
287
|
if (v == null) {
|
271
288
|
pageBuilder.setNull(column);
|
272
|
-
}
|
289
|
+
}
|
290
|
+
else {
|
273
291
|
try {
|
274
292
|
pageBuilder.setJson(column, jsonParser.parse(v));
|
275
|
-
}
|
293
|
+
}
|
294
|
+
catch (JsonParseException e) {
|
276
295
|
// TODO support default value
|
277
296
|
throw new CsvRecordValidateException(e);
|
278
297
|
}
|
@@ -291,19 +310,21 @@ public class CsvGuessableParserPlugin
|
|
291
310
|
|
292
311
|
try {
|
293
312
|
hasNextRecord = tokenizer.nextRecord();
|
294
|
-
}
|
313
|
+
}
|
314
|
+
catch (CsvTokenizer.TooManyColumnsException ex) {
|
295
315
|
if (allowExtraColumns) {
|
296
316
|
String tooManyColumnsLine = tokenizer.skipCurrentLine();
|
297
317
|
// TODO warning
|
298
318
|
hasNextRecord = tokenizer.nextRecord();
|
299
|
-
}
|
319
|
+
}
|
320
|
+
else {
|
300
321
|
// this line will be skipped at the following catch section
|
301
322
|
throw ex;
|
302
323
|
}
|
303
324
|
}
|
304
325
|
pageBuilder.addRecord();
|
305
|
-
|
306
|
-
|
326
|
+
}
|
327
|
+
catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
|
307
328
|
String skippedLine = tokenizer.skipCurrentLine();
|
308
329
|
long lineNumber = tokenizer.getCurrentLineNumber();
|
309
330
|
if (stopOnInvalidRecord) {
|
@@ -334,14 +355,15 @@ public class CsvGuessableParserPlugin
|
|
334
355
|
}
|
335
356
|
}
|
336
357
|
|
337
|
-
private String readHeader(Path path, int schemaLine)
|
358
|
+
private String readHeader(Path path, int schemaLine)
|
359
|
+
{
|
338
360
|
if (schemaLine <= 0) {
|
339
361
|
throw new ConfigException("'schemaLine' must be set '> 0'");
|
340
362
|
}
|
341
363
|
|
342
364
|
String line = null;
|
343
365
|
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
344
|
-
for (int i=1; i <= schemaLine; ++i) {
|
366
|
+
for (int i = 1; i <= schemaLine; ++i) {
|
345
367
|
line = br.readLine();
|
346
368
|
if (line == null) {
|
347
369
|
throw new ConfigException("not found 'schema_line' in 'schema_file'");
|
@@ -353,13 +375,14 @@ public class CsvGuessableParserPlugin
|
|
353
375
|
return line;
|
354
376
|
}
|
355
377
|
|
356
|
-
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
357
|
-
|
378
|
+
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
379
|
+
{
|
380
|
+
ArrayList columns = new ArrayList<ArrayList>();
|
358
381
|
PluginTask task = config.loadConfig(PluginTask.class);
|
359
382
|
|
360
383
|
try (CSVReader reader = new CSVReader(new StringReader(header))) {
|
361
384
|
String[] csv = reader.readNext();
|
362
|
-
for (String column: csv) {
|
385
|
+
for (String column : csv) {
|
363
386
|
columns.add(new ColumnConfig(column, Types.STRING, config));
|
364
387
|
}
|
365
388
|
} catch (IOException e) {
|
@@ -1,13 +1,14 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
3
|
import com.google.common.base.Preconditions;
|
4
|
-
import
|
5
|
-
import java.util.ArrayList;
|
6
|
-
import java.util.Deque;
|
7
|
-
import java.util.ArrayDeque;
|
4
|
+
import org.embulk.config.ConfigException;
|
8
5
|
import org.embulk.spi.DataException;
|
9
6
|
import org.embulk.spi.util.LineDecoder;
|
10
|
-
|
7
|
+
|
8
|
+
import java.util.ArrayDeque;
|
9
|
+
import java.util.ArrayList;
|
10
|
+
import java.util.Deque;
|
11
|
+
import java.util.List;
|
11
12
|
|
12
13
|
public class CsvTokenizer
|
13
14
|
{
|
@@ -50,11 +51,13 @@ public class CsvTokenizer
|
|
50
51
|
String delimiter = task.getDelimiter();
|
51
52
|
if (delimiter.length() == 0) {
|
52
53
|
throw new ConfigException("Empty delimiter is not allowed");
|
53
|
-
}
|
54
|
+
}
|
55
|
+
else {
|
54
56
|
this.delimiterChar = delimiter.charAt(0);
|
55
57
|
if (delimiter.length() > 1) {
|
56
58
|
delimiterFollowingString = delimiter.substring(1);
|
57
|
-
}
|
59
|
+
}
|
60
|
+
else {
|
58
61
|
delimiterFollowingString = null;
|
59
62
|
}
|
60
63
|
}
|
@@ -88,7 +91,8 @@ public class CsvTokenizer
|
|
88
91
|
String skippedLine;
|
89
92
|
if (quotedValueLines.isEmpty()) {
|
90
93
|
skippedLine = line;
|
91
|
-
}
|
94
|
+
}
|
95
|
+
else {
|
92
96
|
// recover lines of quoted value
|
93
97
|
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
94
98
|
unreadLines.addAll(quotedValueLines);
|
@@ -129,7 +133,8 @@ public class CsvTokenizer
|
|
129
133
|
if (hasNext) {
|
130
134
|
recordState = RecordState.NOT_END;
|
131
135
|
return true;
|
132
|
-
}
|
136
|
+
}
|
137
|
+
else {
|
133
138
|
return false;
|
134
139
|
}
|
135
140
|
}
|
@@ -139,7 +144,8 @@ public class CsvTokenizer
|
|
139
144
|
while (true) {
|
140
145
|
if (!unreadLines.isEmpty()) {
|
141
146
|
line = unreadLines.removeFirst();
|
142
|
-
}
|
147
|
+
}
|
148
|
+
else {
|
143
149
|
line = input.poll();
|
144
150
|
if (line == null) {
|
145
151
|
return false;
|
@@ -189,7 +195,8 @@ public class CsvTokenizer
|
|
189
195
|
// empty value
|
190
196
|
if (delimiterFollowingString == null) {
|
191
197
|
return "";
|
192
|
-
}
|
198
|
+
}
|
199
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
193
200
|
linePos += delimiterFollowingString.length();
|
194
201
|
return "";
|
195
202
|
}
|
@@ -199,17 +206,17 @@ public class CsvTokenizer
|
|
199
206
|
// empty value
|
200
207
|
recordState = RecordState.END;
|
201
208
|
return "";
|
202
|
-
|
203
|
-
|
209
|
+
}
|
210
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
204
211
|
columnState = ColumnState.FIRST_TRIM;
|
205
|
-
|
206
|
-
|
212
|
+
}
|
213
|
+
else if (isQuote(c)) {
|
207
214
|
valueStartPos = linePos; // == 1
|
208
215
|
wasQuotedColumn = true;
|
209
216
|
quotedValue = new StringBuilder();
|
210
217
|
columnState = ColumnState.QUOTED_VALUE;
|
211
|
-
|
212
|
-
|
218
|
+
}
|
219
|
+
else {
|
213
220
|
columnState = ColumnState.VALUE;
|
214
221
|
}
|
215
222
|
break;
|
@@ -219,7 +226,8 @@ public class CsvTokenizer
|
|
219
226
|
// empty value
|
220
227
|
if (delimiterFollowingString == null) {
|
221
228
|
return "";
|
222
|
-
}
|
229
|
+
}
|
230
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
223
231
|
linePos += delimiterFollowingString.length();
|
224
232
|
return "";
|
225
233
|
}
|
@@ -229,18 +237,18 @@ public class CsvTokenizer
|
|
229
237
|
// empty value
|
230
238
|
recordState = RecordState.END;
|
231
239
|
return "";
|
232
|
-
|
233
|
-
|
240
|
+
}
|
241
|
+
else if (isQuote(c)) {
|
234
242
|
// column has heading spaces and quoted. TODO should this be rejected?
|
235
243
|
valueStartPos = linePos;
|
236
244
|
wasQuotedColumn = true;
|
237
245
|
quotedValue = new StringBuilder();
|
238
246
|
columnState = ColumnState.QUOTED_VALUE;
|
239
|
-
|
240
|
-
|
247
|
+
}
|
248
|
+
else if (isSpace(c)) {
|
241
249
|
// skip this character
|
242
|
-
|
243
|
-
|
250
|
+
}
|
251
|
+
else {
|
244
252
|
valueStartPos = linePos - 1;
|
245
253
|
columnState = ColumnState.VALUE;
|
246
254
|
}
|
@@ -250,7 +258,8 @@ public class CsvTokenizer
|
|
250
258
|
if (isDelimiter(c)) {
|
251
259
|
if (delimiterFollowingString == null) {
|
252
260
|
return line.substring(valueStartPos, linePos - 1);
|
253
|
-
}
|
261
|
+
}
|
262
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
254
263
|
String value = line.substring(valueStartPos, linePos - 1);
|
255
264
|
linePos += delimiterFollowingString.length();
|
256
265
|
return value;
|
@@ -260,8 +269,8 @@ public class CsvTokenizer
|
|
260
269
|
if (isEndOfLine(c)) {
|
261
270
|
recordState = RecordState.END;
|
262
271
|
return line.substring(valueStartPos, linePos);
|
263
|
-
|
264
|
-
|
272
|
+
}
|
273
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
265
274
|
valueEndPos = linePos - 1; // this is possibly end of value
|
266
275
|
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
267
276
|
|
@@ -270,8 +279,8 @@ public class CsvTokenizer
|
|
270
279
|
// // In RFC4180, If fields are not enclosed with double quotes, then
|
271
280
|
// // double quotes may not appear inside the fields. But they are often
|
272
281
|
// // included in the fields. We should care about them later.
|
273
|
-
|
274
|
-
|
282
|
+
}
|
283
|
+
else {
|
275
284
|
// keep VALUE state
|
276
285
|
}
|
277
286
|
break;
|
@@ -280,21 +289,23 @@ public class CsvTokenizer
|
|
280
289
|
if (isDelimiter(c)) {
|
281
290
|
if (delimiterFollowingString == null) {
|
282
291
|
return line.substring(valueStartPos, valueEndPos);
|
283
|
-
}
|
292
|
+
}
|
293
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
284
294
|
linePos += delimiterFollowingString.length();
|
285
295
|
return line.substring(valueStartPos, valueEndPos);
|
286
|
-
}
|
296
|
+
}
|
297
|
+
else {
|
287
298
|
// not a delimiter
|
288
299
|
}
|
289
300
|
}
|
290
301
|
if (isEndOfLine(c)) {
|
291
302
|
recordState = RecordState.END;
|
292
303
|
return line.substring(valueStartPos, valueEndPos);
|
293
|
-
|
294
|
-
|
304
|
+
}
|
305
|
+
else if (isSpace(c)) {
|
295
306
|
// keep LAST_TRIM_OR_VALUE state
|
296
|
-
|
297
|
-
|
307
|
+
}
|
308
|
+
else {
|
298
309
|
// this spaces are not trailing spaces. go back to VALUE state
|
299
310
|
columnState = ColumnState.VALUE;
|
300
311
|
}
|
@@ -310,18 +321,19 @@ public class CsvTokenizer
|
|
310
321
|
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
311
322
|
}
|
312
323
|
valueStartPos = 0;
|
313
|
-
|
314
|
-
|
324
|
+
}
|
325
|
+
else if (isQuote(c)) {
|
315
326
|
char next = peekNextChar();
|
316
327
|
if (isQuote(next)) { // escaped quote
|
317
328
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
318
329
|
valueStartPos = ++linePos;
|
319
|
-
}
|
330
|
+
}
|
331
|
+
else {
|
320
332
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
321
333
|
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
322
334
|
}
|
323
|
-
|
324
|
-
|
335
|
+
}
|
336
|
+
else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
325
337
|
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
326
338
|
char next = peekNextChar();
|
327
339
|
if (isEndOfLine(c)) {
|
@@ -332,15 +344,16 @@ public class CsvTokenizer
|
|
332
344
|
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
333
345
|
}
|
334
346
|
valueStartPos = 0;
|
335
|
-
}
|
347
|
+
}
|
348
|
+
else if (isQuote(next) || isEscape(next)) { // escaped quote
|
336
349
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
337
350
|
quotedValue.append(next);
|
338
351
|
valueStartPos = ++linePos;
|
339
352
|
}
|
340
|
-
|
341
|
-
|
353
|
+
}
|
354
|
+
else {
|
342
355
|
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
343
|
-
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
|
356
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
|
344
357
|
}
|
345
358
|
// keep QUOTED_VALUE state
|
346
359
|
}
|
@@ -350,7 +363,8 @@ public class CsvTokenizer
|
|
350
363
|
if (isDelimiter(c)) {
|
351
364
|
if (delimiterFollowingString == null) {
|
352
365
|
return quotedValue.toString();
|
353
|
-
}
|
366
|
+
}
|
367
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
354
368
|
linePos += delimiterFollowingString.length();
|
355
369
|
return quotedValue.toString();
|
356
370
|
}
|
@@ -359,11 +373,11 @@ public class CsvTokenizer
|
|
359
373
|
if (isEndOfLine(c)) {
|
360
374
|
recordState = RecordState.END;
|
361
375
|
return quotedValue.toString();
|
362
|
-
|
363
|
-
|
376
|
+
}
|
377
|
+
else if (isSpace(c)) {
|
364
378
|
// column has trailing spaces and quoted. TODO should this be rejected?
|
365
|
-
|
366
|
-
|
379
|
+
}
|
380
|
+
else {
|
367
381
|
throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
|
368
382
|
}
|
369
383
|
break;
|
@@ -411,7 +425,8 @@ public class CsvTokenizer
|
|
411
425
|
|
412
426
|
if (linePos >= line.length()) {
|
413
427
|
return END_OF_LINE;
|
414
|
-
}
|
428
|
+
}
|
429
|
+
else {
|
415
430
|
return line.charAt(linePos++);
|
416
431
|
}
|
417
432
|
}
|
@@ -422,7 +437,8 @@ public class CsvTokenizer
|
|
422
437
|
|
423
438
|
if (linePos >= line.length()) {
|
424
439
|
return END_OF_LINE;
|
425
|
-
}
|
440
|
+
}
|
441
|
+
else {
|
426
442
|
return line.charAt(linePos);
|
427
443
|
}
|
428
444
|
}
|
@@ -1,17 +1,16 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
3
4
|
import org.embulk.config.ConfigException;
|
4
5
|
import org.embulk.config.ConfigLoader;
|
5
6
|
import org.embulk.config.ConfigSource;
|
6
|
-
import org.embulk.EmbulkTestRuntime;
|
7
7
|
import org.embulk.spi.Exec;
|
8
8
|
import org.junit.Rule;
|
9
|
-
import org.junit.rules.ExpectedException;
|
10
9
|
import org.junit.Test;
|
11
|
-
|
12
|
-
import static org.junit.Assert.assertFalse;
|
10
|
+
import org.junit.rules.ExpectedException;
|
13
11
|
|
14
12
|
import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
|
13
|
+
import static org.junit.Assert.assertFalse;
|
15
14
|
|
16
15
|
public class TestCsvGuessableParserPlugin
|
17
16
|
{
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-csv_guessable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- koooge
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,7 +57,6 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/guess/csv_guessable.rb
|
59
59
|
- lib/embulk/parser/csv_guessable.rb
|
60
|
-
- libs/embulk-standards-0.8.22.jar
|
61
60
|
- src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
|
62
61
|
- src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
|
63
62
|
- src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
|
@@ -67,9 +66,10 @@ files:
|
|
67
66
|
- src/test/resources/yml/original-csv.yml
|
68
67
|
- src/test/resources/yml/replace_column_name.yml
|
69
68
|
- classpath/commons-lang3-3.5.jar
|
70
|
-
- classpath/embulk-parser-csv_guessable-0.1.1.jar
|
71
69
|
- classpath/opencsv-3.9.jar
|
72
70
|
- classpath/commons-beanutils-1.9.3.jar
|
71
|
+
- classpath/commons-compress-1.10.jar
|
72
|
+
- classpath/embulk-parser-csv_guessable-0.1.2.jar
|
73
73
|
- classpath/embulk-standards-0.8.22.jar
|
74
74
|
- classpath/commons-collections-3.2.2.jar
|
75
75
|
- classpath/commons-logging-1.2.jar
|
Binary file
|