embulk-parser-csv_guessable 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/build.gradle +2 -2
- data/gradlew.bat +84 -84
- data/src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java +58 -35
- data/src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java +67 -51
- data/src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java +3 -4
- metadata +4 -4
- data/libs/embulk-standards-0.8.22.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
|
4
|
+
data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
|
7
|
+
data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
|
data/README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Csv Guessable parser plugin for Embulk
|
2
|
-
**embulk-parser-csv_gussable** guesses and parses csv which has schema in header.
|
2
|
+
**embulk-parser-csv_guessable** (runtime) guesses and parses CSV files that have their schema in the header.
|
3
|
+
This plugin is useful when the schema of the target CSV changes frequently.
|
3
4
|
|
4
5
|
It can also behave like the original csv parser when no **embulk-parser-csv_guessable**-specific configs are set.
|
5
6
|
|
@@ -12,11 +13,11 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
|
|
12
13
|
|
13
14
|
- **schema_file**: filename which has schema. (string, default: `null`)
|
14
15
|
- **schema_line**: schema line in header. (integer, default: `"1"`)
|
15
|
-
- **columns**: Columns
|
16
|
+
- **(TODO)columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is the same as the original csv parser's `"columns"`. (hash, default: `null`)
|
16
17
|
- any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
|
17
18
|
|
18
19
|
## Example
|
19
|
-
|
20
|
+
test.csv
|
20
21
|
|
21
22
|
```csv
|
22
23
|
id, title, description
|
@@ -35,16 +36,31 @@ in:
|
|
35
36
|
schema_line: 1
|
36
37
|
```
|
37
38
|
|
39
|
+
(To explain)
|
40
|
+
In the case of the original csv parser:
|
41
|
+
config.yml
|
42
|
+
```yaml
|
43
|
+
in:
|
44
|
+
type: any file input plugin type
|
45
|
+
parser:
|
46
|
+
type: csv
|
47
|
+
skip_header_lines: 1
|
48
|
+
columns:
|
49
|
+
- {name: id, type: string}
|
50
|
+
- {name: title, type: string}
|
51
|
+
- {name: description, type: string}
|
52
|
+
```
|
53
|
+
|
38
54
|
<!--
|
39
55
|
(If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
|
40
56
|
-->
|
41
57
|
|
42
58
|
```
|
43
59
|
$ embulk gem install embulk-parser-csv_guessable
|
60
|
+
```
|
44
61
|
<!--
|
45
62
|
$ embulk guess -g csv_guessable config.yml -o guessed.yml
|
46
63
|
-->
|
47
|
-
```
|
48
64
|
|
49
65
|
## Build
|
50
66
|
|
data/build.gradle
CHANGED
@@ -13,14 +13,14 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.2"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
20
20
|
|
21
21
|
dependencies {
|
22
|
-
compile fileTree(dir: 'libs', include: '*.jar')
|
23
22
|
compile "org.embulk:embulk-core:0.8.22"
|
23
|
+
compile "org.embulk:embulk-standards:0.8.22"
|
24
24
|
compile "com.opencsv:opencsv:3.9"
|
25
25
|
provided "org.embulk:embulk-core:0.8.22"
|
26
26
|
testCompile "junit:junit:4.+"
|
data/gradlew.bat
CHANGED
@@ -1,84 +1,84 @@
|
|
1
|
-
@if "%DEBUG%" == "" @echo off
|
2
|
-
@rem ##########################################################################
|
3
|
-
@rem
|
4
|
-
@rem Gradle startup script for Windows
|
5
|
-
@rem
|
6
|
-
@rem ##########################################################################
|
7
|
-
|
8
|
-
@rem Set local scope for the variables with windows NT shell
|
9
|
-
if "%OS%"=="Windows_NT" setlocal
|
10
|
-
|
11
|
-
set DIRNAME=%~dp0
|
12
|
-
if "%DIRNAME%" == "" set DIRNAME=.
|
13
|
-
set APP_BASE_NAME=%~n0
|
14
|
-
set APP_HOME=%DIRNAME%
|
15
|
-
|
16
|
-
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
-
set DEFAULT_JVM_OPTS=
|
18
|
-
|
19
|
-
@rem Find java.exe
|
20
|
-
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
|
-
|
22
|
-
set JAVA_EXE=java.exe
|
23
|
-
%JAVA_EXE% -version >NUL 2>&1
|
24
|
-
if "%ERRORLEVEL%" == "0" goto init
|
25
|
-
|
26
|
-
echo.
|
27
|
-
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
28
|
-
echo.
|
29
|
-
echo Please set the JAVA_HOME variable in your environment to match the
|
30
|
-
echo location of your Java installation.
|
31
|
-
|
32
|
-
goto fail
|
33
|
-
|
34
|
-
:findJavaFromJavaHome
|
35
|
-
set JAVA_HOME=%JAVA_HOME:"=%
|
36
|
-
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
37
|
-
|
38
|
-
if exist "%JAVA_EXE%" goto init
|
39
|
-
|
40
|
-
echo.
|
41
|
-
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
42
|
-
echo.
|
43
|
-
echo Please set the JAVA_HOME variable in your environment to match the
|
44
|
-
echo location of your Java installation.
|
45
|
-
|
46
|
-
goto fail
|
47
|
-
|
48
|
-
:init
|
49
|
-
@rem Get command-line arguments, handling Windows variants
|
50
|
-
|
51
|
-
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
-
|
53
|
-
:win9xME_args
|
54
|
-
@rem Slurp the command line arguments.
|
55
|
-
set CMD_LINE_ARGS=
|
56
|
-
set _SKIP=2
|
57
|
-
|
58
|
-
:win9xME_args_slurp
|
59
|
-
if "x%~1" == "x" goto execute
|
60
|
-
|
61
|
-
set CMD_LINE_ARGS=%*
|
62
|
-
|
63
|
-
:execute
|
64
|
-
@rem Setup the command line
|
65
|
-
|
66
|
-
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
67
|
-
|
68
|
-
@rem Execute Gradle
|
69
|
-
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
70
|
-
|
71
|
-
:end
|
72
|
-
@rem End local scope for the variables with windows NT shell
|
73
|
-
if "%ERRORLEVEL%"=="0" goto mainEnd
|
74
|
-
|
75
|
-
:fail
|
76
|
-
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
77
|
-
rem the _cmd.exe /c_ return code!
|
78
|
-
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
79
|
-
exit /b 1
|
80
|
-
|
81
|
-
:mainEnd
|
82
|
-
if "%OS%"=="Windows_NT" endlocal
|
83
|
-
|
84
|
-
:omega
|
1
|
+
@if "%DEBUG%" == "" @echo off
|
2
|
+
@rem ##########################################################################
|
3
|
+
@rem
|
4
|
+
@rem Gradle startup script for Windows
|
5
|
+
@rem
|
6
|
+
@rem ##########################################################################
|
7
|
+
|
8
|
+
@rem Set local scope for the variables with windows NT shell
|
9
|
+
if "%OS%"=="Windows_NT" setlocal
|
10
|
+
|
11
|
+
set DIRNAME=%~dp0
|
12
|
+
if "%DIRNAME%" == "" set DIRNAME=.
|
13
|
+
set APP_BASE_NAME=%~n0
|
14
|
+
set APP_HOME=%DIRNAME%
|
15
|
+
|
16
|
+
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
17
|
+
set DEFAULT_JVM_OPTS=
|
18
|
+
|
19
|
+
@rem Find java.exe
|
20
|
+
if defined JAVA_HOME goto findJavaFromJavaHome
|
21
|
+
|
22
|
+
set JAVA_EXE=java.exe
|
23
|
+
%JAVA_EXE% -version >NUL 2>&1
|
24
|
+
if "%ERRORLEVEL%" == "0" goto init
|
25
|
+
|
26
|
+
echo.
|
27
|
+
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
28
|
+
echo.
|
29
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
30
|
+
echo location of your Java installation.
|
31
|
+
|
32
|
+
goto fail
|
33
|
+
|
34
|
+
:findJavaFromJavaHome
|
35
|
+
set JAVA_HOME=%JAVA_HOME:"=%
|
36
|
+
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
37
|
+
|
38
|
+
if exist "%JAVA_EXE%" goto init
|
39
|
+
|
40
|
+
echo.
|
41
|
+
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
42
|
+
echo.
|
43
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
44
|
+
echo location of your Java installation.
|
45
|
+
|
46
|
+
goto fail
|
47
|
+
|
48
|
+
:init
|
49
|
+
@rem Get command-line arguments, handling Windows variants
|
50
|
+
|
51
|
+
if not "%OS%" == "Windows_NT" goto win9xME_args
|
52
|
+
|
53
|
+
:win9xME_args
|
54
|
+
@rem Slurp the command line arguments.
|
55
|
+
set CMD_LINE_ARGS=
|
56
|
+
set _SKIP=2
|
57
|
+
|
58
|
+
:win9xME_args_slurp
|
59
|
+
if "x%~1" == "x" goto execute
|
60
|
+
|
61
|
+
set CMD_LINE_ARGS=%*
|
62
|
+
|
63
|
+
:execute
|
64
|
+
@rem Setup the command line
|
65
|
+
|
66
|
+
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
67
|
+
|
68
|
+
@rem Execute Gradle
|
69
|
+
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
70
|
+
|
71
|
+
:end
|
72
|
+
@rem End local scope for the variables with windows NT shell
|
73
|
+
if "%ERRORLEVEL%"=="0" goto mainEnd
|
74
|
+
|
75
|
+
:fail
|
76
|
+
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
77
|
+
rem the _cmd.exe /c_ return code!
|
78
|
+
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
79
|
+
exit /b 1
|
80
|
+
|
81
|
+
:mainEnd
|
82
|
+
if "%OS%"=="Windows_NT" endlocal
|
83
|
+
|
84
|
+
:omega
|
@@ -2,19 +2,10 @@ package org.embulk.parser.csv_guessable;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableSet;
|
5
|
-
import java.io.BufferedReader;
|
6
5
|
import com.opencsv.CSVReader; // TODO: use embulk's parser
|
7
|
-
import java.io.IOException;
|
8
|
-
import java.io.StringReader;
|
9
|
-
import java.nio.charset.StandardCharsets;
|
10
|
-
import java.nio.file.Files;
|
11
|
-
import java.nio.file.Path;
|
12
|
-
import java.util.ArrayList;
|
13
|
-
import org.slf4j.Logger;
|
14
6
|
|
15
7
|
import org.embulk.config.Config;
|
16
8
|
import org.embulk.config.ConfigDefault;
|
17
|
-
import org.embulk.config.ConfigDiff;
|
18
9
|
import org.embulk.config.ConfigException;
|
19
10
|
import org.embulk.config.ConfigSource;
|
20
11
|
import org.embulk.config.Task;
|
@@ -25,22 +16,31 @@ import org.embulk.spi.ColumnVisitor;
|
|
25
16
|
import org.embulk.spi.DataException;
|
26
17
|
import org.embulk.spi.Exec;
|
27
18
|
import org.embulk.spi.FileInput;
|
28
|
-
import org.embulk.spi.json.JsonParser;
|
29
|
-
import org.embulk.spi.json.JsonParseException;
|
30
19
|
import org.embulk.spi.PageBuilder;
|
31
20
|
import org.embulk.spi.PageOutput;
|
32
21
|
import org.embulk.spi.ParserPlugin;
|
33
22
|
import org.embulk.spi.Schema;
|
34
23
|
import org.embulk.spi.SchemaConfig;
|
35
|
-
import org.embulk.spi.
|
24
|
+
import org.embulk.spi.json.JsonParseException;
|
25
|
+
import org.embulk.spi.json.JsonParser;
|
36
26
|
import org.embulk.spi.time.TimestampParseException;
|
27
|
+
import org.embulk.spi.time.TimestampParser;
|
37
28
|
import org.embulk.spi.type.Types;
|
38
29
|
import org.embulk.spi.unit.LocalFile;
|
39
30
|
import org.embulk.spi.util.LineDecoder;
|
40
31
|
import org.embulk.spi.util.Timestamps;
|
41
|
-
|
42
32
|
import org.embulk.standards.CsvParserPlugin;
|
43
33
|
|
34
|
+
import org.slf4j.Logger;
|
35
|
+
|
36
|
+
import java.io.BufferedReader;
|
37
|
+
import java.io.IOException;
|
38
|
+
import java.io.StringReader;
|
39
|
+
import java.nio.charset.StandardCharsets;
|
40
|
+
import java.nio.file.Files;
|
41
|
+
import java.nio.file.Path;
|
42
|
+
import java.util.ArrayList;
|
43
|
+
|
44
44
|
public class CsvGuessableParserPlugin
|
45
45
|
extends CsvParserPlugin
|
46
46
|
{
|
@@ -136,7 +136,8 @@ public class CsvGuessableParserPlugin
|
|
136
136
|
if (task.getHeaderLine().isPresent()) {
|
137
137
|
// TODO: use 'columns' as hints for guess
|
138
138
|
throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
|
139
|
-
}
|
139
|
+
}
|
140
|
+
else { /* guess from header */
|
140
141
|
int schemaLine = task.getSchemaLine();
|
141
142
|
task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
|
142
143
|
|
@@ -146,7 +147,8 @@ public class CsvGuessableParserPlugin
|
|
146
147
|
log.debug(columns.toString());
|
147
148
|
schemaConfig = new SchemaConfig(columns);
|
148
149
|
}
|
149
|
-
}
|
150
|
+
}
|
151
|
+
else { /* embulk-parser-csv embulk */
|
150
152
|
// backward compatibility
|
151
153
|
if (task.getHeaderLine().isPresent()) {
|
152
154
|
if (task.getSkipHeaderLines() > 0) {
|
@@ -154,7 +156,8 @@ public class CsvGuessableParserPlugin
|
|
154
156
|
}
|
155
157
|
if (task.getHeaderLine().get()) {
|
156
158
|
task.setSkipHeaderLines(1);
|
157
|
-
}
|
159
|
+
}
|
160
|
+
else {
|
158
161
|
task.setSkipHeaderLines(0);
|
159
162
|
}
|
160
163
|
}
|
@@ -199,80 +202,96 @@ public class CsvGuessableParserPlugin
|
|
199
202
|
|
200
203
|
try {
|
201
204
|
schema.visitColumns(new ColumnVisitor() {
|
205
|
+
@Override
|
202
206
|
public void booleanColumn(Column column)
|
203
207
|
{
|
204
208
|
String v = nextColumn();
|
205
209
|
if (v == null) {
|
206
210
|
pageBuilder.setNull(column);
|
207
|
-
}
|
211
|
+
}
|
212
|
+
else {
|
208
213
|
pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
|
209
214
|
}
|
210
215
|
}
|
211
216
|
|
217
|
+
@Override
|
212
218
|
public void longColumn(Column column)
|
213
219
|
{
|
214
220
|
String v = nextColumn();
|
215
221
|
if (v == null) {
|
216
222
|
pageBuilder.setNull(column);
|
217
|
-
}
|
223
|
+
}
|
224
|
+
else {
|
218
225
|
try {
|
219
226
|
pageBuilder.setLong(column, Long.parseLong(v));
|
220
|
-
}
|
227
|
+
}
|
228
|
+
catch (NumberFormatException e) {
|
221
229
|
// TODO support default value
|
222
230
|
throw new CsvRecordValidateException(e);
|
223
231
|
}
|
224
232
|
}
|
225
233
|
}
|
226
234
|
|
235
|
+
@Override
|
227
236
|
public void doubleColumn(Column column)
|
228
237
|
{
|
229
238
|
String v = nextColumn();
|
230
239
|
if (v == null) {
|
231
240
|
pageBuilder.setNull(column);
|
232
|
-
}
|
241
|
+
}
|
242
|
+
else {
|
233
243
|
try {
|
234
244
|
pageBuilder.setDouble(column, Double.parseDouble(v));
|
235
|
-
}
|
245
|
+
}
|
246
|
+
catch (NumberFormatException e) {
|
236
247
|
// TODO support default value
|
237
248
|
throw new CsvRecordValidateException(e);
|
238
249
|
}
|
239
250
|
}
|
240
251
|
}
|
241
252
|
|
253
|
+
@Override
|
242
254
|
public void stringColumn(Column column)
|
243
255
|
{
|
244
256
|
String v = nextColumn();
|
245
257
|
if (v == null) {
|
246
258
|
pageBuilder.setNull(column);
|
247
|
-
}
|
259
|
+
}
|
260
|
+
else {
|
248
261
|
pageBuilder.setString(column, v);
|
249
262
|
}
|
250
263
|
}
|
251
264
|
|
265
|
+
@Override
|
252
266
|
public void timestampColumn(Column column)
|
253
267
|
{
|
254
268
|
String v = nextColumn();
|
255
269
|
if (v == null) {
|
256
270
|
pageBuilder.setNull(column);
|
257
|
-
}
|
271
|
+
}
|
272
|
+
else {
|
258
273
|
try {
|
259
274
|
// pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
|
260
|
-
}
|
275
|
+
}
|
276
|
+
catch (TimestampParseException e) {
|
261
277
|
// TODO support default value
|
262
278
|
throw new CsvRecordValidateException(e);
|
263
279
|
}
|
264
280
|
}
|
265
281
|
}
|
266
282
|
|
283
|
+
@Override
|
267
284
|
public void jsonColumn(Column column)
|
268
285
|
{
|
269
286
|
String v = nextColumn();
|
270
287
|
if (v == null) {
|
271
288
|
pageBuilder.setNull(column);
|
272
|
-
}
|
289
|
+
}
|
290
|
+
else {
|
273
291
|
try {
|
274
292
|
pageBuilder.setJson(column, jsonParser.parse(v));
|
275
|
-
}
|
293
|
+
}
|
294
|
+
catch (JsonParseException e) {
|
276
295
|
// TODO support default value
|
277
296
|
throw new CsvRecordValidateException(e);
|
278
297
|
}
|
@@ -291,19 +310,21 @@ public class CsvGuessableParserPlugin
|
|
291
310
|
|
292
311
|
try {
|
293
312
|
hasNextRecord = tokenizer.nextRecord();
|
294
|
-
}
|
313
|
+
}
|
314
|
+
catch (CsvTokenizer.TooManyColumnsException ex) {
|
295
315
|
if (allowExtraColumns) {
|
296
316
|
String tooManyColumnsLine = tokenizer.skipCurrentLine();
|
297
317
|
// TODO warning
|
298
318
|
hasNextRecord = tokenizer.nextRecord();
|
299
|
-
}
|
319
|
+
}
|
320
|
+
else {
|
300
321
|
// this line will be skipped at the following catch section
|
301
322
|
throw ex;
|
302
323
|
}
|
303
324
|
}
|
304
325
|
pageBuilder.addRecord();
|
305
|
-
|
306
|
-
|
326
|
+
}
|
327
|
+
catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
|
307
328
|
String skippedLine = tokenizer.skipCurrentLine();
|
308
329
|
long lineNumber = tokenizer.getCurrentLineNumber();
|
309
330
|
if (stopOnInvalidRecord) {
|
@@ -334,14 +355,15 @@ public class CsvGuessableParserPlugin
|
|
334
355
|
}
|
335
356
|
}
|
336
357
|
|
337
|
-
private String readHeader(Path path, int schemaLine)
|
358
|
+
private String readHeader(Path path, int schemaLine)
|
359
|
+
{
|
338
360
|
if (schemaLine <= 0) {
|
339
361
|
throw new ConfigException("'schemaLine' must be set '> 0'");
|
340
362
|
}
|
341
363
|
|
342
364
|
String line = null;
|
343
365
|
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
344
|
-
for (int i=1; i <= schemaLine; ++i) {
|
366
|
+
for (int i = 1; i <= schemaLine; ++i) {
|
345
367
|
line = br.readLine();
|
346
368
|
if (line == null) {
|
347
369
|
throw new ConfigException("not found 'schema_line' in 'schema_file'");
|
@@ -353,13 +375,14 @@ public class CsvGuessableParserPlugin
|
|
353
375
|
return line;
|
354
376
|
}
|
355
377
|
|
356
|
-
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
357
|
-
|
378
|
+
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
379
|
+
{
|
380
|
+
ArrayList columns = new ArrayList<ArrayList>();
|
358
381
|
PluginTask task = config.loadConfig(PluginTask.class);
|
359
382
|
|
360
383
|
try (CSVReader reader = new CSVReader(new StringReader(header))) {
|
361
384
|
String[] csv = reader.readNext();
|
362
|
-
for (String column: csv) {
|
385
|
+
for (String column : csv) {
|
363
386
|
columns.add(new ColumnConfig(column, Types.STRING, config));
|
364
387
|
}
|
365
388
|
} catch (IOException e) {
|
@@ -1,13 +1,14 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
3
|
import com.google.common.base.Preconditions;
|
4
|
-
import
|
5
|
-
import java.util.ArrayList;
|
6
|
-
import java.util.Deque;
|
7
|
-
import java.util.ArrayDeque;
|
4
|
+
import org.embulk.config.ConfigException;
|
8
5
|
import org.embulk.spi.DataException;
|
9
6
|
import org.embulk.spi.util.LineDecoder;
|
10
|
-
|
7
|
+
|
8
|
+
import java.util.ArrayDeque;
|
9
|
+
import java.util.ArrayList;
|
10
|
+
import java.util.Deque;
|
11
|
+
import java.util.List;
|
11
12
|
|
12
13
|
public class CsvTokenizer
|
13
14
|
{
|
@@ -50,11 +51,13 @@ public class CsvTokenizer
|
|
50
51
|
String delimiter = task.getDelimiter();
|
51
52
|
if (delimiter.length() == 0) {
|
52
53
|
throw new ConfigException("Empty delimiter is not allowed");
|
53
|
-
}
|
54
|
+
}
|
55
|
+
else {
|
54
56
|
this.delimiterChar = delimiter.charAt(0);
|
55
57
|
if (delimiter.length() > 1) {
|
56
58
|
delimiterFollowingString = delimiter.substring(1);
|
57
|
-
}
|
59
|
+
}
|
60
|
+
else {
|
58
61
|
delimiterFollowingString = null;
|
59
62
|
}
|
60
63
|
}
|
@@ -88,7 +91,8 @@ public class CsvTokenizer
|
|
88
91
|
String skippedLine;
|
89
92
|
if (quotedValueLines.isEmpty()) {
|
90
93
|
skippedLine = line;
|
91
|
-
}
|
94
|
+
}
|
95
|
+
else {
|
92
96
|
// recover lines of quoted value
|
93
97
|
skippedLine = quotedValueLines.remove(0); // TODO optimize performance
|
94
98
|
unreadLines.addAll(quotedValueLines);
|
@@ -129,7 +133,8 @@ public class CsvTokenizer
|
|
129
133
|
if (hasNext) {
|
130
134
|
recordState = RecordState.NOT_END;
|
131
135
|
return true;
|
132
|
-
}
|
136
|
+
}
|
137
|
+
else {
|
133
138
|
return false;
|
134
139
|
}
|
135
140
|
}
|
@@ -139,7 +144,8 @@ public class CsvTokenizer
|
|
139
144
|
while (true) {
|
140
145
|
if (!unreadLines.isEmpty()) {
|
141
146
|
line = unreadLines.removeFirst();
|
142
|
-
}
|
147
|
+
}
|
148
|
+
else {
|
143
149
|
line = input.poll();
|
144
150
|
if (line == null) {
|
145
151
|
return false;
|
@@ -189,7 +195,8 @@ public class CsvTokenizer
|
|
189
195
|
// empty value
|
190
196
|
if (delimiterFollowingString == null) {
|
191
197
|
return "";
|
192
|
-
}
|
198
|
+
}
|
199
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
193
200
|
linePos += delimiterFollowingString.length();
|
194
201
|
return "";
|
195
202
|
}
|
@@ -199,17 +206,17 @@ public class CsvTokenizer
|
|
199
206
|
// empty value
|
200
207
|
recordState = RecordState.END;
|
201
208
|
return "";
|
202
|
-
|
203
|
-
|
209
|
+
}
|
210
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
204
211
|
columnState = ColumnState.FIRST_TRIM;
|
205
|
-
|
206
|
-
|
212
|
+
}
|
213
|
+
else if (isQuote(c)) {
|
207
214
|
valueStartPos = linePos; // == 1
|
208
215
|
wasQuotedColumn = true;
|
209
216
|
quotedValue = new StringBuilder();
|
210
217
|
columnState = ColumnState.QUOTED_VALUE;
|
211
|
-
|
212
|
-
|
218
|
+
}
|
219
|
+
else {
|
213
220
|
columnState = ColumnState.VALUE;
|
214
221
|
}
|
215
222
|
break;
|
@@ -219,7 +226,8 @@ public class CsvTokenizer
|
|
219
226
|
// empty value
|
220
227
|
if (delimiterFollowingString == null) {
|
221
228
|
return "";
|
222
|
-
}
|
229
|
+
}
|
230
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
223
231
|
linePos += delimiterFollowingString.length();
|
224
232
|
return "";
|
225
233
|
}
|
@@ -229,18 +237,18 @@ public class CsvTokenizer
|
|
229
237
|
// empty value
|
230
238
|
recordState = RecordState.END;
|
231
239
|
return "";
|
232
|
-
|
233
|
-
|
240
|
+
}
|
241
|
+
else if (isQuote(c)) {
|
234
242
|
// column has heading spaces and quoted. TODO should this be rejected?
|
235
243
|
valueStartPos = linePos;
|
236
244
|
wasQuotedColumn = true;
|
237
245
|
quotedValue = new StringBuilder();
|
238
246
|
columnState = ColumnState.QUOTED_VALUE;
|
239
|
-
|
240
|
-
|
247
|
+
}
|
248
|
+
else if (isSpace(c)) {
|
241
249
|
// skip this character
|
242
|
-
|
243
|
-
|
250
|
+
}
|
251
|
+
else {
|
244
252
|
valueStartPos = linePos - 1;
|
245
253
|
columnState = ColumnState.VALUE;
|
246
254
|
}
|
@@ -250,7 +258,8 @@ public class CsvTokenizer
|
|
250
258
|
if (isDelimiter(c)) {
|
251
259
|
if (delimiterFollowingString == null) {
|
252
260
|
return line.substring(valueStartPos, linePos - 1);
|
253
|
-
}
|
261
|
+
}
|
262
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
254
263
|
String value = line.substring(valueStartPos, linePos - 1);
|
255
264
|
linePos += delimiterFollowingString.length();
|
256
265
|
return value;
|
@@ -260,8 +269,8 @@ public class CsvTokenizer
|
|
260
269
|
if (isEndOfLine(c)) {
|
261
270
|
recordState = RecordState.END;
|
262
271
|
return line.substring(valueStartPos, linePos);
|
263
|
-
|
264
|
-
|
272
|
+
}
|
273
|
+
else if (isSpace(c) && trimIfNotQuoted) {
|
265
274
|
valueEndPos = linePos - 1; // this is possibly end of value
|
266
275
|
columnState = ColumnState.LAST_TRIM_OR_VALUE;
|
267
276
|
|
@@ -270,8 +279,8 @@ public class CsvTokenizer
|
|
270
279
|
// // In RFC4180, If fields are not enclosed with double quotes, then
|
271
280
|
// // double quotes may not appear inside the fields. But they are often
|
272
281
|
// // included in the fields. We should care about them later.
|
273
|
-
|
274
|
-
|
282
|
+
}
|
283
|
+
else {
|
275
284
|
// keep VALUE state
|
276
285
|
}
|
277
286
|
break;
|
@@ -280,21 +289,23 @@ public class CsvTokenizer
|
|
280
289
|
if (isDelimiter(c)) {
|
281
290
|
if (delimiterFollowingString == null) {
|
282
291
|
return line.substring(valueStartPos, valueEndPos);
|
283
|
-
}
|
292
|
+
}
|
293
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
284
294
|
linePos += delimiterFollowingString.length();
|
285
295
|
return line.substring(valueStartPos, valueEndPos);
|
286
|
-
}
|
296
|
+
}
|
297
|
+
else {
|
287
298
|
// not a delimiter
|
288
299
|
}
|
289
300
|
}
|
290
301
|
if (isEndOfLine(c)) {
|
291
302
|
recordState = RecordState.END;
|
292
303
|
return line.substring(valueStartPos, valueEndPos);
|
293
|
-
|
294
|
-
|
304
|
+
}
|
305
|
+
else if (isSpace(c)) {
|
295
306
|
// keep LAST_TRIM_OR_VALUE state
|
296
|
-
|
297
|
-
|
307
|
+
}
|
308
|
+
else {
|
298
309
|
// this spaces are not trailing spaces. go back to VALUE state
|
299
310
|
columnState = ColumnState.VALUE;
|
300
311
|
}
|
@@ -310,18 +321,19 @@ public class CsvTokenizer
|
|
310
321
|
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
311
322
|
}
|
312
323
|
valueStartPos = 0;
|
313
|
-
|
314
|
-
|
324
|
+
}
|
325
|
+
else if (isQuote(c)) {
|
315
326
|
char next = peekNextChar();
|
316
327
|
if (isQuote(next)) { // escaped quote
|
317
328
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
318
329
|
valueStartPos = ++linePos;
|
319
|
-
}
|
330
|
+
}
|
331
|
+
else {
|
320
332
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
321
333
|
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
322
334
|
}
|
323
|
-
|
324
|
-
|
335
|
+
}
|
336
|
+
else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
325
337
|
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
326
338
|
char next = peekNextChar();
|
327
339
|
if (isEndOfLine(c)) {
|
@@ -332,15 +344,16 @@ public class CsvTokenizer
|
|
332
344
|
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
333
345
|
}
|
334
346
|
valueStartPos = 0;
|
335
|
-
}
|
347
|
+
}
|
348
|
+
else if (isQuote(next) || isEscape(next)) { // escaped quote
|
336
349
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
337
350
|
quotedValue.append(next);
|
338
351
|
valueStartPos = ++linePos;
|
339
352
|
}
|
340
|
-
|
341
|
-
|
353
|
+
}
|
354
|
+
else {
|
342
355
|
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
343
|
-
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
|
356
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
|
344
357
|
}
|
345
358
|
// keep QUOTED_VALUE state
|
346
359
|
}
|
@@ -350,7 +363,8 @@ public class CsvTokenizer
|
|
350
363
|
if (isDelimiter(c)) {
|
351
364
|
if (delimiterFollowingString == null) {
|
352
365
|
return quotedValue.toString();
|
353
|
-
}
|
366
|
+
}
|
367
|
+
else if (isDelimiterFollowingFrom(linePos)) {
|
354
368
|
linePos += delimiterFollowingString.length();
|
355
369
|
return quotedValue.toString();
|
356
370
|
}
|
@@ -359,11 +373,11 @@ public class CsvTokenizer
|
|
359
373
|
if (isEndOfLine(c)) {
|
360
374
|
recordState = RecordState.END;
|
361
375
|
return quotedValue.toString();
|
362
|
-
|
363
|
-
|
376
|
+
}
|
377
|
+
else if (isSpace(c)) {
|
364
378
|
// column has trailing spaces and quoted. TODO should this be rejected?
|
365
|
-
|
366
|
-
|
379
|
+
}
|
380
|
+
else {
|
367
381
|
throw new InvalidValueException(String.format("Unexpected extra character '%c' after a value quoted by '%c'", c, quote));
|
368
382
|
}
|
369
383
|
break;
|
@@ -411,7 +425,8 @@ public class CsvTokenizer
|
|
411
425
|
|
412
426
|
if (linePos >= line.length()) {
|
413
427
|
return END_OF_LINE;
|
414
|
-
}
|
428
|
+
}
|
429
|
+
else {
|
415
430
|
return line.charAt(linePos++);
|
416
431
|
}
|
417
432
|
}
|
@@ -422,7 +437,8 @@ public class CsvTokenizer
|
|
422
437
|
|
423
438
|
if (linePos >= line.length()) {
|
424
439
|
return END_OF_LINE;
|
425
|
-
}
|
440
|
+
}
|
441
|
+
else {
|
426
442
|
return line.charAt(linePos);
|
427
443
|
}
|
428
444
|
}
|
@@ -1,17 +1,16 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
3
4
|
import org.embulk.config.ConfigException;
|
4
5
|
import org.embulk.config.ConfigLoader;
|
5
6
|
import org.embulk.config.ConfigSource;
|
6
|
-
import org.embulk.EmbulkTestRuntime;
|
7
7
|
import org.embulk.spi.Exec;
|
8
8
|
import org.junit.Rule;
|
9
|
-
import org.junit.rules.ExpectedException;
|
10
9
|
import org.junit.Test;
|
11
|
-
|
12
|
-
import static org.junit.Assert.assertFalse;
|
10
|
+
import org.junit.rules.ExpectedException;
|
13
11
|
|
14
12
|
import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
|
13
|
+
import static org.junit.Assert.assertFalse;
|
15
14
|
|
16
15
|
public class TestCsvGuessableParserPlugin
|
17
16
|
{
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-csv_guessable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- koooge
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,7 +57,6 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/guess/csv_guessable.rb
|
59
59
|
- lib/embulk/parser/csv_guessable.rb
|
60
|
-
- libs/embulk-standards-0.8.22.jar
|
61
60
|
- src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
|
62
61
|
- src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
|
63
62
|
- src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
|
@@ -67,9 +66,10 @@ files:
|
|
67
66
|
- src/test/resources/yml/original-csv.yml
|
68
67
|
- src/test/resources/yml/replace_column_name.yml
|
69
68
|
- classpath/commons-lang3-3.5.jar
|
70
|
-
- classpath/embulk-parser-csv_guessable-0.1.1.jar
|
71
69
|
- classpath/opencsv-3.9.jar
|
72
70
|
- classpath/commons-beanutils-1.9.3.jar
|
71
|
+
- classpath/commons-compress-1.10.jar
|
72
|
+
- classpath/embulk-parser-csv_guessable-0.1.2.jar
|
73
73
|
- classpath/embulk-standards-0.8.22.jar
|
74
74
|
- classpath/commons-collections-3.2.2.jar
|
75
75
|
- classpath/commons-logging-1.2.jar
|
Binary file
|