embulk 0.6.10 → 0.6.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/classpath/{embulk-core-0.6.10.jar → embulk-core-0.6.11.jar} +0 -0
- data/classpath/{embulk-standards-0.6.10.jar → embulk-standards-0.6.11.jar} +0 -0
- data/embulk-cli/src/main/sh/selfrun.sh +96 -2
- data/embulk-cli/src/test/java/org/embulk/cli/DummyMain.java +23 -0
- data/embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java +281 -0
- data/embulk-docs/src/built-in.rst +3 -0
- data/embulk-docs/src/release/release-0.6.11.rst +19 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +4 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +7 -2
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +16 -4
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +14 -0
- data/lib/embulk/data/new/README.md.erb +1 -1
- data/lib/embulk/guess/csv.rb +34 -4
- data/lib/embulk/guess/time_format_guess.rb +6 -8
- data/lib/embulk/version.rb +1 -1
- data/test/guess/test_time_format_guess.rb +7 -0
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d45b70ffc7ebef537906a3f471d74f83d40f18ab
|
4
|
+
data.tar.gz: 792c390284e3b949d84b4b9860941620ba6d3f2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c297663fe08c2c1a83d68942facc2c00801fceab5acdf90a12dcef36b37eac272e10964ddf4f2dd6d869adab6cdeca9bc8925d763f51a6b27feac5c43b78ea65
|
7
|
+
data.tar.gz: ea1a1b4a7c3df3a6f61cb52f2487050d1f2b2a8abf1bf2616b9ee97b2f1d493e2a3b6135f1e32e3d5a0edf40d2a135615be309294e2bc7bdfd6a15c1808a8238
|
data/build.gradle
CHANGED
Binary file
|
Binary file
|
@@ -2,9 +2,103 @@
|
|
2
2
|
: <<BAT
|
3
3
|
@echo off
|
4
4
|
|
5
|
-
|
5
|
+
setlocal
|
6
|
+
|
7
|
+
set this=%~f0
|
8
|
+
set java_args=
|
9
|
+
set jruby_args=
|
10
|
+
set default_optimize=
|
11
|
+
set overwrite_optimize=
|
12
|
+
set status=
|
13
|
+
set error=
|
14
|
+
set args=
|
15
|
+
|
16
|
+
rem In jar file, cannot goto ahead for some reason.
|
17
|
+
|
18
|
+
for %%a in ( %* ) do (
|
19
|
+
call :check_arg %%a
|
20
|
+
)
|
21
|
+
|
22
|
+
if "%error%" == "true" exit /b 1
|
23
|
+
|
24
|
+
set optimize=false
|
25
|
+
if "%overwrite_optimize%" == "true" (
|
26
|
+
set optimize=true
|
27
|
+
) else (
|
28
|
+
if "%default_optimize%" == "true" (
|
29
|
+
if not "%overwrite_optimize%" == "false" (
|
30
|
+
set optimize=true
|
31
|
+
)
|
32
|
+
)
|
33
|
+
)
|
34
|
+
|
35
|
+
if "%optimize%" == "true" (
|
36
|
+
set java_args=-XX:+AggressiveOpts -XX:+UseConcMarkSweepGC %java_args%
|
37
|
+
) else (
|
38
|
+
set java_args=-XX:+AggressiveOpts -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xverify:none %java_args%
|
39
|
+
)
|
40
|
+
|
41
|
+
java %java_args% -jar %this% %jruby_args% %args%
|
42
|
+
|
43
|
+
endlocal
|
44
|
+
|
45
|
+
exit /b
|
46
|
+
|
47
|
+
:check_arg
|
48
|
+
set arg=%*
|
49
|
+
|
50
|
+
rem Remove double quotations
|
51
|
+
set p1=%arg:~0,1%
|
52
|
+
set p1=%p1:"=%
|
53
|
+
set p2=%arg:~-1,1%
|
54
|
+
set p2=%p2:"=%
|
55
|
+
set arg=%p1%%arg:~1,-1%%p2%
|
56
|
+
|
57
|
+
if "%status%" == "rest" (
|
58
|
+
set args=%args% %arg%
|
59
|
+
|
60
|
+
) else if "%status%" == "read" (
|
61
|
+
call :read_file %arg%
|
62
|
+
|
63
|
+
) else if "%arg%" == "-J+O" (
|
64
|
+
set overwrite_optimize=true
|
65
|
+
set status=rest
|
66
|
+
|
67
|
+
) else if "%arg%" == "-J-O" (
|
68
|
+
set overwrite_optimize=false
|
69
|
+
set status=rest
|
70
|
+
|
71
|
+
) else if "%arg:~0,2%" == "-J" (
|
72
|
+
if not "%arg:~2%" == "" (
|
73
|
+
set java_args=%java_args% %arg:~2%
|
74
|
+
) else (
|
75
|
+
set status=read
|
76
|
+
)
|
77
|
+
|
78
|
+
) else if "%arg:~0,2%" == "-R" (
|
79
|
+
set jruby_args=%jruby_args% %arg:~2%
|
80
|
+
|
81
|
+
) else if "%arg%" == "run" (
|
82
|
+
set default_optimize=true
|
83
|
+
set args=%args% %arg%
|
84
|
+
set status=rest
|
85
|
+
|
86
|
+
) else (
|
87
|
+
set args=%args% %arg%
|
88
|
+
set status=rest
|
89
|
+
)
|
90
|
+
exit /b
|
91
|
+
|
92
|
+
:read_file
|
93
|
+
if not exist "%~1" (
|
94
|
+
echo "failed to load java argument file."
|
95
|
+
set error=true
|
96
|
+
) else (
|
97
|
+
for /f "delims=" %%i in (%~1) do set java_args=%java_args% %%i
|
98
|
+
)
|
99
|
+
set status=
|
100
|
+
exit /b
|
6
101
|
|
7
|
-
exit /B
|
8
102
|
BAT
|
9
103
|
|
10
104
|
java_args=""
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.cli;
|
2
|
+
|
3
|
+
import java.io.BufferedWriter;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.OutputStreamWriter;
|
7
|
+
import java.nio.charset.Charset;
|
8
|
+
import java.util.Arrays;
|
9
|
+
|
10
|
+
public class DummyMain {
|
11
|
+
|
12
|
+
public static void main(String[] args) throws Exception {
|
13
|
+
System.out.println(Arrays.asList(args));
|
14
|
+
File thisFolder = new File(SelfrunTest.class.getResource("/org/embulk/cli/DummyMain.class").toURI()).getParentFile();
|
15
|
+
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(thisFolder, "args.txt")), Charset.defaultCharset()))) {
|
16
|
+
for (String arg : args) {
|
17
|
+
writer.write(arg);
|
18
|
+
writer.newLine();
|
19
|
+
}
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
}
|
@@ -0,0 +1,281 @@
|
|
1
|
+
package org.embulk.cli;
|
2
|
+
|
3
|
+
import static org.junit.Assert.assertEquals;
|
4
|
+
|
5
|
+
import java.io.BufferedReader;
|
6
|
+
import java.io.BufferedWriter;
|
7
|
+
import java.io.File;
|
8
|
+
import java.io.FileOutputStream;
|
9
|
+
import java.io.IOException;
|
10
|
+
import java.io.InputStreamReader;
|
11
|
+
import java.io.OutputStreamWriter;
|
12
|
+
import java.nio.charset.Charset;
|
13
|
+
import java.nio.file.FileSystem;
|
14
|
+
import java.nio.file.FileSystems;
|
15
|
+
import java.nio.file.Files;
|
16
|
+
import java.nio.file.StandardOpenOption;
|
17
|
+
import java.util.Arrays;
|
18
|
+
import java.util.List;
|
19
|
+
|
20
|
+
import org.junit.BeforeClass;
|
21
|
+
import org.junit.Test;
|
22
|
+
|
23
|
+
|
24
|
+
public class SelfrunTest {
|
25
|
+
|
26
|
+
private static File testSelfrun;
|
27
|
+
|
28
|
+
@BeforeClass
|
29
|
+
public static void prepare() throws Exception {
|
30
|
+
File selfrun = findSelfrun();
|
31
|
+
FileSystem fs = FileSystems.getDefault();
|
32
|
+
String line = new String(Files.readAllBytes(fs.getPath(selfrun.getAbsolutePath())), Charset.defaultCharset());
|
33
|
+
|
34
|
+
File thisFolder = new File(SelfrunTest.class.getResource("/org/embulk/cli/SelfrunTest.class").toURI()).getParentFile();
|
35
|
+
testSelfrun = new File(thisFolder, System.getProperty("file.separator").equals("\\") ? "selfrun.bat" : "selfrun.sh");
|
36
|
+
|
37
|
+
File classpath = thisFolder.getParentFile().getParentFile().getParentFile();
|
38
|
+
line = line.replaceAll("java ", "java -classpath " + classpath.getAbsolutePath().replaceAll("\\\\", "\\\\\\\\") + " org.embulk.cli.DummyMain ");
|
39
|
+
|
40
|
+
// Modify selfrun so that arguments are written in 'args.txt'.
|
41
|
+
Files.write(fs.getPath(testSelfrun.getAbsolutePath()), line.getBytes(Charset.defaultCharset()), StandardOpenOption.CREATE);
|
42
|
+
if (!testSelfrun.setExecutable(true)) {
|
43
|
+
throw new Exception("Cannot se executable.");
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
|
48
|
+
@Test
|
49
|
+
public void testNoArgument() throws Exception {
|
50
|
+
List<String> args = execute();
|
51
|
+
assertEquals(Arrays.asList(
|
52
|
+
"-XX:+AggressiveOpts",
|
53
|
+
"-XX:+TieredCompilation",
|
54
|
+
"-XX:TieredStopAtLevel=1",
|
55
|
+
"-Xverify:none",
|
56
|
+
"-jar",
|
57
|
+
testSelfrun.getAbsolutePath()),
|
58
|
+
args);
|
59
|
+
}
|
60
|
+
|
61
|
+
@Test
|
62
|
+
public void testArguments() throws Exception {
|
63
|
+
List<String> args = execute("a1", "a2", "\"a3=v3\"");
|
64
|
+
assertEquals(Arrays.asList(
|
65
|
+
"-XX:+AggressiveOpts",
|
66
|
+
"-XX:+TieredCompilation",
|
67
|
+
"-XX:TieredStopAtLevel=1",
|
68
|
+
"-Xverify:none",
|
69
|
+
"-jar",
|
70
|
+
testSelfrun.getAbsolutePath(),
|
71
|
+
"a1",
|
72
|
+
"a2",
|
73
|
+
"a3=v3"),
|
74
|
+
args);
|
75
|
+
}
|
76
|
+
|
77
|
+
@Test
|
78
|
+
public void testRun() throws Exception {
|
79
|
+
List<String> args = execute("run", "a1");
|
80
|
+
assertEquals(Arrays.asList(
|
81
|
+
"-XX:+AggressiveOpts",
|
82
|
+
"-XX:+UseConcMarkSweepGC",
|
83
|
+
"-jar",
|
84
|
+
testSelfrun.getAbsolutePath(),
|
85
|
+
"run",
|
86
|
+
"a1"),
|
87
|
+
args);
|
88
|
+
}
|
89
|
+
|
90
|
+
@Test
|
91
|
+
public void testJpO() throws Exception {
|
92
|
+
List<String> args = execute("-J+O", "a1", "a2");
|
93
|
+
assertEquals(Arrays.asList(
|
94
|
+
"-XX:+AggressiveOpts",
|
95
|
+
"-XX:+UseConcMarkSweepGC",
|
96
|
+
"-jar",
|
97
|
+
testSelfrun.getAbsolutePath(),
|
98
|
+
"a1",
|
99
|
+
"a2"),
|
100
|
+
args);
|
101
|
+
}
|
102
|
+
|
103
|
+
@Test
|
104
|
+
public void testJmO() throws Exception {
|
105
|
+
List<String> args = execute("-J-O", "a1", "a2");
|
106
|
+
assertEquals(Arrays.asList(
|
107
|
+
"-XX:+AggressiveOpts",
|
108
|
+
"-XX:+TieredCompilation",
|
109
|
+
"-XX:TieredStopAtLevel=1",
|
110
|
+
"-Xverify:none",
|
111
|
+
"-jar",
|
112
|
+
testSelfrun.getAbsolutePath(),
|
113
|
+
"a1",
|
114
|
+
"a2"),
|
115
|
+
args);
|
116
|
+
}
|
117
|
+
|
118
|
+
@Test
|
119
|
+
public void testR1() throws Exception {
|
120
|
+
List<String> args = execute("-Rr1", "a1", "a2");
|
121
|
+
assertEquals(Arrays.asList(
|
122
|
+
"-XX:+AggressiveOpts",
|
123
|
+
"-XX:+TieredCompilation",
|
124
|
+
"-XX:TieredStopAtLevel=1",
|
125
|
+
"-Xverify:none",
|
126
|
+
"-jar",
|
127
|
+
testSelfrun.getAbsolutePath(),
|
128
|
+
"r1",
|
129
|
+
"a1",
|
130
|
+
"a2"),
|
131
|
+
args);
|
132
|
+
}
|
133
|
+
|
134
|
+
@Test
|
135
|
+
public void testR2() throws Exception {
|
136
|
+
List<String> args = execute("\"-Rr1=v1\"", "\"-Rr2=v2\"", "a1", "a2");
|
137
|
+
assertEquals(Arrays.asList(
|
138
|
+
"-XX:+AggressiveOpts",
|
139
|
+
"-XX:+TieredCompilation",
|
140
|
+
"-XX:TieredStopAtLevel=1",
|
141
|
+
"-Xverify:none",
|
142
|
+
"-jar",
|
143
|
+
testSelfrun.getAbsolutePath(),
|
144
|
+
"r1=v1",
|
145
|
+
"r2=v2",
|
146
|
+
"a1",
|
147
|
+
"a2"),
|
148
|
+
args);
|
149
|
+
}
|
150
|
+
|
151
|
+
@Test
|
152
|
+
public void testRRun() throws Exception {
|
153
|
+
List<String> args = execute("-Rr1", "run", "a1");
|
154
|
+
assertEquals(Arrays.asList(
|
155
|
+
"-XX:+AggressiveOpts",
|
156
|
+
"-XX:+UseConcMarkSweepGC",
|
157
|
+
"-jar",
|
158
|
+
testSelfrun.getAbsolutePath(),
|
159
|
+
"r1",
|
160
|
+
"run",
|
161
|
+
"a1"),
|
162
|
+
args);
|
163
|
+
}
|
164
|
+
|
165
|
+
@Test
|
166
|
+
public void testJ1() throws Exception {
|
167
|
+
List<String> args = execute("-J-Dj1", "a1", "a2");
|
168
|
+
assertEquals(Arrays.asList(
|
169
|
+
"-XX:+AggressiveOpts",
|
170
|
+
"-XX:+TieredCompilation",
|
171
|
+
"-XX:TieredStopAtLevel=1",
|
172
|
+
"-Xverify:none",
|
173
|
+
"-Dj1",
|
174
|
+
"-jar",
|
175
|
+
testSelfrun.getAbsolutePath(),
|
176
|
+
"a1",
|
177
|
+
"a2"),
|
178
|
+
args);
|
179
|
+
}
|
180
|
+
|
181
|
+
@Test
|
182
|
+
public void testJ2() throws Exception {
|
183
|
+
List<String> args = execute("\"-J-Dj1=v1\"", "\"-J-Dj2=v2\"", "a1", "a2");
|
184
|
+
assertEquals(Arrays.asList(
|
185
|
+
"-XX:+AggressiveOpts",
|
186
|
+
"-XX:+TieredCompilation",
|
187
|
+
"-XX:TieredStopAtLevel=1",
|
188
|
+
"-Xverify:none",
|
189
|
+
"-Dj1=v1",
|
190
|
+
"-Dj2=v2",
|
191
|
+
"-jar",
|
192
|
+
testSelfrun.getAbsolutePath(),
|
193
|
+
"a1",
|
194
|
+
"a2"),
|
195
|
+
args);
|
196
|
+
}
|
197
|
+
|
198
|
+
@Test
|
199
|
+
public void testJR() throws Exception {
|
200
|
+
List<String> args = execute("-Jj1", "-Rr1", "a1", "a2");
|
201
|
+
assertEquals(Arrays.asList(
|
202
|
+
"-XX:+AggressiveOpts",
|
203
|
+
"-XX:+TieredCompilation",
|
204
|
+
"-XX:TieredStopAtLevel=1",
|
205
|
+
"-Xverify:none",
|
206
|
+
"j1",
|
207
|
+
"-jar",
|
208
|
+
testSelfrun.getAbsolutePath(),
|
209
|
+
"r1",
|
210
|
+
"a1",
|
211
|
+
"a2"),
|
212
|
+
args);
|
213
|
+
}
|
214
|
+
|
215
|
+
@Test
|
216
|
+
public void testJFile() throws Exception {
|
217
|
+
File javaArgsFile = new File(testSelfrun.getParentFile(), "java_args.txt");
|
218
|
+
FileSystem fs = FileSystems.getDefault();
|
219
|
+
Files.write(fs.getPath(javaArgsFile.getAbsolutePath()), "j1 j2 j3".getBytes(Charset.defaultCharset()), StandardOpenOption.CREATE);
|
220
|
+
|
221
|
+
List<String> args = execute("-J", javaArgsFile.getAbsolutePath(), "a1", "a2");
|
222
|
+
assertEquals(Arrays.asList(
|
223
|
+
"-XX:+AggressiveOpts",
|
224
|
+
"-XX:+TieredCompilation",
|
225
|
+
"-XX:TieredStopAtLevel=1",
|
226
|
+
"-Xverify:none",
|
227
|
+
"j1",
|
228
|
+
"j2",
|
229
|
+
"j3",
|
230
|
+
"-jar",
|
231
|
+
testSelfrun.getAbsolutePath(),
|
232
|
+
"a1",
|
233
|
+
"a2"),
|
234
|
+
args);
|
235
|
+
}
|
236
|
+
|
237
|
+
private List<String> execute(String... arguments) throws Exception {
|
238
|
+
File temp = new File(testSelfrun.getParentFile(), "call-" + testSelfrun.getName());
|
239
|
+
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(temp), Charset.defaultCharset()))) {
|
240
|
+
writer.write(testSelfrun.getAbsolutePath());
|
241
|
+
for (String argument : arguments) {
|
242
|
+
writer.write(" ");
|
243
|
+
writer.write(argument);
|
244
|
+
}
|
245
|
+
}
|
246
|
+
if (!temp.setExecutable(true)) {
|
247
|
+
throw new Exception("Cannot se executable.");
|
248
|
+
}
|
249
|
+
|
250
|
+
File argsFile = new File(testSelfrun.getParentFile(), "args.txt");
|
251
|
+
if (argsFile.exists()) {
|
252
|
+
if (!argsFile.delete()) {
|
253
|
+
throw new IOException("Cannot delete " + argsFile);
|
254
|
+
}
|
255
|
+
}
|
256
|
+
|
257
|
+
Process process = Runtime.getRuntime().exec(temp.getAbsolutePath());
|
258
|
+
int exitCode = process.waitFor();
|
259
|
+
if (exitCode != 0 || !argsFile.exists()) {
|
260
|
+
StringBuilder builder = new StringBuilder();
|
261
|
+
try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getErrorStream(), Charset.defaultCharset()))) {
|
262
|
+
builder.append(reader.readLine());
|
263
|
+
builder.append(System.getProperty("line.separator"));
|
264
|
+
}
|
265
|
+
throw new Exception(builder.toString());
|
266
|
+
}
|
267
|
+
|
268
|
+
FileSystem fs = FileSystems.getDefault();
|
269
|
+
List<String> args = Files.readAllLines(fs.getPath(argsFile.getAbsolutePath()), Charset.defaultCharset());
|
270
|
+
return args;
|
271
|
+
}
|
272
|
+
|
273
|
+
private static File findSelfrun() {
|
274
|
+
File folder = new File(".");
|
275
|
+
if (new File(folder, "embulk-cli").exists()) {
|
276
|
+
folder = new File(folder, "embulk-cli");
|
277
|
+
}
|
278
|
+
return new File(new File(new File(new File(folder, "src"), "main"), "sh"), "selfrun.sh");
|
279
|
+
}
|
280
|
+
|
281
|
+
}
|
@@ -143,6 +143,8 @@ Options
|
|
143
143
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
144
144
|
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
|
145
145
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
146
|
+
| comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
|
147
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
146
148
|
| allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
|
147
149
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
148
150
|
| allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
|
@@ -204,6 +206,7 @@ Example
|
|
204
206
|
escape: ''
|
205
207
|
null_string: 'NULL'
|
206
208
|
skip_header_lines: 1
|
209
|
+
comment_line_marker: '#'
|
207
210
|
columns:
|
208
211
|
- {name: id, type: long}
|
209
212
|
- {name: account, type: long}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
Release 0.6.11
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``input-file`` plugin sets ``last_path`` when there are no input files.
|
8
|
+
* ``parser-csv`` supports **comment_line_marker** option to skip lines starting with comment characters such as '#'.
|
9
|
+
* Fixed a bug where ``guess-csv`` guesses timestamp wrongly if order of date is month-day-year.
|
10
|
+
|
11
|
+
General Changes
|
12
|
+
------------------
|
13
|
+
|
14
|
+
* Command line execution supports ``-J``, ``-R``, ``-J+O``, and ``-J-O`` arguments on Windows (@hito4++)
|
15
|
+
|
16
|
+
|
17
|
+
Release Date
|
18
|
+
------------------
|
19
|
+
2015-05-30
|
data/embulk-docs/src/release.rst
CHANGED
@@ -75,6 +75,10 @@ public class CsvParserPlugin
|
|
75
75
|
@ConfigDefault("131072") //128kB
|
76
76
|
public long getMaxQuotedSizeLimit();
|
77
77
|
|
78
|
+
@Config("comment_line_marker")
|
79
|
+
@ConfigDefault("null")
|
80
|
+
public Optional<String> getCommentLineMarker();
|
81
|
+
|
78
82
|
@Config("allow_optional_columns")
|
79
83
|
@ConfigDefault("false")
|
80
84
|
public boolean getAllowOptionalColumns();
|
@@ -27,6 +27,7 @@ public class CsvTokenizer
|
|
27
27
|
private final String newline;
|
28
28
|
private final boolean trimIfNotQuoted;
|
29
29
|
private final long maxQuotedSizeLimit;
|
30
|
+
private final String commentLineMarker;
|
30
31
|
private final LineDecoder input;
|
31
32
|
|
32
33
|
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
@@ -46,6 +47,7 @@ public class CsvTokenizer
|
|
46
47
|
newline = task.getNewline().getString();
|
47
48
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
48
49
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
50
|
+
commentLineMarker = task.getCommentLineMarker().orNull();
|
49
51
|
this.input = input;
|
50
52
|
}
|
51
53
|
|
@@ -93,7 +95,7 @@ public class CsvTokenizer
|
|
93
95
|
}
|
94
96
|
}
|
95
97
|
|
96
|
-
private boolean nextLine(boolean
|
98
|
+
private boolean nextLine(boolean skipEmptyLine)
|
97
99
|
{
|
98
100
|
while (true) {
|
99
101
|
if (!unreadLines.isEmpty()) {
|
@@ -107,7 +109,10 @@ public class CsvTokenizer
|
|
107
109
|
linePos = 0;
|
108
110
|
lineNumber++;
|
109
111
|
|
110
|
-
|
112
|
+
boolean skip = skipEmptyLine && (
|
113
|
+
line.isEmpty() ||
|
114
|
+
(commentLineMarker != null && line.startsWith(commentLineMarker)));
|
115
|
+
if (!skip) {
|
111
116
|
return true;
|
112
117
|
}
|
113
118
|
}
|
@@ -79,10 +79,22 @@ public class LocalFileInputPlugin
|
|
79
79
|
|
80
80
|
control.run(taskSource, taskCount);
|
81
81
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
82
|
+
// build next config
|
83
|
+
ConfigDiff configDiff = Exec.newConfigDiff();
|
84
|
+
|
85
|
+
// last_path
|
86
|
+
if (task.getFiles().isEmpty()) {
|
87
|
+
// keep the last value
|
88
|
+
if (task.getLastPath().isPresent()) {
|
89
|
+
configDiff.set("last_path", task.getLastPath().get());
|
90
|
+
}
|
91
|
+
} else {
|
92
|
+
List<String> files = new ArrayList<String>(task.getFiles());
|
93
|
+
Collections.sort(files);
|
94
|
+
configDiff.set("last_path", files.get(files.size() - 1));
|
95
|
+
}
|
96
|
+
|
97
|
+
return configDiff;
|
86
98
|
}
|
87
99
|
|
88
100
|
@Override
|
@@ -226,6 +226,20 @@ public class TestCsvTokenizer
|
|
226
226
|
"\n\"a\\\"aa\",\"b,bb\\\"\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
|
227
227
|
}
|
228
228
|
|
229
|
+
@Test
|
230
|
+
public void testCommentLineMarker() throws Exception
|
231
|
+
{
|
232
|
+
config.set("comment_line_marker", JsonNodeFactory.instance.textNode("#"));
|
233
|
+
reloadPluginTask();
|
234
|
+
assertEquals(expectedRecords(2,
|
235
|
+
"aaa", "bbb",
|
236
|
+
"eee", "fff"),
|
237
|
+
parse(task,
|
238
|
+
"aaa,bbb",
|
239
|
+
"#ccc,ddd",
|
240
|
+
"eee,fff"));
|
241
|
+
}
|
242
|
+
|
229
243
|
@Test
|
230
244
|
public void trimNonQuotedValues() throws Exception
|
231
245
|
{
|
@@ -93,7 +93,7 @@ out:
|
|
93
93
|
(If guess supported) you don't have to write `<%= category %>:` section in the configuration file. After writing `in:` section, you can let embulk guess `<%= category %>:` section using this command:
|
94
94
|
|
95
95
|
```
|
96
|
-
$ embulk install <%= project_name %>
|
96
|
+
$ embulk gem install <%= project_name %>
|
97
97
|
$ embulk guess -g <%= name %> config.yml -o guessed.yml
|
98
98
|
```
|
99
99
|
%end
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -24,6 +24,11 @@ module Embulk
|
|
24
24
|
"\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
|
25
25
|
]
|
26
26
|
|
27
|
+
COMMENT_LINE_MARKER_CANDIDATES = [
|
28
|
+
"#",
|
29
|
+
"//",
|
30
|
+
]
|
31
|
+
|
27
32
|
MAX_SKIP_LINES = 10
|
28
33
|
NO_SKIP_DETECT_LINES = 10
|
29
34
|
|
@@ -56,9 +61,12 @@ module Embulk
|
|
56
61
|
end
|
57
62
|
|
58
63
|
sample_records = split_lines(parser_guessed, sample_lines, delim)
|
64
|
+
|
59
65
|
skip_header_lines = guess_skip_header_lines(sample_records)
|
60
66
|
sample_records = sample_records[skip_header_lines..-1]
|
61
67
|
|
68
|
+
comment_line_marker, sample_records = guess_comment_line_marker(sample_records)
|
69
|
+
|
62
70
|
first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
|
63
71
|
other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
|
64
72
|
|
@@ -75,6 +83,8 @@ module Embulk
|
|
75
83
|
parser_guessed["skip_header_lines"] = skip_header_lines
|
76
84
|
end
|
77
85
|
|
86
|
+
parser_guessed["comment_line_marker"] = comment_line_marker # always set comment_line_marker even if it's null
|
87
|
+
|
78
88
|
parser_guessed["allow_extra_columns"] = false
|
79
89
|
parser_guessed["allow_optional_columns"] = false
|
80
90
|
|
@@ -113,7 +123,10 @@ module Embulk
|
|
113
123
|
columns = []
|
114
124
|
while true
|
115
125
|
begin
|
116
|
-
|
126
|
+
column = tokenizer.nextColumn
|
127
|
+
quoted = tokenizer.wasQuotedColumn
|
128
|
+
column.define_singleton_method(:quoted?) { quoted }
|
129
|
+
columns << column
|
117
130
|
rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
|
118
131
|
rows << columns
|
119
132
|
break
|
@@ -200,21 +213,38 @@ module Embulk
|
|
200
213
|
count = counts.inject(0) {|r,c| r + c }
|
201
214
|
[str, count]
|
202
215
|
end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
|
203
|
-
|
204
|
-
return
|
216
|
+
found_str, found_count = guessed.first
|
217
|
+
return found_str ? found_str : nil
|
205
218
|
end
|
206
219
|
|
207
220
|
def guess_skip_header_lines(sample_records)
|
208
221
|
counts = sample_records.map {|records| records.size }
|
209
222
|
(1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
|
210
223
|
check_row_count = counts[i-1]
|
211
|
-
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c
|
224
|
+
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= check_row_count }
|
212
225
|
return i - 1
|
213
226
|
end
|
214
227
|
end
|
215
228
|
return 0
|
216
229
|
end
|
217
230
|
|
231
|
+
def guess_comment_line_marker(sample_records)
|
232
|
+
guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
|
233
|
+
regexp = /^#{Regexp.quote(str)}/
|
234
|
+
records = sample_records.reject do |records|
|
235
|
+
!records[0].quoted? && !NULL_STRING_CANDIDATES.include?(records[0]) && records[0] =~ regexp
|
236
|
+
end
|
237
|
+
count = sample_records.size - records.size
|
238
|
+
[str, count, records]
|
239
|
+
end.select {|str,count,records| count > 0 }.sort_by {|str,count,records| -count }
|
240
|
+
found_str, found_count, found_records = guessed.first
|
241
|
+
if found_str
|
242
|
+
return found_str, found_records
|
243
|
+
else
|
244
|
+
return nil, sample_records
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
218
248
|
def array_sum(array)
|
219
249
|
array.inject(0) {|r,i| r += i }
|
220
250
|
end
|
@@ -196,12 +196,12 @@ module Embulk::Guess
|
|
196
196
|
|
197
197
|
parts << :year
|
198
198
|
part_options << nil
|
199
|
-
delimiters << date_delim
|
200
199
|
|
200
|
+
delimiters << date_delim
|
201
201
|
parts << :month
|
202
202
|
part_options << part_heading_option(dm["month"])
|
203
|
-
delimiters << date_delim
|
204
203
|
|
204
|
+
delimiters << date_delim
|
205
205
|
parts << :day
|
206
206
|
part_options << part_heading_option(dm["day"])
|
207
207
|
|
@@ -210,30 +210,28 @@ module Embulk::Guess
|
|
210
210
|
|
211
211
|
parts << :month
|
212
212
|
part_options << part_heading_option(dm["month"])
|
213
|
-
delimiters << date_delim
|
214
213
|
|
214
|
+
delimiters << date_delim
|
215
215
|
parts << :day
|
216
216
|
part_options << part_heading_option(dm["day"])
|
217
|
-
delimiters << date_delim
|
218
217
|
|
218
|
+
delimiters << date_delim
|
219
219
|
parts << :year
|
220
220
|
part_options << nil
|
221
|
-
delimiters << date_delim
|
222
221
|
|
223
222
|
elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) or /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
|
224
223
|
date_delim = dm["date_delim"] rescue ""
|
225
224
|
|
226
225
|
parts << :day
|
227
226
|
part_options << part_heading_option(dm["day"])
|
228
|
-
delimiters << date_delim
|
229
227
|
|
228
|
+
delimiters << date_delim
|
230
229
|
parts << :month
|
231
230
|
part_options << part_heading_option(dm["month"])
|
232
|
-
delimiters << date_delim
|
233
231
|
|
232
|
+
delimiters << date_delim
|
234
233
|
parts << :year
|
235
234
|
part_options << nil
|
236
|
-
delimiters << date_delim
|
237
235
|
|
238
236
|
else
|
239
237
|
date_delim = ""
|
data/lib/embulk/version.rb
CHANGED
@@ -45,6 +45,13 @@ class TimeFormatGuessTest < ::Test::Unit::TestCase
|
|
45
45
|
assert_guess "%m.%d.%Y", "01.01.2014"
|
46
46
|
assert_guess "%d/%m/%Y", "13/01/2014"
|
47
47
|
assert_guess "%d/%m/%Y", "21/01/2014"
|
48
|
+
|
49
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%N", "21/01/2014 01-01-01,000000001"
|
50
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%N", "21/01/2014 01-01-01,000001"
|
51
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%L", "21/01/2014 01-01-01,001"
|
52
|
+
assert_guess "%d/%m/%Y %H-%M-%S", "21/01/2014 01-01-01"
|
53
|
+
assert_guess "%d/%m/%Y %H-%M", "21/01/2014 01-01"
|
54
|
+
assert_guess "%d/%m/%Y", "21/01/2014"
|
48
55
|
end
|
49
56
|
|
50
57
|
def test_format_borders
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.10
|
4
|
+
version: 0.6.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,6 +100,8 @@ files:
|
|
100
100
|
- embulk-cli/build.gradle
|
101
101
|
- embulk-cli/src/main/java/org/embulk/cli/Main.java
|
102
102
|
- embulk-cli/src/main/sh/selfrun.sh
|
103
|
+
- embulk-cli/src/test/java/org/embulk/cli/DummyMain.java
|
104
|
+
- embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java
|
103
105
|
- embulk-core/build.gradle
|
104
106
|
- embulk-core/src/main/java/org/embulk/EmbulkService.java
|
105
107
|
- embulk-core/src/main/java/org/embulk/command/PreviewPrinter.java
|
@@ -294,6 +296,7 @@ files:
|
|
294
296
|
- embulk-docs/src/release/release-0.6.0.rst
|
295
297
|
- embulk-docs/src/release/release-0.6.1.rst
|
296
298
|
- embulk-docs/src/release/release-0.6.10.rst
|
299
|
+
- embulk-docs/src/release/release-0.6.11.rst
|
297
300
|
- embulk-docs/src/release/release-0.6.2.rst
|
298
301
|
- embulk-docs/src/release/release-0.6.3.rst
|
299
302
|
- embulk-docs/src/release/release-0.6.4.rst
|
@@ -409,8 +412,8 @@ files:
|
|
409
412
|
- classpath/bval-jsr303-0.5.jar
|
410
413
|
- classpath/commons-beanutils-core-1.8.3.jar
|
411
414
|
- classpath/commons-lang3-3.1.jar
|
412
|
-
- classpath/embulk-core-0.6.10.jar
|
413
|
-
- classpath/embulk-standards-0.6.10.jar
|
415
|
+
- classpath/embulk-core-0.6.11.jar
|
416
|
+
- classpath/embulk-standards-0.6.11.jar
|
414
417
|
- classpath/guava-18.0.jar
|
415
418
|
- classpath/guice-4.0.jar
|
416
419
|
- classpath/guice-multibindings-4.0.jar
|