embulk 0.6.10 → 0.6.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/classpath/{embulk-core-0.6.10.jar → embulk-core-0.6.11.jar} +0 -0
- data/classpath/{embulk-standards-0.6.10.jar → embulk-standards-0.6.11.jar} +0 -0
- data/embulk-cli/src/main/sh/selfrun.sh +96 -2
- data/embulk-cli/src/test/java/org/embulk/cli/DummyMain.java +23 -0
- data/embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java +281 -0
- data/embulk-docs/src/built-in.rst +3 -0
- data/embulk-docs/src/release/release-0.6.11.rst +19 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +4 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +7 -2
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +16 -4
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +14 -0
- data/lib/embulk/data/new/README.md.erb +1 -1
- data/lib/embulk/guess/csv.rb +34 -4
- data/lib/embulk/guess/time_format_guess.rb +6 -8
- data/lib/embulk/version.rb +1 -1
- data/test/guess/test_time_format_guess.rb +7 -0
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d45b70ffc7ebef537906a3f471d74f83d40f18ab
|
4
|
+
data.tar.gz: 792c390284e3b949d84b4b9860941620ba6d3f2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c297663fe08c2c1a83d68942facc2c00801fceab5acdf90a12dcef36b37eac272e10964ddf4f2dd6d869adab6cdeca9bc8925d763f51a6b27feac5c43b78ea65
|
7
|
+
data.tar.gz: ea1a1b4a7c3df3a6f61cb52f2487050d1f2b2a8abf1bf2616b9ee97b2f1d493e2a3b6135f1e32e3d5a0edf40d2a135615be309294e2bc7bdfd6a15c1808a8238
|
data/build.gradle
CHANGED
Binary file
|
Binary file
|
@@ -2,9 +2,103 @@
|
|
2
2
|
: <<BAT
|
3
3
|
@echo off
|
4
4
|
|
5
|
-
|
5
|
+
setlocal
|
6
|
+
|
7
|
+
set this=%~f0
|
8
|
+
set java_args=
|
9
|
+
set jruby_args=
|
10
|
+
set default_optimize=
|
11
|
+
set overwrite_optimize=
|
12
|
+
set status=
|
13
|
+
set error=
|
14
|
+
set args=
|
15
|
+
|
16
|
+
rem In jar file, cannot goto ahread for some reason.
|
17
|
+
|
18
|
+
for %%a in ( %* ) do (
|
19
|
+
call :check_arg %%a
|
20
|
+
)
|
21
|
+
|
22
|
+
if "%error%" == "true" exit /b 1
|
23
|
+
|
24
|
+
set optimize=false
|
25
|
+
if "%overwrite_optimize%" == "true" (
|
26
|
+
set optimize=true
|
27
|
+
) else (
|
28
|
+
if "%default_optimize%" == "true" (
|
29
|
+
if not "%overwrite_optimize%" == "false" (
|
30
|
+
set optimize=true
|
31
|
+
)
|
32
|
+
)
|
33
|
+
)
|
34
|
+
|
35
|
+
if "%optimize%" == "true" (
|
36
|
+
set java_args=-XX:+AggressiveOpts -XX:+UseConcMarkSweepGC %java_args%
|
37
|
+
) else (
|
38
|
+
set java_args=-XX:+AggressiveOpts -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xverify:none %java_args%
|
39
|
+
)
|
40
|
+
|
41
|
+
java %java_args% -jar %this% %jruby_args% %args%
|
42
|
+
|
43
|
+
endlocal
|
44
|
+
|
45
|
+
exit /b
|
46
|
+
|
47
|
+
:check_arg
|
48
|
+
set arg=%*
|
49
|
+
|
50
|
+
rem Remove double quotations
|
51
|
+
set p1=%arg:~0,1%
|
52
|
+
set p1=%p1:"=%
|
53
|
+
set p2=%arg:~-1,1%
|
54
|
+
set p2=%p2:"=%
|
55
|
+
set arg=%p1%%arg:~1,-1%%p2%
|
56
|
+
|
57
|
+
if "%status%" == "rest" (
|
58
|
+
set args=%args% %arg%
|
59
|
+
|
60
|
+
) else if "%status%" == "read" (
|
61
|
+
call :read_file %arg%
|
62
|
+
|
63
|
+
) else if "%arg%" == "-J+O" (
|
64
|
+
set overwrite_optimize=true
|
65
|
+
set status=rest
|
66
|
+
|
67
|
+
) else if "%arg%" == "-J-O" (
|
68
|
+
set overwrite_optimize=false
|
69
|
+
set status=rest
|
70
|
+
|
71
|
+
) else if "%arg:~0,2%" == "-J" (
|
72
|
+
if not "%arg:~2%" == "" (
|
73
|
+
set java_args=%java_args% %arg:~2%
|
74
|
+
) else (
|
75
|
+
set status=read
|
76
|
+
)
|
77
|
+
|
78
|
+
) else if "%arg:~0,2%" == "-R" (
|
79
|
+
set jruby_args=%jruby_args% %arg:~2%
|
80
|
+
|
81
|
+
) else if "%arg%" == "run" (
|
82
|
+
set default_optimize=true
|
83
|
+
set args=%args% %arg%
|
84
|
+
set status=rest
|
85
|
+
|
86
|
+
) else (
|
87
|
+
set args=%args% %arg%
|
88
|
+
set status=rest
|
89
|
+
)
|
90
|
+
exit /b
|
91
|
+
|
92
|
+
:read_file
|
93
|
+
if not exist "%~1" (
|
94
|
+
echo "failed to load java argument file."
|
95
|
+
set error=true
|
96
|
+
) else (
|
97
|
+
for /f "delims=" %%i in (%~1) do set java_args=%java_args% %%i
|
98
|
+
)
|
99
|
+
set status=
|
100
|
+
exit /b
|
6
101
|
|
7
|
-
exit /B
|
8
102
|
BAT
|
9
103
|
|
10
104
|
java_args=""
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.cli;
|
2
|
+
|
3
|
+
import java.io.BufferedWriter;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.OutputStreamWriter;
|
7
|
+
import java.nio.charset.Charset;
|
8
|
+
import java.util.Arrays;
|
9
|
+
|
10
|
+
public class DummyMain {
|
11
|
+
|
12
|
+
public static void main(String[] args) throws Exception {
|
13
|
+
System.out.println(Arrays.asList(args));
|
14
|
+
File thisFolder = new File(SelfrunTest.class.getResource("/org/embulk/cli/DummyMain.class").toURI()).getParentFile();
|
15
|
+
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(thisFolder, "args.txt")), Charset.defaultCharset()))) {
|
16
|
+
for (String arg : args) {
|
17
|
+
writer.write(arg);
|
18
|
+
writer.newLine();
|
19
|
+
}
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
}
|
@@ -0,0 +1,281 @@
|
|
1
|
+
package org.embulk.cli;
|
2
|
+
|
3
|
+
import static org.junit.Assert.assertEquals;
|
4
|
+
|
5
|
+
import java.io.BufferedReader;
|
6
|
+
import java.io.BufferedWriter;
|
7
|
+
import java.io.File;
|
8
|
+
import java.io.FileOutputStream;
|
9
|
+
import java.io.IOException;
|
10
|
+
import java.io.InputStreamReader;
|
11
|
+
import java.io.OutputStreamWriter;
|
12
|
+
import java.nio.charset.Charset;
|
13
|
+
import java.nio.file.FileSystem;
|
14
|
+
import java.nio.file.FileSystems;
|
15
|
+
import java.nio.file.Files;
|
16
|
+
import java.nio.file.StandardOpenOption;
|
17
|
+
import java.util.Arrays;
|
18
|
+
import java.util.List;
|
19
|
+
|
20
|
+
import org.junit.BeforeClass;
|
21
|
+
import org.junit.Test;
|
22
|
+
|
23
|
+
|
24
|
+
public class SelfrunTest {
|
25
|
+
|
26
|
+
private static File testSelfrun;
|
27
|
+
|
28
|
+
@BeforeClass
|
29
|
+
public static void prepare() throws Exception {
|
30
|
+
File selfrun = findSelfrun();
|
31
|
+
FileSystem fs = FileSystems.getDefault();
|
32
|
+
String line = new String(Files.readAllBytes(fs.getPath(selfrun.getAbsolutePath())), Charset.defaultCharset());
|
33
|
+
|
34
|
+
File thisFolder = new File(SelfrunTest.class.getResource("/org/embulk/cli/SelfrunTest.class").toURI()).getParentFile();
|
35
|
+
testSelfrun = new File(thisFolder, System.getProperty("file.separator").equals("\\") ? "selfrun.bat" : "selfrun.sh");
|
36
|
+
|
37
|
+
File classpath = thisFolder.getParentFile().getParentFile().getParentFile();
|
38
|
+
line = line.replaceAll("java ", "java -classpath " + classpath.getAbsolutePath().replaceAll("\\\\", "\\\\\\\\") + " org.embulk.cli.DummyMain ");
|
39
|
+
|
40
|
+
// Modify selfrun so that arguments are written in 'args.txt' .
|
41
|
+
Files.write(fs.getPath(testSelfrun.getAbsolutePath()), line.getBytes(Charset.defaultCharset()), StandardOpenOption.CREATE);
|
42
|
+
if (!testSelfrun.setExecutable(true)) {
|
43
|
+
throw new Exception("Cannot se executable.");
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
|
48
|
+
@Test
|
49
|
+
public void testNoArgument() throws Exception {
|
50
|
+
List<String> args = execute();
|
51
|
+
assertEquals(Arrays.asList(
|
52
|
+
"-XX:+AggressiveOpts",
|
53
|
+
"-XX:+TieredCompilation",
|
54
|
+
"-XX:TieredStopAtLevel=1",
|
55
|
+
"-Xverify:none",
|
56
|
+
"-jar",
|
57
|
+
testSelfrun.getAbsolutePath()),
|
58
|
+
args);
|
59
|
+
}
|
60
|
+
|
61
|
+
@Test
|
62
|
+
public void testArguments() throws Exception {
|
63
|
+
List<String> args = execute("a1", "a2", "\"a3=v3\"");
|
64
|
+
assertEquals(Arrays.asList(
|
65
|
+
"-XX:+AggressiveOpts",
|
66
|
+
"-XX:+TieredCompilation",
|
67
|
+
"-XX:TieredStopAtLevel=1",
|
68
|
+
"-Xverify:none",
|
69
|
+
"-jar",
|
70
|
+
testSelfrun.getAbsolutePath(),
|
71
|
+
"a1",
|
72
|
+
"a2",
|
73
|
+
"a3=v3"),
|
74
|
+
args);
|
75
|
+
}
|
76
|
+
|
77
|
+
@Test
|
78
|
+
public void testRun() throws Exception {
|
79
|
+
List<String> args = execute("run", "a1");
|
80
|
+
assertEquals(Arrays.asList(
|
81
|
+
"-XX:+AggressiveOpts",
|
82
|
+
"-XX:+UseConcMarkSweepGC",
|
83
|
+
"-jar",
|
84
|
+
testSelfrun.getAbsolutePath(),
|
85
|
+
"run",
|
86
|
+
"a1"),
|
87
|
+
args);
|
88
|
+
}
|
89
|
+
|
90
|
+
@Test
|
91
|
+
public void testJpO() throws Exception {
|
92
|
+
List<String> args = execute("-J+O", "a1", "a2");
|
93
|
+
assertEquals(Arrays.asList(
|
94
|
+
"-XX:+AggressiveOpts",
|
95
|
+
"-XX:+UseConcMarkSweepGC",
|
96
|
+
"-jar",
|
97
|
+
testSelfrun.getAbsolutePath(),
|
98
|
+
"a1",
|
99
|
+
"a2"),
|
100
|
+
args);
|
101
|
+
}
|
102
|
+
|
103
|
+
@Test
|
104
|
+
public void testJmO() throws Exception {
|
105
|
+
List<String> args = execute("-J-O", "a1", "a2");
|
106
|
+
assertEquals(Arrays.asList(
|
107
|
+
"-XX:+AggressiveOpts",
|
108
|
+
"-XX:+TieredCompilation",
|
109
|
+
"-XX:TieredStopAtLevel=1",
|
110
|
+
"-Xverify:none",
|
111
|
+
"-jar",
|
112
|
+
testSelfrun.getAbsolutePath(),
|
113
|
+
"a1",
|
114
|
+
"a2"),
|
115
|
+
args);
|
116
|
+
}
|
117
|
+
|
118
|
+
@Test
|
119
|
+
public void testR1() throws Exception {
|
120
|
+
List<String> args = execute("-Rr1", "a1", "a2");
|
121
|
+
assertEquals(Arrays.asList(
|
122
|
+
"-XX:+AggressiveOpts",
|
123
|
+
"-XX:+TieredCompilation",
|
124
|
+
"-XX:TieredStopAtLevel=1",
|
125
|
+
"-Xverify:none",
|
126
|
+
"-jar",
|
127
|
+
testSelfrun.getAbsolutePath(),
|
128
|
+
"r1",
|
129
|
+
"a1",
|
130
|
+
"a2"),
|
131
|
+
args);
|
132
|
+
}
|
133
|
+
|
134
|
+
@Test
|
135
|
+
public void testR2() throws Exception {
|
136
|
+
List<String> args = execute("\"-Rr1=v1\"", "\"-Rr2=v2\"", "a1", "a2");
|
137
|
+
assertEquals(Arrays.asList(
|
138
|
+
"-XX:+AggressiveOpts",
|
139
|
+
"-XX:+TieredCompilation",
|
140
|
+
"-XX:TieredStopAtLevel=1",
|
141
|
+
"-Xverify:none",
|
142
|
+
"-jar",
|
143
|
+
testSelfrun.getAbsolutePath(),
|
144
|
+
"r1=v1",
|
145
|
+
"r2=v2",
|
146
|
+
"a1",
|
147
|
+
"a2"),
|
148
|
+
args);
|
149
|
+
}
|
150
|
+
|
151
|
+
@Test
|
152
|
+
public void testRRun() throws Exception {
|
153
|
+
List<String> args = execute("-Rr1", "run", "a1");
|
154
|
+
assertEquals(Arrays.asList(
|
155
|
+
"-XX:+AggressiveOpts",
|
156
|
+
"-XX:+UseConcMarkSweepGC",
|
157
|
+
"-jar",
|
158
|
+
testSelfrun.getAbsolutePath(),
|
159
|
+
"r1",
|
160
|
+
"run",
|
161
|
+
"a1"),
|
162
|
+
args);
|
163
|
+
}
|
164
|
+
|
165
|
+
@Test
|
166
|
+
public void testJ1() throws Exception {
|
167
|
+
List<String> args = execute("-J-Dj1", "a1", "a2");
|
168
|
+
assertEquals(Arrays.asList(
|
169
|
+
"-XX:+AggressiveOpts",
|
170
|
+
"-XX:+TieredCompilation",
|
171
|
+
"-XX:TieredStopAtLevel=1",
|
172
|
+
"-Xverify:none",
|
173
|
+
"-Dj1",
|
174
|
+
"-jar",
|
175
|
+
testSelfrun.getAbsolutePath(),
|
176
|
+
"a1",
|
177
|
+
"a2"),
|
178
|
+
args);
|
179
|
+
}
|
180
|
+
|
181
|
+
@Test
|
182
|
+
public void testJ2() throws Exception {
|
183
|
+
List<String> args = execute("\"-J-Dj1=v1\"", "\"-J-Dj2=v2\"", "a1", "a2");
|
184
|
+
assertEquals(Arrays.asList(
|
185
|
+
"-XX:+AggressiveOpts",
|
186
|
+
"-XX:+TieredCompilation",
|
187
|
+
"-XX:TieredStopAtLevel=1",
|
188
|
+
"-Xverify:none",
|
189
|
+
"-Dj1=v1",
|
190
|
+
"-Dj2=v2",
|
191
|
+
"-jar",
|
192
|
+
testSelfrun.getAbsolutePath(),
|
193
|
+
"a1",
|
194
|
+
"a2"),
|
195
|
+
args);
|
196
|
+
}
|
197
|
+
|
198
|
+
@Test
|
199
|
+
public void testJR() throws Exception {
|
200
|
+
List<String> args = execute("-Jj1", "-Rr1", "a1", "a2");
|
201
|
+
assertEquals(Arrays.asList(
|
202
|
+
"-XX:+AggressiveOpts",
|
203
|
+
"-XX:+TieredCompilation",
|
204
|
+
"-XX:TieredStopAtLevel=1",
|
205
|
+
"-Xverify:none",
|
206
|
+
"j1",
|
207
|
+
"-jar",
|
208
|
+
testSelfrun.getAbsolutePath(),
|
209
|
+
"r1",
|
210
|
+
"a1",
|
211
|
+
"a2"),
|
212
|
+
args);
|
213
|
+
}
|
214
|
+
|
215
|
+
@Test
|
216
|
+
public void testJFile() throws Exception {
|
217
|
+
File javaArgsFile = new File(testSelfrun.getParentFile(), "java_args.txt");
|
218
|
+
FileSystem fs = FileSystems.getDefault();
|
219
|
+
Files.write(fs.getPath(javaArgsFile.getAbsolutePath()), "j1 j2 j3".getBytes(Charset.defaultCharset()), StandardOpenOption.CREATE);
|
220
|
+
|
221
|
+
List<String> args = execute("-J", javaArgsFile.getAbsolutePath(), "a1", "a2");
|
222
|
+
assertEquals(Arrays.asList(
|
223
|
+
"-XX:+AggressiveOpts",
|
224
|
+
"-XX:+TieredCompilation",
|
225
|
+
"-XX:TieredStopAtLevel=1",
|
226
|
+
"-Xverify:none",
|
227
|
+
"j1",
|
228
|
+
"j2",
|
229
|
+
"j3",
|
230
|
+
"-jar",
|
231
|
+
testSelfrun.getAbsolutePath(),
|
232
|
+
"a1",
|
233
|
+
"a2"),
|
234
|
+
args);
|
235
|
+
}
|
236
|
+
|
237
|
+
private List<String> execute(String... arguments) throws Exception {
|
238
|
+
File temp = new File(testSelfrun.getParentFile(), "call-" + testSelfrun.getName());
|
239
|
+
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(temp), Charset.defaultCharset()))) {
|
240
|
+
writer.write(testSelfrun.getAbsolutePath());
|
241
|
+
for (String argument : arguments) {
|
242
|
+
writer.write(" ");
|
243
|
+
writer.write(argument);
|
244
|
+
}
|
245
|
+
}
|
246
|
+
if (!temp.setExecutable(true)) {
|
247
|
+
throw new Exception("Cannot se executable.");
|
248
|
+
}
|
249
|
+
|
250
|
+
File argsFile = new File(testSelfrun.getParentFile(), "args.txt");
|
251
|
+
if (argsFile.exists()) {
|
252
|
+
if (!argsFile.delete()) {
|
253
|
+
throw new IOException("Cannot delete " + argsFile);
|
254
|
+
}
|
255
|
+
}
|
256
|
+
|
257
|
+
Process process = Runtime.getRuntime().exec(temp.getAbsolutePath());
|
258
|
+
int exitCode = process.waitFor();
|
259
|
+
if (exitCode != 0 || !argsFile.exists()) {
|
260
|
+
StringBuilder builder = new StringBuilder();
|
261
|
+
try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getErrorStream(), Charset.defaultCharset()))) {
|
262
|
+
builder.append(reader.readLine());
|
263
|
+
builder.append(System.getProperty("line.separator"));
|
264
|
+
}
|
265
|
+
throw new Exception(builder.toString());
|
266
|
+
}
|
267
|
+
|
268
|
+
FileSystem fs = FileSystems.getDefault();
|
269
|
+
List<String> args = Files.readAllLines(fs.getPath(argsFile.getAbsolutePath()), Charset.defaultCharset());
|
270
|
+
return args;
|
271
|
+
}
|
272
|
+
|
273
|
+
private static File findSelfrun() {
|
274
|
+
File folder = new File(".");
|
275
|
+
if (new File(folder, "embulk-cli").exists()) {
|
276
|
+
folder = new File(folder, "embulk-cli");
|
277
|
+
}
|
278
|
+
return new File(new File(new File(new File(folder, "src"), "main"), "sh"), "selfrun.sh");
|
279
|
+
}
|
280
|
+
|
281
|
+
}
|
@@ -143,6 +143,8 @@ Options
|
|
143
143
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
144
144
|
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
|
145
145
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
146
|
+
| comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
|
147
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
146
148
|
| allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
|
147
149
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
148
150
|
| allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
|
@@ -204,6 +206,7 @@ Example
|
|
204
206
|
escape: ''
|
205
207
|
null_string: 'NULL'
|
206
208
|
skip_header_lines: 1
|
209
|
+
comment_line_marker: '#'
|
207
210
|
columns:
|
208
211
|
- {name: id, type: long}
|
209
212
|
- {name: account, type: long}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
Release 0.6.11
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``input-file`` plugin sets ``last_path`` when there are no input files.
|
8
|
+
* ``parser-csv`` supports **comment_line_marker** option to skip lines starting with comment characters such as '#'.
|
9
|
+
* Fixed a bug where ``guess-csv`` guesses timestamp wrongly if order of date is month-day-year.
|
10
|
+
|
11
|
+
General Changes
|
12
|
+
------------------
|
13
|
+
|
14
|
+
* Command line execution supports ``-J``, ``-R``, ``-J+O``, and ``-J-O`` arguments on Windows (@hito4++)
|
15
|
+
|
16
|
+
|
17
|
+
Release Date
|
18
|
+
------------------
|
19
|
+
2015-05-30
|
data/embulk-docs/src/release.rst
CHANGED
@@ -75,6 +75,10 @@ public class CsvParserPlugin
|
|
75
75
|
@ConfigDefault("131072") //128kB
|
76
76
|
public long getMaxQuotedSizeLimit();
|
77
77
|
|
78
|
+
@Config("comment_line_marker")
|
79
|
+
@ConfigDefault("null")
|
80
|
+
public Optional<String> getCommentLineMarker();
|
81
|
+
|
78
82
|
@Config("allow_optional_columns")
|
79
83
|
@ConfigDefault("false")
|
80
84
|
public boolean getAllowOptionalColumns();
|
@@ -27,6 +27,7 @@ public class CsvTokenizer
|
|
27
27
|
private final String newline;
|
28
28
|
private final boolean trimIfNotQuoted;
|
29
29
|
private final long maxQuotedSizeLimit;
|
30
|
+
private final String commentLineMarker;
|
30
31
|
private final LineDecoder input;
|
31
32
|
|
32
33
|
private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
|
@@ -46,6 +47,7 @@ public class CsvTokenizer
|
|
46
47
|
newline = task.getNewline().getString();
|
47
48
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
48
49
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
50
|
+
commentLineMarker = task.getCommentLineMarker().orNull();
|
49
51
|
this.input = input;
|
50
52
|
}
|
51
53
|
|
@@ -93,7 +95,7 @@ public class CsvTokenizer
|
|
93
95
|
}
|
94
96
|
}
|
95
97
|
|
96
|
-
private boolean nextLine(boolean
|
98
|
+
private boolean nextLine(boolean skipEmptyLine)
|
97
99
|
{
|
98
100
|
while (true) {
|
99
101
|
if (!unreadLines.isEmpty()) {
|
@@ -107,7 +109,10 @@ public class CsvTokenizer
|
|
107
109
|
linePos = 0;
|
108
110
|
lineNumber++;
|
109
111
|
|
110
|
-
|
112
|
+
boolean skip = skipEmptyLine && (
|
113
|
+
line.isEmpty() ||
|
114
|
+
(commentLineMarker != null && line.startsWith(commentLineMarker)));
|
115
|
+
if (!skip) {
|
111
116
|
return true;
|
112
117
|
}
|
113
118
|
}
|
@@ -79,10 +79,22 @@ public class LocalFileInputPlugin
|
|
79
79
|
|
80
80
|
control.run(taskSource, taskCount);
|
81
81
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
82
|
+
// build next config
|
83
|
+
ConfigDiff configDiff = Exec.newConfigDiff();
|
84
|
+
|
85
|
+
// last_path
|
86
|
+
if (task.getFiles().isEmpty()) {
|
87
|
+
// keep the last value
|
88
|
+
if (task.getLastPath().isPresent()) {
|
89
|
+
configDiff.set("last_path", task.getLastPath().get());
|
90
|
+
}
|
91
|
+
} else {
|
92
|
+
List<String> files = new ArrayList<String>(task.getFiles());
|
93
|
+
Collections.sort(files);
|
94
|
+
configDiff.set("last_path", files.get(files.size() - 1));
|
95
|
+
}
|
96
|
+
|
97
|
+
return configDiff;
|
86
98
|
}
|
87
99
|
|
88
100
|
@Override
|
@@ -226,6 +226,20 @@ public class TestCsvTokenizer
|
|
226
226
|
"\n\"a\\\"aa\",\"b,bb\\\"\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
|
227
227
|
}
|
228
228
|
|
229
|
+
@Test
|
230
|
+
public void testCommentLineMarker() throws Exception
|
231
|
+
{
|
232
|
+
config.set("comment_line_marker", JsonNodeFactory.instance.textNode("#"));
|
233
|
+
reloadPluginTask();
|
234
|
+
assertEquals(expectedRecords(2,
|
235
|
+
"aaa", "bbb",
|
236
|
+
"eee", "fff"),
|
237
|
+
parse(task,
|
238
|
+
"aaa,bbb",
|
239
|
+
"#ccc,ddd",
|
240
|
+
"eee,fff"));
|
241
|
+
}
|
242
|
+
|
229
243
|
@Test
|
230
244
|
public void trimNonQuotedValues() throws Exception
|
231
245
|
{
|
@@ -93,7 +93,7 @@ out:
|
|
93
93
|
(If guess supported) you don't have to write `<%= category %>:` section in the configuration file. After writing `in:` section, you can let embulk guess `<%= category %>:` section using this command:
|
94
94
|
|
95
95
|
```
|
96
|
-
$ embulk install <%= project_name %>
|
96
|
+
$ embulk gem install <%= project_name %>
|
97
97
|
$ embulk guess -g <%= name %> config.yml -o guessed.yml
|
98
98
|
```
|
99
99
|
%end
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -24,6 +24,11 @@ module Embulk
|
|
24
24
|
"\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
|
25
25
|
]
|
26
26
|
|
27
|
+
COMMENT_LINE_MARKER_CANDIDATES = [
|
28
|
+
"#",
|
29
|
+
"//",
|
30
|
+
]
|
31
|
+
|
27
32
|
MAX_SKIP_LINES = 10
|
28
33
|
NO_SKIP_DETECT_LINES = 10
|
29
34
|
|
@@ -56,9 +61,12 @@ module Embulk
|
|
56
61
|
end
|
57
62
|
|
58
63
|
sample_records = split_lines(parser_guessed, sample_lines, delim)
|
64
|
+
|
59
65
|
skip_header_lines = guess_skip_header_lines(sample_records)
|
60
66
|
sample_records = sample_records[skip_header_lines..-1]
|
61
67
|
|
68
|
+
comment_line_marker, sample_records = guess_comment_line_marker(sample_records)
|
69
|
+
|
62
70
|
first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
|
63
71
|
other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
|
64
72
|
|
@@ -75,6 +83,8 @@ module Embulk
|
|
75
83
|
parser_guessed["skip_header_lines"] = skip_header_lines
|
76
84
|
end
|
77
85
|
|
86
|
+
parser_guessed["comment_line_marker"] = comment_line_marker # always set comment_line_marker even if it's null
|
87
|
+
|
78
88
|
parser_guessed["allow_extra_columns"] = false
|
79
89
|
parser_guessed["allow_optional_columns"] = false
|
80
90
|
|
@@ -113,7 +123,10 @@ module Embulk
|
|
113
123
|
columns = []
|
114
124
|
while true
|
115
125
|
begin
|
116
|
-
|
126
|
+
column = tokenizer.nextColumn
|
127
|
+
quoted = tokenizer.wasQuotedColumn
|
128
|
+
column.define_singleton_method(:quoted?) { quoted }
|
129
|
+
columns << column
|
117
130
|
rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
|
118
131
|
rows << columns
|
119
132
|
break
|
@@ -200,21 +213,38 @@ module Embulk
|
|
200
213
|
count = counts.inject(0) {|r,c| r + c }
|
201
214
|
[str, count]
|
202
215
|
end.select {|str,count| count > 0 }.sort_by {|str,count| -count }
|
203
|
-
|
204
|
-
return
|
216
|
+
found_str, found_count = guessed.first
|
217
|
+
return found_str ? found_str : nil
|
205
218
|
end
|
206
219
|
|
207
220
|
def guess_skip_header_lines(sample_records)
|
208
221
|
counts = sample_records.map {|records| records.size }
|
209
222
|
(1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
|
210
223
|
check_row_count = counts[i-1]
|
211
|
-
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c
|
224
|
+
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= check_row_count }
|
212
225
|
return i - 1
|
213
226
|
end
|
214
227
|
end
|
215
228
|
return 0
|
216
229
|
end
|
217
230
|
|
231
|
+
def guess_comment_line_marker(sample_records)
|
232
|
+
guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
|
233
|
+
regexp = /^#{Regexp.quote(str)}/
|
234
|
+
records = sample_records.reject do |records|
|
235
|
+
!records[0].quoted? && !NULL_STRING_CANDIDATES.include?(records[0]) && records[0] =~ regexp
|
236
|
+
end
|
237
|
+
count = sample_records.size - records.size
|
238
|
+
[str, count, records]
|
239
|
+
end.select {|str,count,records| count > 0 }.sort_by {|str,count,records| -count }
|
240
|
+
found_str, found_count, found_records = guessed.first
|
241
|
+
if found_str
|
242
|
+
return found_str, found_records
|
243
|
+
else
|
244
|
+
return nil, sample_records
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
218
248
|
def array_sum(array)
|
219
249
|
array.inject(0) {|r,i| r += i }
|
220
250
|
end
|
@@ -196,12 +196,12 @@ module Embulk::Guess
|
|
196
196
|
|
197
197
|
parts << :year
|
198
198
|
part_options << nil
|
199
|
-
delimiters << date_delim
|
200
199
|
|
200
|
+
delimiters << date_delim
|
201
201
|
parts << :month
|
202
202
|
part_options << part_heading_option(dm["month"])
|
203
|
-
delimiters << date_delim
|
204
203
|
|
204
|
+
delimiters << date_delim
|
205
205
|
parts << :day
|
206
206
|
part_options << part_heading_option(dm["day"])
|
207
207
|
|
@@ -210,30 +210,28 @@ module Embulk::Guess
|
|
210
210
|
|
211
211
|
parts << :month
|
212
212
|
part_options << part_heading_option(dm["month"])
|
213
|
-
delimiters << date_delim
|
214
213
|
|
214
|
+
delimiters << date_delim
|
215
215
|
parts << :day
|
216
216
|
part_options << part_heading_option(dm["day"])
|
217
|
-
delimiters << date_delim
|
218
217
|
|
218
|
+
delimiters << date_delim
|
219
219
|
parts << :year
|
220
220
|
part_options << nil
|
221
|
-
delimiters << date_delim
|
222
221
|
|
223
222
|
elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) or /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
|
224
223
|
date_delim = dm["date_delim"] rescue ""
|
225
224
|
|
226
225
|
parts << :day
|
227
226
|
part_options << part_heading_option(dm["day"])
|
228
|
-
delimiters << date_delim
|
229
227
|
|
228
|
+
delimiters << date_delim
|
230
229
|
parts << :month
|
231
230
|
part_options << part_heading_option(dm["month"])
|
232
|
-
delimiters << date_delim
|
233
231
|
|
232
|
+
delimiters << date_delim
|
234
233
|
parts << :year
|
235
234
|
part_options << nil
|
236
|
-
delimiters << date_delim
|
237
235
|
|
238
236
|
else
|
239
237
|
date_delim = ""
|
data/lib/embulk/version.rb
CHANGED
@@ -45,6 +45,13 @@ class TimeFormatGuessTest < ::Test::Unit::TestCase
|
|
45
45
|
assert_guess "%m.%d.%Y", "01.01.2014"
|
46
46
|
assert_guess "%d/%m/%Y", "13/01/2014"
|
47
47
|
assert_guess "%d/%m/%Y", "21/01/2014"
|
48
|
+
|
49
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%N", "21/01/2014 01-01-01,000000001"
|
50
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%N", "21/01/2014 01-01-01,000001"
|
51
|
+
assert_guess "%d/%m/%Y %H-%M-%S,%L", "21/01/2014 01-01-01,001"
|
52
|
+
assert_guess "%d/%m/%Y %H-%M-%S", "21/01/2014 01-01-01"
|
53
|
+
assert_guess "%d/%m/%Y %H-%M", "21/01/2014 01-01"
|
54
|
+
assert_guess "%d/%m/%Y", "21/01/2014"
|
48
55
|
end
|
49
56
|
|
50
57
|
def test_format_borders
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.10
|
4
|
+
version: 0.6.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,6 +100,8 @@ files:
|
|
100
100
|
- embulk-cli/build.gradle
|
101
101
|
- embulk-cli/src/main/java/org/embulk/cli/Main.java
|
102
102
|
- embulk-cli/src/main/sh/selfrun.sh
|
103
|
+
- embulk-cli/src/test/java/org/embulk/cli/DummyMain.java
|
104
|
+
- embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java
|
103
105
|
- embulk-core/build.gradle
|
104
106
|
- embulk-core/src/main/java/org/embulk/EmbulkService.java
|
105
107
|
- embulk-core/src/main/java/org/embulk/command/PreviewPrinter.java
|
@@ -294,6 +296,7 @@ files:
|
|
294
296
|
- embulk-docs/src/release/release-0.6.0.rst
|
295
297
|
- embulk-docs/src/release/release-0.6.1.rst
|
296
298
|
- embulk-docs/src/release/release-0.6.10.rst
|
299
|
+
- embulk-docs/src/release/release-0.6.11.rst
|
297
300
|
- embulk-docs/src/release/release-0.6.2.rst
|
298
301
|
- embulk-docs/src/release/release-0.6.3.rst
|
299
302
|
- embulk-docs/src/release/release-0.6.4.rst
|
@@ -409,8 +412,8 @@ files:
|
|
409
412
|
- classpath/bval-jsr303-0.5.jar
|
410
413
|
- classpath/commons-beanutils-core-1.8.3.jar
|
411
414
|
- classpath/commons-lang3-3.1.jar
|
412
|
-
- classpath/embulk-core-0.6.10.jar
|
413
|
-
- classpath/embulk-standards-0.6.10.jar
|
415
|
+
- classpath/embulk-core-0.6.11.jar
|
416
|
+
- classpath/embulk-standards-0.6.11.jar
|
414
417
|
- classpath/guava-18.0.jar
|
415
418
|
- classpath/guice-4.0.jar
|
416
419
|
- classpath/guice-multibindings-4.0.jar
|