embulk 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -5
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +16 -3
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +4 -3
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +3 -3
- data/embulk-docs/src/index.rst +6 -0
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +1 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.5.2.rst +30 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +9 -4
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +35 -20
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +6 -1
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +4 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +2 -2
- data/lib/embulk/command/embulk_run.rb +1 -1
- data/lib/embulk/file_output.rb +2 -1
- data/lib/embulk/guess/csv.rb +52 -5
- data/lib/embulk/guess_plugin.rb +1 -0
- data/lib/embulk/plugin_registry.rb +0 -2
- data/lib/embulk/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c731d85a93052e3fa0e40a49c048f365e150eace
|
4
|
+
data.tar.gz: 5266d8207396e64c995d8fd6e2b9e60cc228041c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 658da0f21342555a4d267bfa5bd7ee048dddd875fdc45dff82d22fb7f00e19d1f735afed15549adf94f135ccae3ed327636a86b37a5b6a252de0821f4049a5a9
|
7
|
+
data.tar.gz: 7888c38856e8e3816eb819bc1b12f33852634882c7e40b68496ad5700cf7c1a202174e097cfeb1589f6847752e768d40fdfc9ef5d5465b698708882488c9aeeb
|
data/README.md
CHANGED
@@ -25,23 +25,31 @@ The single-file package is the simplest way to try Embulk. You can download the
|
|
25
25
|
|
26
26
|
### Linux & Mac & BSD
|
27
27
|
|
28
|
+
Embulk is a Java application. Please make sure that you have [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed.
|
29
|
+
|
28
30
|
Following 4 commands install embulk to your home directory:
|
29
31
|
|
30
32
|
```
|
31
|
-
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
33
|
+
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar
|
32
34
|
chmod +x ~/.embulk/bin/embulk
|
33
35
|
echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
|
34
36
|
source ~/.bashrc
|
35
37
|
```
|
36
38
|
|
39
|
+
Next step: [Trying examples](#trying-examples)
|
40
|
+
|
37
41
|
### Windows
|
38
42
|
|
43
|
+
Embulk is a Java application. Please make sure that you have [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed.
|
44
|
+
|
39
45
|
You can assume the jar file is a .bat file.
|
40
46
|
|
41
47
|
```
|
42
|
-
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
48
|
+
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -OutFile embulk.bat}"
|
43
49
|
```
|
44
50
|
|
51
|
+
Next step: [Trying examples](#trying-examples)
|
52
|
+
|
45
53
|
### Trying examples
|
46
54
|
|
47
55
|
Let's load a CSV file, for example. `embulk example` subcommand generates a csv file and config file for you.
|
@@ -53,6 +61,8 @@ embulk preview config.yml
|
|
53
61
|
embulk run config.yml
|
54
62
|
```
|
55
63
|
|
64
|
+
Next step: [Using plugins](#using-plugins)
|
65
|
+
|
56
66
|
### Using plugins
|
57
67
|
|
58
68
|
You can use plugins to load data from/to various systems and file formats.
|
@@ -63,14 +73,14 @@ embulk gem install embulk-output-postgres-json
|
|
63
73
|
embulk gem list
|
64
74
|
```
|
65
75
|
|
66
|
-
You can
|
76
|
+
You can find plugins at the [list of plugins by category](http://www.embulk.org/plugins/).
|
67
77
|
|
68
78
|
### Using plugin bundle
|
69
79
|
|
70
|
-
`embulk bundle` subcommand creates (or updates if already exists) a
|
80
|
+
The `embulk bundle` subcommand creates (or updates, if it already exists) a private (isolated) bundle of plugins.
|
71
81
|
You can use the bundle using `-b <bundle_dir>` option. `embulk bundle` also generates some example plugins to \<bundle_dir>/embulk/\*.rb directory.
|
72
82
|
|
73
|
-
See generated \<bundle_dir>/Gemfile file how to plugin bundles work.
|
83
|
+
See the generated \<bundle_dir>/Gemfile file to learn how plugin bundles work.
|
74
84
|
|
75
85
|
```
|
76
86
|
embulk bundle ./embulk_bundle
|
data/build.gradle
CHANGED
@@ -35,7 +35,7 @@ public class PooledBufferAllocator
|
|
35
35
|
extends Buffer
|
36
36
|
{
|
37
37
|
private ByteBuf buf;
|
38
|
-
private
|
38
|
+
private BufferReleasedBeforeAt doubleFreeCheck;
|
39
39
|
|
40
40
|
public NettyByteBufBuffer(ByteBuf buf)
|
41
41
|
{
|
@@ -46,13 +46,26 @@ public class PooledBufferAllocator
|
|
46
46
|
public void release()
|
47
47
|
{
|
48
48
|
if (doubleFreeCheck != null) {
|
49
|
-
doubleFreeCheck.printStackTrace();
|
49
|
+
new BufferDoubleReleasedException(doubleFreeCheck).printStackTrace();
|
50
50
|
}
|
51
51
|
if (buf != null) {
|
52
52
|
buf.release();
|
53
53
|
buf = null;
|
54
|
-
doubleFreeCheck = new
|
54
|
+
doubleFreeCheck = new BufferReleasedBeforeAt();
|
55
55
|
}
|
56
56
|
}
|
57
57
|
}
|
58
|
+
|
59
|
+
static class BufferReleasedBeforeAt
|
60
|
+
extends Throwable
|
61
|
+
{ }
|
62
|
+
|
63
|
+
static class BufferDoubleReleasedException
|
64
|
+
extends IllegalStateException
|
65
|
+
{
|
66
|
+
public BufferDoubleReleasedException(BufferReleasedBeforeAt releasedAt)
|
67
|
+
{
|
68
|
+
super("Detected double release() call of a buffer", releasedAt);
|
69
|
+
}
|
70
|
+
}
|
58
71
|
}
|
@@ -98,12 +98,13 @@ public class PreviewExecutor
|
|
98
98
|
{
|
99
99
|
InputPlugin input = newInputPlugin(task);
|
100
100
|
List<FilterPlugin> filterPlugins = newFilterPlugins(task);
|
101
|
-
Schema
|
101
|
+
Schema inputSchema = filterSchemas.get(0);
|
102
|
+
Schema outputSchema = filterSchemas.get(filterSchemas.size() - 1);
|
102
103
|
|
103
|
-
PageOutput out = new SamplingPageOutput(task.getSampleRows(),
|
104
|
+
PageOutput out = new SamplingPageOutput(task.getSampleRows(), outputSchema);
|
104
105
|
try {
|
105
106
|
out = Filters.open(filterPlugins, filterTasks, filterSchemas, out);
|
106
|
-
input.run(inputTask,
|
107
|
+
input.run(inputTask, inputSchema, 0, out);
|
107
108
|
} finally {
|
108
109
|
out.close();
|
109
110
|
}
|
@@ -46,17 +46,17 @@ public class SamplingParserPlugin
|
|
46
46
|
throw new SampledNoticeError(buffer);
|
47
47
|
}
|
48
48
|
|
49
|
-
public static Buffer runFileInputSampling(final FileInputRunner
|
49
|
+
public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig)
|
50
50
|
{
|
51
51
|
// override in.parser.type so that FileInputRunner creates GuessParserPlugin
|
52
52
|
ConfigSource samplingInputConfig = inputConfig.deepCopy();
|
53
53
|
samplingInputConfig.getNestedOrSetEmpty("parser").set("type", "system_sampling");
|
54
54
|
|
55
55
|
try {
|
56
|
-
|
56
|
+
runner.transaction(samplingInputConfig, new InputPlugin.Control() {
|
57
57
|
public List<CommitReport> run(TaskSource taskSource, Schema schema, int taskCount)
|
58
58
|
{
|
59
|
-
|
59
|
+
runner.run(taskSource, schema, 0, new PageOutput() {
|
60
60
|
@Override
|
61
61
|
public void add(Page page)
|
62
62
|
{
|
data/embulk-docs/src/index.rst
CHANGED
@@ -8,6 +8,12 @@ Embulk documentation
|
|
8
8
|
|
9
9
|
https://github.com/embulk/embulk
|
10
10
|
|
11
|
+
* `Quick Start <https://github.com/embulk/embulk#quick-start>`_
|
12
|
+
|
13
|
+
* `Linux and Mac OS X <https://github.com/embulk/embulk#linux--mac--bsd>`_
|
14
|
+
|
15
|
+
* `Windows <https://github.com/embulk/embulk#windows>`_
|
16
|
+
|
11
17
|
* `List of Plugins by Category <http://www.embulk.org/plugins/>`_
|
12
18
|
|
13
19
|
.. toctree::
|
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
|
|
56
56
|
|
57
57
|
.. code-block:: console
|
58
58
|
|
59
|
-
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
59
|
+
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -O /usr/local/bin/embulk
|
60
60
|
$ sudo chmod +x /usr/local/bin/embulk
|
61
61
|
|
62
62
|
Step 2. Install Elasticsearch plugin
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,30 @@
|
|
1
|
+
Release 0.5.2
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``parser-csv`` plugin supports the ``skip_header_lines`` parameter to skip the first few lines of a file.
|
8
|
+
|
9
|
+
* ``header_line`` parameter is now obsolete. Although the parameter still works for backward compatibility, setting both ``header_line`` and ``skip_header_lines`` is a configuration error.
|
10
|
+
|
11
|
+
* ``guess-csv`` plugin guesses first ignorable lines and sets ``skip_header_lines`` parameter automatically.
|
12
|
+
|
13
|
+
* ``guess-csv`` plugin guesses quoted column names correctly.
|
14
|
+
|
15
|
+
* ``formatter-csv`` plugin supports ``delimiter`` parameter (@hiroyuki-sato++).
|
16
|
+
|
17
|
+
* ``output-stdout`` fixed warning messages due to double-release of buffers.
|
18
|
+
|
19
|
+
|
20
|
+
General Changes
|
21
|
+
------------------
|
22
|
+
|
23
|
+
* Improved error message when double-release of a ``spi.Buffer`` is detected.
|
24
|
+
* Fixed ``preview`` when a filter plugin changes schema (@llibra++).
|
25
|
+
* Fixed infinite loop at ``Embulk::FileOutput#flush`` (@goronao++). It happened if a formatter plugin written in Ruby writes more than 32KB of data.
|
26
|
+
|
27
|
+
|
28
|
+
Release Date
|
29
|
+
------------------
|
30
|
+
2015-03-11
|
@@ -31,6 +31,10 @@ public class CsvFormatterPlugin
|
|
31
31
|
@Config("header_line")
|
32
32
|
@ConfigDefault("true")
|
33
33
|
public boolean getHeaderLine();
|
34
|
+
|
35
|
+
@Config("delimiter")
|
36
|
+
@ConfigDefault("\",\"")
|
37
|
+
public String getDelimiterChar();
|
34
38
|
}
|
35
39
|
|
36
40
|
@Override
|
@@ -62,13 +66,14 @@ public class CsvFormatterPlugin
|
|
62
66
|
final LineEncoder encoder = new LineEncoder(output, task);
|
63
67
|
final Map<Integer, TimestampFormatter> timestampFormatters =
|
64
68
|
newTimestampFormatters(task, schema);
|
69
|
+
final String delimiter = task.getDelimiterChar();
|
65
70
|
|
66
71
|
// create a file
|
67
72
|
encoder.nextFile();
|
68
73
|
|
69
74
|
// write header
|
70
75
|
if (task.getHeaderLine()) {
|
71
|
-
writeHeader(schema, encoder);
|
76
|
+
writeHeader(schema, encoder, delimiter);
|
72
77
|
}
|
73
78
|
|
74
79
|
return new PageOutput() {
|
@@ -124,7 +129,7 @@ public class CsvFormatterPlugin
|
|
124
129
|
private void addDelimiter(Column column)
|
125
130
|
{
|
126
131
|
if (column.getIndex() != 0) {
|
127
|
-
encoder.addText(
|
132
|
+
encoder.addText(delimiter);
|
128
133
|
}
|
129
134
|
}
|
130
135
|
});
|
@@ -145,11 +150,11 @@ public class CsvFormatterPlugin
|
|
145
150
|
};
|
146
151
|
}
|
147
152
|
|
148
|
-
private void writeHeader(Schema schema, LineEncoder encoder)
|
153
|
+
private void writeHeader(Schema schema, LineEncoder encoder, String delimiter)
|
149
154
|
{
|
150
155
|
for (Column column : schema.getColumns()) {
|
151
156
|
if (column.getIndex() != 0) {
|
152
|
-
encoder.addText(
|
157
|
+
encoder.addText(delimiter);
|
153
158
|
}
|
154
159
|
encoder.addText(column.getName());
|
155
160
|
}
|
@@ -1,13 +1,13 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import com.google.common.base.Preconditions;
|
4
|
-
import com.google.common.collect.ImmutableMap;
|
5
4
|
import com.google.common.base.Optional;
|
6
5
|
import com.google.common.collect.ImmutableSet;
|
7
6
|
import org.embulk.config.Task;
|
8
7
|
import org.embulk.config.Config;
|
9
8
|
import org.embulk.config.ConfigDefault;
|
10
9
|
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.config.ConfigException;
|
11
11
|
import org.embulk.config.TaskSource;
|
12
12
|
import org.embulk.spi.type.TimestampType;
|
13
13
|
import org.embulk.spi.time.TimestampParser;
|
@@ -25,8 +25,6 @@ import org.embulk.spi.BufferAllocator;
|
|
25
25
|
import org.embulk.spi.util.LineDecoder;
|
26
26
|
import org.slf4j.Logger;
|
27
27
|
|
28
|
-
import java.util.Map;
|
29
|
-
|
30
28
|
public class CsvParserPlugin
|
31
29
|
implements ParserPlugin
|
32
30
|
{
|
@@ -44,9 +42,14 @@ public class CsvParserPlugin
|
|
44
42
|
@Config("columns")
|
45
43
|
public SchemaConfig getSchemaConfig();
|
46
44
|
|
47
|
-
@Config("header_line")
|
48
|
-
@ConfigDefault("
|
49
|
-
public
|
45
|
+
@Config("header_line")
|
46
|
+
@ConfigDefault("null")
|
47
|
+
public Optional<Boolean> getHeaderLine();
|
48
|
+
|
49
|
+
@Config("skip_header_lines")
|
50
|
+
@ConfigDefault("0")
|
51
|
+
public int getSkipHeaderLines();
|
52
|
+
public void setSkipHeaderLines(int n);
|
50
53
|
|
51
54
|
@Config("delimiter")
|
52
55
|
@ConfigDefault("\",\"")
|
@@ -86,20 +89,33 @@ public class CsvParserPlugin
|
|
86
89
|
public void transaction(ConfigSource config, ParserPlugin.Control control)
|
87
90
|
{
|
88
91
|
PluginTask task = config.loadConfig(PluginTask.class);
|
92
|
+
|
93
|
+
// backward compatibility
|
94
|
+
if (task.getHeaderLine().isPresent()) {
|
95
|
+
if (task.getSkipHeaderLines() > 0) {
|
96
|
+
throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
|
97
|
+
}
|
98
|
+
if (task.getHeaderLine().get()) {
|
99
|
+
task.setSkipHeaderLines(1);
|
100
|
+
} else {
|
101
|
+
task.setSkipHeaderLines(0);
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
89
105
|
control.run(task.dump(), task.getSchemaConfig().toSchema());
|
90
106
|
}
|
91
107
|
|
92
|
-
private
|
108
|
+
private TimestampParser[] newTimestampParsers(
|
93
109
|
TimestampParser.ParserTask task, Schema schema)
|
94
110
|
{
|
95
|
-
|
111
|
+
TimestampParser[] parsers = new TimestampParser[schema.getColumnCount()];
|
96
112
|
for (Column column : schema.getColumns()) {
|
97
113
|
if (column.getType() instanceof TimestampType) {
|
98
114
|
TimestampType tt = (TimestampType) column.getType();
|
99
|
-
|
115
|
+
parsers[column.getIndex()] = new TimestampParser(tt.getFormat(), task);
|
100
116
|
}
|
101
117
|
}
|
102
|
-
return
|
118
|
+
return parsers;
|
103
119
|
}
|
104
120
|
|
105
121
|
@Override
|
@@ -107,19 +123,18 @@ public class CsvParserPlugin
|
|
107
123
|
FileInput input, PageOutput output)
|
108
124
|
{
|
109
125
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
110
|
-
final
|
111
|
-
|
126
|
+
final TimestampParser[] timestampFormatters = newTimestampParsers(task, schema);
|
127
|
+
LineDecoder lineDecoder = new LineDecoder(input, task);
|
128
|
+
final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
|
112
129
|
final String nullStringOrNull = task.getNullString().orNull();
|
113
|
-
|
130
|
+
int skipHeaderLines = task.getSkipHeaderLines();
|
114
131
|
|
115
132
|
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
116
133
|
while (tokenizer.nextFile()) {
|
117
|
-
|
118
|
-
|
119
|
-
if (
|
120
|
-
|
121
|
-
tokenizer.nextColumn(); // TODO check return value?
|
122
|
-
}
|
134
|
+
// skip the header lines for each file
|
135
|
+
for (; skipHeaderLines > 0; skipHeaderLines--) {
|
136
|
+
if (lineDecoder.poll() == null) {
|
137
|
+
break;
|
123
138
|
}
|
124
139
|
}
|
125
140
|
|
@@ -187,7 +202,7 @@ public class CsvParserPlugin
|
|
187
202
|
pageBuilder.setNull(column);
|
188
203
|
} else {
|
189
204
|
try {
|
190
|
-
pageBuilder.setTimestamp(column,
|
205
|
+
pageBuilder.setTimestamp(column, timestampFormatters[column.getIndex()].parse(v));
|
191
206
|
} catch (TimestampParseException e) {
|
192
207
|
// TODO support default value
|
193
208
|
throw new CsvRecordValidateException(e);
|
@@ -115,9 +115,14 @@ public class CsvTokenizer
|
|
115
115
|
}
|
116
116
|
}
|
117
117
|
|
118
|
+
public boolean hasNextColumn()
|
119
|
+
{
|
120
|
+
return recordState == RecordState.NOT_END;
|
121
|
+
}
|
122
|
+
|
118
123
|
public String nextColumn()
|
119
124
|
{
|
120
|
-
Preconditions.checkState(
|
125
|
+
Preconditions.checkState(hasNextColumn(), "doesn't have enough columns"); // TODO exception class
|
121
126
|
|
122
127
|
// reset last state
|
123
128
|
wasQuotedColumn = false;
|
@@ -64,7 +64,6 @@ public class StdoutOutputPlugin
|
|
64
64
|
while (reader.nextRecord()) {
|
65
65
|
System.out.println(printer.printRecord(reader, ","));
|
66
66
|
}
|
67
|
-
page.release();
|
68
67
|
}
|
69
68
|
|
70
69
|
public void finish()
|
@@ -72,7 +71,10 @@ public class StdoutOutputPlugin
|
|
72
71
|
System.out.flush();
|
73
72
|
}
|
74
73
|
|
75
|
-
public void close()
|
74
|
+
public void close()
|
75
|
+
{
|
76
|
+
reader.close();
|
77
|
+
}
|
76
78
|
|
77
79
|
public void abort() { }
|
78
80
|
|
@@ -31,7 +31,7 @@ public class TestCsvParserPlugin
|
|
31
31
|
CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
|
32
32
|
assertEquals(Charset.forName("utf-8"), task.getCharset());
|
33
33
|
assertEquals(Newline.CRLF, task.getNewline());
|
34
|
-
assertEquals(false, task.getHeaderLine());
|
34
|
+
assertEquals(false, task.getHeaderLine().or(false));
|
35
35
|
assertEquals(',', task.getDelimiterChar());
|
36
36
|
assertEquals('\"', task.getQuoteChar());
|
37
37
|
}
|
@@ -62,7 +62,7 @@ public class TestCsvParserPlugin
|
|
62
62
|
CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
|
63
63
|
assertEquals(Charset.forName("utf-16"), task.getCharset());
|
64
64
|
assertEquals(Newline.LF, task.getNewline());
|
65
|
-
assertEquals(true, task.getHeaderLine());
|
65
|
+
assertEquals(true, task.getHeaderLine().or(false));
|
66
66
|
assertEquals('\t', task.getDelimiterChar());
|
67
67
|
assertEquals('\\', task.getQuoteChar());
|
68
68
|
}
|
data/lib/embulk/file_output.rb
CHANGED
@@ -34,6 +34,7 @@ module Embulk
|
|
34
34
|
def flush
|
35
35
|
unless @buffer.empty?
|
36
36
|
@java_file_output.add(@buffer.to_java)
|
37
|
+
@buffer.clear
|
37
38
|
end
|
38
39
|
nil
|
39
40
|
end
|
@@ -44,7 +45,7 @@ module Embulk
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def close
|
47
|
-
@java_file_output.
|
48
|
+
@java_file_output.close
|
48
49
|
end
|
49
50
|
end
|
50
51
|
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -24,6 +24,9 @@ module Embulk
|
|
24
24
|
"\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
|
25
25
|
]
|
26
26
|
|
27
|
+
MAX_SKIP_LINES = 10
|
28
|
+
NO_SKIP_DETECT_LINES = 10
|
29
|
+
|
27
30
|
def guess_lines(config, sample_lines)
|
28
31
|
delim = guess_delimiter(sample_lines)
|
29
32
|
unless delim
|
@@ -32,7 +35,7 @@ module Embulk
|
|
32
35
|
end
|
33
36
|
|
34
37
|
parser_config = config["parser"] || {}
|
35
|
-
parser_guessed = {"type" => "csv", "delimiter" => delim}
|
38
|
+
parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})
|
36
39
|
|
37
40
|
quote = guess_quote(sample_lines, delim)
|
38
41
|
parser_guessed["quote"] = quote ? quote : ''
|
@@ -44,7 +47,10 @@ module Embulk
|
|
44
47
|
parser_guessed["null_string"] = null_string if null_string
|
45
48
|
# don't even set null_string to avoid confusion of null and 'null' in YAML format
|
46
49
|
|
47
|
-
sample_records = sample_lines
|
50
|
+
sample_records = split_lines(parser_guessed, sample_lines, delim)
|
51
|
+
skip_header_lines = guess_skip_header_lines(sample_records)
|
52
|
+
sample_records = sample_records[skip_header_lines..-1]
|
53
|
+
|
48
54
|
first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
|
49
55
|
other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
|
50
56
|
|
@@ -53,12 +59,16 @@ module Embulk
|
|
53
59
|
return {}
|
54
60
|
end
|
55
61
|
|
56
|
-
|
57
|
-
|
62
|
+
header_line = (first_types != other_types && !first_types.any? {|t| t != "string" })
|
63
|
+
|
64
|
+
if header_line
|
65
|
+
parser_guessed["skip_header_lines"] = skip_header_lines + 1
|
66
|
+
else
|
67
|
+
parser_guessed["skip_header_lines"] = skip_header_lines
|
58
68
|
end
|
59
69
|
|
60
70
|
unless parser_config.has_key?("columns")
|
61
|
-
if
|
71
|
+
if header_line
|
62
72
|
column_names = sample_records.first
|
63
73
|
else
|
64
74
|
column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
|
@@ -81,6 +91,32 @@ module Embulk
|
|
81
91
|
|
82
92
|
private
|
83
93
|
|
94
|
+
def split_lines(parser_config, sample_lines, delim)
|
95
|
+
parser_task = parser_config.merge({"columns" => []}).load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
|
96
|
+
data = sample_lines.map {|x| x.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
|
97
|
+
sample = Buffer.from_ruby_string(data)
|
98
|
+
decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
|
99
|
+
tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
|
100
|
+
rows = []
|
101
|
+
while tokenizer.nextFile
|
102
|
+
while tokenizer.nextRecord
|
103
|
+
columns = []
|
104
|
+
while true
|
105
|
+
begin
|
106
|
+
columns << tokenizer.nextColumn
|
107
|
+
rescue java.lang.IllegalStateException # TODO exception class
|
108
|
+
rows << columns
|
109
|
+
break
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
return rows
|
115
|
+
rescue
|
116
|
+
# TODO warning if fallback to this ad-hoc implementation
|
117
|
+
sample_lines.map {|line| line.split(delim) }
|
118
|
+
end
|
119
|
+
|
84
120
|
def guess_delimiter(sample_lines)
|
85
121
|
delim_weights = DELIMITER_CANDIDATES.map do |d|
|
86
122
|
counts = sample_lines.map {|line| line.count(d) }
|
@@ -154,6 +190,17 @@ module Embulk
|
|
154
190
|
return found ? found[0] : nil
|
155
191
|
end
|
156
192
|
|
193
|
+
def guess_skip_header_lines(sample_records)
|
194
|
+
counts = sample_records.map {|records| records.size }
|
195
|
+
(1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
|
196
|
+
check_row_count = counts[i-1]
|
197
|
+
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c == check_row_count }
|
198
|
+
return i - 1
|
199
|
+
end
|
200
|
+
end
|
201
|
+
return 0
|
202
|
+
end
|
203
|
+
|
157
204
|
def array_sum(array)
|
158
205
|
array.inject(0) {|r,i| r += i }
|
159
206
|
end
|
data/lib/embulk/guess_plugin.rb
CHANGED
@@ -38,8 +38,6 @@ module Embulk
|
|
38
38
|
return true
|
39
39
|
rescue LoadError => e
|
40
40
|
# catch LoadError but don't catch ClassNotFoundException
|
41
|
-
# TODO: the best code here is to raise exception only if
|
42
|
-
# `name` file is not in $LOAD_PATH.
|
43
41
|
raise e if e.to_s =~ /java.lang.ClassNotFoundException/
|
44
42
|
raise e if $LOAD_PATH.any? {|dir| File.exists? File.join(dir, "#{name}.rb") }
|
45
43
|
end
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -273,6 +273,7 @@ files:
|
|
273
273
|
- embulk-docs/src/release/release-0.4.9.rst
|
274
274
|
- embulk-docs/src/release/release-0.5.0.rst
|
275
275
|
- embulk-docs/src/release/release-0.5.1.rst
|
276
|
+
- embulk-docs/src/release/release-0.5.2.rst
|
276
277
|
- embulk-standards/build.gradle
|
277
278
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
278
279
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -377,8 +378,8 @@ files:
|
|
377
378
|
- classpath/bval-jsr303-0.5.jar
|
378
379
|
- classpath/commons-beanutils-core-1.8.3.jar
|
379
380
|
- classpath/commons-lang3-3.1.jar
|
380
|
-
- classpath/embulk-core-0.5.
|
381
|
-
- classpath/embulk-standards-0.5.
|
381
|
+
- classpath/embulk-core-0.5.2.jar
|
382
|
+
- classpath/embulk-standards-0.5.2.jar
|
382
383
|
- classpath/guava-18.0.jar
|
383
384
|
- classpath/guice-3.0.jar
|
384
385
|
- classpath/guice-multibindings-3.0.jar
|