embulk 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -5
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +16 -3
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +4 -3
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +3 -3
- data/embulk-docs/src/index.rst +6 -0
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +1 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.5.2.rst +30 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +9 -4
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +35 -20
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +6 -1
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +4 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +2 -2
- data/lib/embulk/command/embulk_run.rb +1 -1
- data/lib/embulk/file_output.rb +2 -1
- data/lib/embulk/guess/csv.rb +52 -5
- data/lib/embulk/guess_plugin.rb +1 -0
- data/lib/embulk/plugin_registry.rb +0 -2
- data/lib/embulk/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c731d85a93052e3fa0e40a49c048f365e150eace
|
4
|
+
data.tar.gz: 5266d8207396e64c995d8fd6e2b9e60cc228041c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 658da0f21342555a4d267bfa5bd7ee048dddd875fdc45dff82d22fb7f00e19d1f735afed15549adf94f135ccae3ed327636a86b37a5b6a252de0821f4049a5a9
|
7
|
+
data.tar.gz: 7888c38856e8e3816eb819bc1b12f33852634882c7e40b68496ad5700cf7c1a202174e097cfeb1589f6847752e768d40fdfc9ef5d5465b698708882488c9aeeb
|
data/README.md
CHANGED
@@ -25,23 +25,31 @@ The single-file package is the simplest way to try Embulk. You can download the
|
|
25
25
|
|
26
26
|
### Linux & Mac & BSD
|
27
27
|
|
28
|
+
Embulk is a Java application. Please make sure that you installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
|
29
|
+
|
28
30
|
Following 4 commands install embulk to your home directory:
|
29
31
|
|
30
32
|
```
|
31
|
-
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
33
|
+
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar
|
32
34
|
chmod +x ~/.embulk/bin/embulk
|
33
35
|
echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
|
34
36
|
source ~/.bashrc
|
35
37
|
```
|
36
38
|
|
39
|
+
Next step: [Trying examples](#trying-examples)
|
40
|
+
|
37
41
|
### Windows
|
38
42
|
|
43
|
+
Embulk is a Java application. Please make sure that you installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
|
44
|
+
|
39
45
|
You can assume the jar file is a .bat file.
|
40
46
|
|
41
47
|
```
|
42
|
-
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
48
|
+
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -OutFile embulk.bat}"
|
43
49
|
```
|
44
50
|
|
51
|
+
Next step: [Trying examples](#trying-examples)
|
52
|
+
|
45
53
|
### Trying examples
|
46
54
|
|
47
55
|
Let's load a CSV file, for example. `embulk example` subcommand generates a csv file and config file for you.
|
@@ -53,6 +61,8 @@ embulk preview config.yml
|
|
53
61
|
embulk run config.yml
|
54
62
|
```
|
55
63
|
|
64
|
+
Next step: [Using plugins](#using-plugins)
|
65
|
+
|
56
66
|
### Using plugins
|
57
67
|
|
58
68
|
You can use plugins to load data from/to various systems and file formats.
|
@@ -63,14 +73,14 @@ embulk gem install embulk-output-postgres-json
|
|
63
73
|
embulk gem list
|
64
74
|
```
|
65
75
|
|
66
|
-
You can
|
76
|
+
You can find plugins at the [list of plugins by category](http://www.embulk.org/plugins/).
|
67
77
|
|
68
78
|
### Using plugin bundle
|
69
79
|
|
70
|
-
`embulk bundle` subcommand creates (or updates if already exists) a
|
80
|
+
`embulk bundle` subcommand creates (or updates if already exists) a private (isolated) bundle of plugins.
|
71
81
|
You can use the bundle using `-b <bundle_dir>` option. `embulk bundle` also generates some example plugins to \<bundle_dir>/embulk/\*.rb directory.
|
72
82
|
|
73
|
-
See generated \<bundle_dir>/Gemfile file how to plugin bundles work.
|
83
|
+
See the generated \<bundle_dir>/Gemfile file for how plugin bundles work.
|
74
84
|
|
75
85
|
```
|
76
86
|
embulk bundle ./embulk_bundle
|
data/build.gradle
CHANGED
@@ -35,7 +35,7 @@ public class PooledBufferAllocator
|
|
35
35
|
extends Buffer
|
36
36
|
{
|
37
37
|
private ByteBuf buf;
|
38
|
-
private
|
38
|
+
private BufferReleasedBeforeAt doubleFreeCheck;
|
39
39
|
|
40
40
|
public NettyByteBufBuffer(ByteBuf buf)
|
41
41
|
{
|
@@ -46,13 +46,26 @@ public class PooledBufferAllocator
|
|
46
46
|
public void release()
|
47
47
|
{
|
48
48
|
if (doubleFreeCheck != null) {
|
49
|
-
doubleFreeCheck.printStackTrace();
|
49
|
+
new BufferDoubleReleasedException(doubleFreeCheck).printStackTrace();
|
50
50
|
}
|
51
51
|
if (buf != null) {
|
52
52
|
buf.release();
|
53
53
|
buf = null;
|
54
|
-
doubleFreeCheck = new
|
54
|
+
doubleFreeCheck = new BufferReleasedBeforeAt();
|
55
55
|
}
|
56
56
|
}
|
57
57
|
}
|
58
|
+
|
59
|
+
static class BufferReleasedBeforeAt
|
60
|
+
extends Throwable
|
61
|
+
{ }
|
62
|
+
|
63
|
+
static class BufferDoubleReleasedException
|
64
|
+
extends IllegalStateException
|
65
|
+
{
|
66
|
+
public BufferDoubleReleasedException(BufferReleasedBeforeAt releasedAt)
|
67
|
+
{
|
68
|
+
super("Detected double release() call of a buffer", releasedAt);
|
69
|
+
}
|
70
|
+
}
|
58
71
|
}
|
@@ -98,12 +98,13 @@ public class PreviewExecutor
|
|
98
98
|
{
|
99
99
|
InputPlugin input = newInputPlugin(task);
|
100
100
|
List<FilterPlugin> filterPlugins = newFilterPlugins(task);
|
101
|
-
Schema
|
101
|
+
Schema inputSchema = filterSchemas.get(0);
|
102
|
+
Schema outputSchema = filterSchemas.get(filterSchemas.size() - 1);
|
102
103
|
|
103
|
-
PageOutput out = new SamplingPageOutput(task.getSampleRows(),
|
104
|
+
PageOutput out = new SamplingPageOutput(task.getSampleRows(), outputSchema);
|
104
105
|
try {
|
105
106
|
out = Filters.open(filterPlugins, filterTasks, filterSchemas, out);
|
106
|
-
input.run(inputTask,
|
107
|
+
input.run(inputTask, inputSchema, 0, out);
|
107
108
|
} finally {
|
108
109
|
out.close();
|
109
110
|
}
|
@@ -46,17 +46,17 @@ public class SamplingParserPlugin
|
|
46
46
|
throw new SampledNoticeError(buffer);
|
47
47
|
}
|
48
48
|
|
49
|
-
public static Buffer runFileInputSampling(final FileInputRunner
|
49
|
+
public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig)
|
50
50
|
{
|
51
51
|
// override in.parser.type so that FileInputRunner creates GuessParserPlugin
|
52
52
|
ConfigSource samplingInputConfig = inputConfig.deepCopy();
|
53
53
|
samplingInputConfig.getNestedOrSetEmpty("parser").set("type", "system_sampling");
|
54
54
|
|
55
55
|
try {
|
56
|
-
|
56
|
+
runner.transaction(samplingInputConfig, new InputPlugin.Control() {
|
57
57
|
public List<CommitReport> run(TaskSource taskSource, Schema schema, int taskCount)
|
58
58
|
{
|
59
|
-
|
59
|
+
runner.run(taskSource, schema, 0, new PageOutput() {
|
60
60
|
@Override
|
61
61
|
public void add(Page page)
|
62
62
|
{
|
data/embulk-docs/src/index.rst
CHANGED
@@ -8,6 +8,12 @@ Embulk documentation
|
|
8
8
|
|
9
9
|
https://github.com/embulk/embulk
|
10
10
|
|
11
|
+
* `Quick Start <https://github.com/embulk/embulk#quick-start>`_
|
12
|
+
|
13
|
+
* `Linux and Mac OS X <https://github.com/embulk/embulk#linux--mac--bsd>`_
|
14
|
+
|
15
|
+
* `Windows <https://github.com/embulk/embulk#windows>`_
|
16
|
+
|
11
17
|
* `List of Plugins by Category <http://www.embulk.org/plugins/>`_
|
12
18
|
|
13
19
|
.. toctree::
|
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
|
|
56
56
|
|
57
57
|
.. code-block:: console
|
58
58
|
|
59
|
-
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
59
|
+
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -O /usr/local/bin/embulk
|
60
60
|
$ sudo chmod +x /usr/local/bin/embulk
|
61
61
|
|
62
62
|
Step 2. Install Elasticsearch plugin
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,30 @@
|
|
1
|
+
Release 0.5.2
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``parser-csv`` plugin supports ``skip_header_lines`` parameter to skip the first few lines.
|
8
|
+
|
9
|
+
* ``header_line`` parameter is now obsolete. Although the parameter still works for backward compatibility, setting both ``header_line`` and ``skip_header_lines`` is a configuration error.
|
10
|
+
|
11
|
+
* ``guess-csv`` plugin guesses first ignorable lines and sets ``skip_header_lines`` parameter automatically.
|
12
|
+
|
13
|
+
* ``guess-csv`` plugin guesses quoted column names correctly.
|
14
|
+
|
15
|
+
* ``formatter-csv`` plugin supports ``delimiter`` parameter (@hiroyuki-sato++).
|
16
|
+
|
17
|
+
* ``output-stdout`` fixed warning messages due to double-release of buffers.
|
18
|
+
|
19
|
+
|
20
|
+
General Changes
|
21
|
+
------------------
|
22
|
+
|
23
|
+
* Improved error message when double-release of a ``spi.Buffer`` is detected.
|
24
|
+
* Fixed ``preview`` when a filter plugin changes schema (@llibra++).
|
25
|
+
* Fixed infinite loop at ``Embulk::FileOutput#flush`` (@goronao++). It happened if a formatter plugin written in Ruby writes more than 32KB of data.
|
26
|
+
|
27
|
+
|
28
|
+
Release Date
|
29
|
+
------------------
|
30
|
+
2015-03-11
|
@@ -31,6 +31,10 @@ public class CsvFormatterPlugin
|
|
31
31
|
@Config("header_line")
|
32
32
|
@ConfigDefault("true")
|
33
33
|
public boolean getHeaderLine();
|
34
|
+
|
35
|
+
@Config("delimiter")
|
36
|
+
@ConfigDefault("\",\"")
|
37
|
+
public String getDelimiterChar();
|
34
38
|
}
|
35
39
|
|
36
40
|
@Override
|
@@ -62,13 +66,14 @@ public class CsvFormatterPlugin
|
|
62
66
|
final LineEncoder encoder = new LineEncoder(output, task);
|
63
67
|
final Map<Integer, TimestampFormatter> timestampFormatters =
|
64
68
|
newTimestampFormatters(task, schema);
|
69
|
+
final String delimiter = task.getDelimiterChar();
|
65
70
|
|
66
71
|
// create a file
|
67
72
|
encoder.nextFile();
|
68
73
|
|
69
74
|
// write header
|
70
75
|
if (task.getHeaderLine()) {
|
71
|
-
writeHeader(schema, encoder);
|
76
|
+
writeHeader(schema, encoder, delimiter);
|
72
77
|
}
|
73
78
|
|
74
79
|
return new PageOutput() {
|
@@ -124,7 +129,7 @@ public class CsvFormatterPlugin
|
|
124
129
|
private void addDelimiter(Column column)
|
125
130
|
{
|
126
131
|
if (column.getIndex() != 0) {
|
127
|
-
encoder.addText(
|
132
|
+
encoder.addText(delimiter);
|
128
133
|
}
|
129
134
|
}
|
130
135
|
});
|
@@ -145,11 +150,11 @@ public class CsvFormatterPlugin
|
|
145
150
|
};
|
146
151
|
}
|
147
152
|
|
148
|
-
private void writeHeader(Schema schema, LineEncoder encoder)
|
153
|
+
private void writeHeader(Schema schema, LineEncoder encoder, String delimiter)
|
149
154
|
{
|
150
155
|
for (Column column : schema.getColumns()) {
|
151
156
|
if (column.getIndex() != 0) {
|
152
|
-
encoder.addText(
|
157
|
+
encoder.addText(delimiter);
|
153
158
|
}
|
154
159
|
encoder.addText(column.getName());
|
155
160
|
}
|
@@ -1,13 +1,13 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import com.google.common.base.Preconditions;
|
4
|
-
import com.google.common.collect.ImmutableMap;
|
5
4
|
import com.google.common.base.Optional;
|
6
5
|
import com.google.common.collect.ImmutableSet;
|
7
6
|
import org.embulk.config.Task;
|
8
7
|
import org.embulk.config.Config;
|
9
8
|
import org.embulk.config.ConfigDefault;
|
10
9
|
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.config.ConfigException;
|
11
11
|
import org.embulk.config.TaskSource;
|
12
12
|
import org.embulk.spi.type.TimestampType;
|
13
13
|
import org.embulk.spi.time.TimestampParser;
|
@@ -25,8 +25,6 @@ import org.embulk.spi.BufferAllocator;
|
|
25
25
|
import org.embulk.spi.util.LineDecoder;
|
26
26
|
import org.slf4j.Logger;
|
27
27
|
|
28
|
-
import java.util.Map;
|
29
|
-
|
30
28
|
public class CsvParserPlugin
|
31
29
|
implements ParserPlugin
|
32
30
|
{
|
@@ -44,9 +42,14 @@ public class CsvParserPlugin
|
|
44
42
|
@Config("columns")
|
45
43
|
public SchemaConfig getSchemaConfig();
|
46
44
|
|
47
|
-
@Config("header_line")
|
48
|
-
@ConfigDefault("
|
49
|
-
public
|
45
|
+
@Config("header_line")
|
46
|
+
@ConfigDefault("null")
|
47
|
+
public Optional<Boolean> getHeaderLine();
|
48
|
+
|
49
|
+
@Config("skip_header_lines")
|
50
|
+
@ConfigDefault("0")
|
51
|
+
public int getSkipHeaderLines();
|
52
|
+
public void setSkipHeaderLines(int n);
|
50
53
|
|
51
54
|
@Config("delimiter")
|
52
55
|
@ConfigDefault("\",\"")
|
@@ -86,20 +89,33 @@ public class CsvParserPlugin
|
|
86
89
|
public void transaction(ConfigSource config, ParserPlugin.Control control)
|
87
90
|
{
|
88
91
|
PluginTask task = config.loadConfig(PluginTask.class);
|
92
|
+
|
93
|
+
// backward compatibility
|
94
|
+
if (task.getHeaderLine().isPresent()) {
|
95
|
+
if (task.getSkipHeaderLines() > 0) {
|
96
|
+
throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
|
97
|
+
}
|
98
|
+
if (task.getHeaderLine().get()) {
|
99
|
+
task.setSkipHeaderLines(1);
|
100
|
+
} else {
|
101
|
+
task.setSkipHeaderLines(0);
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
89
105
|
control.run(task.dump(), task.getSchemaConfig().toSchema());
|
90
106
|
}
|
91
107
|
|
92
|
-
private
|
108
|
+
private TimestampParser[] newTimestampParsers(
|
93
109
|
TimestampParser.ParserTask task, Schema schema)
|
94
110
|
{
|
95
|
-
|
111
|
+
TimestampParser[] parsers = new TimestampParser[schema.getColumnCount()];
|
96
112
|
for (Column column : schema.getColumns()) {
|
97
113
|
if (column.getType() instanceof TimestampType) {
|
98
114
|
TimestampType tt = (TimestampType) column.getType();
|
99
|
-
|
115
|
+
parsers[column.getIndex()] = new TimestampParser(tt.getFormat(), task);
|
100
116
|
}
|
101
117
|
}
|
102
|
-
return
|
118
|
+
return parsers;
|
103
119
|
}
|
104
120
|
|
105
121
|
@Override
|
@@ -107,19 +123,18 @@ public class CsvParserPlugin
|
|
107
123
|
FileInput input, PageOutput output)
|
108
124
|
{
|
109
125
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
110
|
-
final
|
111
|
-
|
126
|
+
final TimestampParser[] timestampFormatters = newTimestampParsers(task, schema);
|
127
|
+
LineDecoder lineDecoder = new LineDecoder(input, task);
|
128
|
+
final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
|
112
129
|
final String nullStringOrNull = task.getNullString().orNull();
|
113
|
-
|
130
|
+
int skipHeaderLines = task.getSkipHeaderLines();
|
114
131
|
|
115
132
|
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
116
133
|
while (tokenizer.nextFile()) {
|
117
|
-
|
118
|
-
|
119
|
-
if (
|
120
|
-
|
121
|
-
tokenizer.nextColumn(); // TODO check return value?
|
122
|
-
}
|
134
|
+
// skip the header lines for each file
|
135
|
+
for (; skipHeaderLines > 0; skipHeaderLines--) {
|
136
|
+
if (lineDecoder.poll() == null) {
|
137
|
+
break;
|
123
138
|
}
|
124
139
|
}
|
125
140
|
|
@@ -187,7 +202,7 @@ public class CsvParserPlugin
|
|
187
202
|
pageBuilder.setNull(column);
|
188
203
|
} else {
|
189
204
|
try {
|
190
|
-
pageBuilder.setTimestamp(column,
|
205
|
+
pageBuilder.setTimestamp(column, timestampFormatters[column.getIndex()].parse(v));
|
191
206
|
} catch (TimestampParseException e) {
|
192
207
|
// TODO support default value
|
193
208
|
throw new CsvRecordValidateException(e);
|
@@ -115,9 +115,14 @@ public class CsvTokenizer
|
|
115
115
|
}
|
116
116
|
}
|
117
117
|
|
118
|
+
public boolean hasNextColumn()
|
119
|
+
{
|
120
|
+
return recordState == RecordState.NOT_END;
|
121
|
+
}
|
122
|
+
|
118
123
|
public String nextColumn()
|
119
124
|
{
|
120
|
-
Preconditions.checkState(
|
125
|
+
Preconditions.checkState(hasNextColumn(), "doesn't have enough columns"); // TODO exception class
|
121
126
|
|
122
127
|
// reset last state
|
123
128
|
wasQuotedColumn = false;
|
@@ -64,7 +64,6 @@ public class StdoutOutputPlugin
|
|
64
64
|
while (reader.nextRecord()) {
|
65
65
|
System.out.println(printer.printRecord(reader, ","));
|
66
66
|
}
|
67
|
-
page.release();
|
68
67
|
}
|
69
68
|
|
70
69
|
public void finish()
|
@@ -72,7 +71,10 @@ public class StdoutOutputPlugin
|
|
72
71
|
System.out.flush();
|
73
72
|
}
|
74
73
|
|
75
|
-
public void close()
|
74
|
+
public void close()
|
75
|
+
{
|
76
|
+
reader.close();
|
77
|
+
}
|
76
78
|
|
77
79
|
public void abort() { }
|
78
80
|
|
@@ -31,7 +31,7 @@ public class TestCsvParserPlugin
|
|
31
31
|
CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
|
32
32
|
assertEquals(Charset.forName("utf-8"), task.getCharset());
|
33
33
|
assertEquals(Newline.CRLF, task.getNewline());
|
34
|
-
assertEquals(false, task.getHeaderLine());
|
34
|
+
assertEquals(false, task.getHeaderLine().or(false));
|
35
35
|
assertEquals(',', task.getDelimiterChar());
|
36
36
|
assertEquals('\"', task.getQuoteChar());
|
37
37
|
}
|
@@ -62,7 +62,7 @@ public class TestCsvParserPlugin
|
|
62
62
|
CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
|
63
63
|
assertEquals(Charset.forName("utf-16"), task.getCharset());
|
64
64
|
assertEquals(Newline.LF, task.getNewline());
|
65
|
-
assertEquals(true, task.getHeaderLine());
|
65
|
+
assertEquals(true, task.getHeaderLine().or(false));
|
66
66
|
assertEquals('\t', task.getDelimiterChar());
|
67
67
|
assertEquals('\\', task.getQuoteChar());
|
68
68
|
}
|
data/lib/embulk/file_output.rb
CHANGED
@@ -34,6 +34,7 @@ module Embulk
|
|
34
34
|
def flush
|
35
35
|
unless @buffer.empty?
|
36
36
|
@java_file_output.add(@buffer.to_java)
|
37
|
+
@buffer.clear
|
37
38
|
end
|
38
39
|
nil
|
39
40
|
end
|
@@ -44,7 +45,7 @@ module Embulk
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def close
|
47
|
-
@java_file_output.
|
48
|
+
@java_file_output.close
|
48
49
|
end
|
49
50
|
end
|
50
51
|
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -24,6 +24,9 @@ module Embulk
|
|
24
24
|
"\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
|
25
25
|
]
|
26
26
|
|
27
|
+
MAX_SKIP_LINES = 10
|
28
|
+
NO_SKIP_DETECT_LINES = 10
|
29
|
+
|
27
30
|
def guess_lines(config, sample_lines)
|
28
31
|
delim = guess_delimiter(sample_lines)
|
29
32
|
unless delim
|
@@ -32,7 +35,7 @@ module Embulk
|
|
32
35
|
end
|
33
36
|
|
34
37
|
parser_config = config["parser"] || {}
|
35
|
-
parser_guessed = {"type" => "csv", "delimiter" => delim}
|
38
|
+
parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})
|
36
39
|
|
37
40
|
quote = guess_quote(sample_lines, delim)
|
38
41
|
parser_guessed["quote"] = quote ? quote : ''
|
@@ -44,7 +47,10 @@ module Embulk
|
|
44
47
|
parser_guessed["null_string"] = null_string if null_string
|
45
48
|
# don't even set null_string to avoid confusion of null and 'null' in YAML format
|
46
49
|
|
47
|
-
sample_records = sample_lines
|
50
|
+
sample_records = split_lines(parser_guessed, sample_lines, delim)
|
51
|
+
skip_header_lines = guess_skip_header_lines(sample_records)
|
52
|
+
sample_records = sample_records[skip_header_lines..-1]
|
53
|
+
|
48
54
|
first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
|
49
55
|
other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
|
50
56
|
|
@@ -53,12 +59,16 @@ module Embulk
|
|
53
59
|
return {}
|
54
60
|
end
|
55
61
|
|
56
|
-
|
57
|
-
|
62
|
+
header_line = (first_types != other_types && !first_types.any? {|t| t != "string" })
|
63
|
+
|
64
|
+
if header_line
|
65
|
+
parser_guessed["skip_header_lines"] = skip_header_lines + 1
|
66
|
+
else
|
67
|
+
parser_guessed["skip_header_lines"] = skip_header_lines
|
58
68
|
end
|
59
69
|
|
60
70
|
unless parser_config.has_key?("columns")
|
61
|
-
if
|
71
|
+
if header_line
|
62
72
|
column_names = sample_records.first
|
63
73
|
else
|
64
74
|
column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
|
@@ -81,6 +91,32 @@ module Embulk
|
|
81
91
|
|
82
92
|
private
|
83
93
|
|
94
|
+
def split_lines(parser_config, sample_lines, delim)
|
95
|
+
parser_task = parser_config.merge({"columns" => []}).load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
|
96
|
+
data = sample_lines.map {|x| x.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
|
97
|
+
sample = Buffer.from_ruby_string(data)
|
98
|
+
decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
|
99
|
+
tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
|
100
|
+
rows = []
|
101
|
+
while tokenizer.nextFile
|
102
|
+
while tokenizer.nextRecord
|
103
|
+
columns = []
|
104
|
+
while true
|
105
|
+
begin
|
106
|
+
columns << tokenizer.nextColumn
|
107
|
+
rescue java.lang.IllegalStateException # TODO exception class
|
108
|
+
rows << columns
|
109
|
+
break
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
return rows
|
115
|
+
rescue
|
116
|
+
# TODO warning if fallback to this ad-hoc implementation
|
117
|
+
sample_lines.map {|line| line.split(delim) }
|
118
|
+
end
|
119
|
+
|
84
120
|
def guess_delimiter(sample_lines)
|
85
121
|
delim_weights = DELIMITER_CANDIDATES.map do |d|
|
86
122
|
counts = sample_lines.map {|line| line.count(d) }
|
@@ -154,6 +190,17 @@ module Embulk
|
|
154
190
|
return found ? found[0] : nil
|
155
191
|
end
|
156
192
|
|
193
|
+
def guess_skip_header_lines(sample_records)
|
194
|
+
counts = sample_records.map {|records| records.size }
|
195
|
+
(1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
|
196
|
+
check_row_count = counts[i-1]
|
197
|
+
if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c == check_row_count }
|
198
|
+
return i - 1
|
199
|
+
end
|
200
|
+
end
|
201
|
+
return 0
|
202
|
+
end
|
203
|
+
|
157
204
|
def array_sum(array)
|
158
205
|
array.inject(0) {|r,i| r += i }
|
159
206
|
end
|
data/lib/embulk/guess_plugin.rb
CHANGED
@@ -38,8 +38,6 @@ module Embulk
|
|
38
38
|
return true
|
39
39
|
rescue LoadError => e
|
40
40
|
# catch LoadError but don't catch ClassNotFoundException
|
41
|
-
# TODO: the best code here is to raise exception only if
|
42
|
-
# `name` file is not in $LOAD_PATH.
|
43
41
|
raise e if e.to_s =~ /java.lang.ClassNotFoundException/
|
44
42
|
raise e if $LOAD_PATH.any? {|dir| File.exists? File.join(dir, "#{name}.rb") }
|
45
43
|
end
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -273,6 +273,7 @@ files:
|
|
273
273
|
- embulk-docs/src/release/release-0.4.9.rst
|
274
274
|
- embulk-docs/src/release/release-0.5.0.rst
|
275
275
|
- embulk-docs/src/release/release-0.5.1.rst
|
276
|
+
- embulk-docs/src/release/release-0.5.2.rst
|
276
277
|
- embulk-standards/build.gradle
|
277
278
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
278
279
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -377,8 +378,8 @@ files:
|
|
377
378
|
- classpath/bval-jsr303-0.5.jar
|
378
379
|
- classpath/commons-beanutils-core-1.8.3.jar
|
379
380
|
- classpath/commons-lang3-3.1.jar
|
380
|
-
- classpath/embulk-core-0.5.
|
381
|
-
- classpath/embulk-standards-0.5.
|
381
|
+
- classpath/embulk-core-0.5.2.jar
|
382
|
+
- classpath/embulk-standards-0.5.2.jar
|
382
383
|
- classpath/guava-18.0.jar
|
383
384
|
- classpath/guice-3.0.jar
|
384
385
|
- classpath/guice-multibindings-3.0.jar
|