embulk 0.6.20 → 0.6.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +2 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +6 -0
- data/embulk-docs/src/built-in.rst +30 -4
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.6.20.rst +1 -1
- data/embulk-docs/src/release/release-0.6.21.rst +20 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +105 -2
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +8 -4
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +56 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +4 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +3 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +88 -0
- data/lib/embulk/guess/csv.rb +36 -10
- data/lib/embulk/version.rb +1 -1
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90d6280d5bdcffb92922bfd929eae4bcc6d83b92
|
4
|
+
data.tar.gz: 0e2c0d5e8a9f990cdfbb7006be196f9bfef19030
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2ccbee9c830dd29e86734c1615354083fca35068476a30d0a06944926c3eb63b0386ab5db39d0ad7ed660a2c395da84e68d8433ee222274102dc01b30312de6
|
7
|
+
data.tar.gz: 27174e4750c66516ead7b0ff90df6dc804e8d45694354cd03602654680b30dd0464fef0221aa18a9c28e6495ad6709e44e0c4469b12e55f3ca81d8b1bdb06fdc
|
data/build.gradle
CHANGED
@@ -23,7 +23,7 @@ Embulk uses a YAML file to define a bulk data loading. Here is an example of the
|
|
23
23
|
type: csv
|
24
24
|
delimiter: ','
|
25
25
|
quote: '"'
|
26
|
-
escape: ''
|
26
|
+
escape: '"'
|
27
27
|
null_string: 'NULL'
|
28
28
|
skip_header_lines: 1
|
29
29
|
columns:
|
@@ -133,9 +133,9 @@ Options
|
|
133
133
|
+============================+==========+================================================================================================================+========================+
|
134
134
|
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` or any single-byte character | ``,`` by default |
|
135
135
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
136
|
-
| quote | string | The character surrounding a quoted value
|
136
|
+
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``\"`` by default |
|
137
137
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
138
|
-
| escape | string | Escape character to escape a special character
|
138
|
+
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
|
139
139
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
140
140
|
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
|
141
141
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
@@ -203,7 +203,7 @@ Example
|
|
203
203
|
newline: CRLF
|
204
204
|
delimiter: "\t"
|
205
205
|
quote: '"'
|
206
|
-
escape: ''
|
206
|
+
escape: '"'
|
207
207
|
null_string: 'NULL'
|
208
208
|
skip_header_lines: 1
|
209
209
|
comment_line_marker: '#'
|
@@ -383,3 +383,29 @@ Example
|
|
383
383
|
- type: gzip
|
384
384
|
level: 1
|
385
385
|
|
386
|
+
Rename filter plugin
|
387
|
+
--------------------
|
388
|
+
|
389
|
+
The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
|
390
|
+
|
391
|
+
Options
|
392
|
+
~~~~~~~~~~~~~~~~~~
|
393
|
+
|
394
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
395
|
+
| name | type | description | required? |
|
396
|
+
+=========+==========+======================================================================+====================+
|
397
|
+
| columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
|
398
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
399
|
+
|
400
|
+
Example
|
401
|
+
~~~~~~~~~~~~~~~~~~
|
402
|
+
|
403
|
+
.. code-block:: yaml
|
404
|
+
|
405
|
+
filters:
|
406
|
+
...
|
407
|
+
- type: rename
|
408
|
+
columns:
|
409
|
+
my_existing_column1: new_column1
|
410
|
+
my_existing_column2: new_column2
|
411
|
+
|
data/embulk-docs/src/release.rst
CHANGED
@@ -11,7 +11,7 @@ General Changes
|
|
11
11
|
------------------
|
12
12
|
|
13
13
|
* Change default size of page buffer from 8KB to 32KB.
|
14
|
-
* Size of a page buffer is configurable by system config (@sonots++).
|
14
|
+
* Size of a page buffer is configurable by system config (@sonots++). On the command line, the ``embulk`` command accepts a ``-X page_size=N[unit]`` argument (e.g. ``-X page_size=512KB``).
|
15
15
|
|
16
16
|
|
17
17
|
Release Date
|
@@ -0,0 +1,20 @@
|
|
1
|
+
Release 0.6.21
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added the ``filter-rename`` plugin, which renames columns. This plugin has no impact on performance.
|
8
|
+
* The ``parser-csv`` plugin accepts ``null`` for the ``quote`` and ``escape`` options to disable quoting or escaping. This is useful if a file includes ``"`` in a non-quoted value.
|
9
|
+
* The ``parser-csv`` plugin shows a warning if an empty string is set for the ``quote`` or ``escape`` option. The behavior remains backward-compatible for now, but an empty string will be rejected in the future.
|
10
|
+
|
11
|
+
|
12
|
+
Java Plugin API
|
13
|
+
------------------
|
14
|
+
|
15
|
+
* Added ``config.DataSource.has`` method to check whether it contains a key or not.
|
16
|
+
|
17
|
+
|
18
|
+
Release Date
|
19
|
+
------------------
|
20
|
+
2015-08-05
|
@@ -2,6 +2,9 @@ package org.embulk.standards;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableSet;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
6
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
7
|
+
import com.fasterxml.jackson.annotation.JsonValue;
|
5
8
|
import org.embulk.config.Task;
|
6
9
|
import org.embulk.config.Config;
|
7
10
|
import org.embulk.config.ConfigDefault;
|
@@ -57,11 +60,11 @@ public class CsvParserPlugin
|
|
57
60
|
|
58
61
|
@Config("quote")
|
59
62
|
@ConfigDefault("\"\\\"\"")
|
60
|
-
public
|
63
|
+
public Optional<QuoteCharacter> getQuoteChar();
|
61
64
|
|
62
65
|
@Config("escape")
|
63
66
|
@ConfigDefault("\"\\\\\"")
|
64
|
-
public
|
67
|
+
public Optional<EscapeCharacter> getEscapeChar();
|
65
68
|
|
66
69
|
// Null value handling: if the CsvParser found 'non-quoted empty string's,
|
67
70
|
// it replaces them to string that users specified like "\N", "NULL".
|
@@ -90,6 +93,106 @@ public class CsvParserPlugin
|
|
90
93
|
public boolean getAllowExtraColumns();
|
91
94
|
}
|
92
95
|
|
96
|
+
public static class QuoteCharacter
|
97
|
+
{
|
98
|
+
private final char character;
|
99
|
+
|
100
|
+
public QuoteCharacter(char character)
|
101
|
+
{
|
102
|
+
this.character = character;
|
103
|
+
}
|
104
|
+
|
105
|
+
public static QuoteCharacter noQuote()
|
106
|
+
{
|
107
|
+
return new QuoteCharacter(CsvTokenizer.NO_QUOTE);
|
108
|
+
}
|
109
|
+
|
110
|
+
@JsonCreator
|
111
|
+
public static QuoteCharacter ofString(String str)
|
112
|
+
{
|
113
|
+
if (str.length() >= 2) {
|
114
|
+
throw new ConfigException("\"quote\" option accepts only 1 character.");
|
115
|
+
} else if (str.isEmpty()) {
|
116
|
+
Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly.");
|
117
|
+
return new QuoteCharacter('"');
|
118
|
+
} else {
|
119
|
+
return new QuoteCharacter(str.charAt(0));
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
@JsonIgnore
|
124
|
+
public char getCharacter()
|
125
|
+
{
|
126
|
+
return character;
|
127
|
+
}
|
128
|
+
|
129
|
+
@JsonValue
|
130
|
+
public String getOptionalString()
|
131
|
+
{
|
132
|
+
return new String(new char[] { character });
|
133
|
+
}
|
134
|
+
|
135
|
+
@Override
|
136
|
+
public boolean equals(Object obj)
|
137
|
+
{
|
138
|
+
if (!(obj instanceof QuoteCharacter)) {
|
139
|
+
return false;
|
140
|
+
}
|
141
|
+
QuoteCharacter o = (QuoteCharacter) obj;
|
142
|
+
return character == o.character;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
public static class EscapeCharacter
|
147
|
+
{
|
148
|
+
private final char character;
|
149
|
+
|
150
|
+
public EscapeCharacter(char character)
|
151
|
+
{
|
152
|
+
this.character = character;
|
153
|
+
}
|
154
|
+
|
155
|
+
public static EscapeCharacter noEscape()
|
156
|
+
{
|
157
|
+
return new EscapeCharacter(CsvTokenizer.NO_ESCAPE);
|
158
|
+
}
|
159
|
+
|
160
|
+
@JsonCreator
|
161
|
+
public static EscapeCharacter ofString(String str)
|
162
|
+
{
|
163
|
+
if (str.length() >= 2) {
|
164
|
+
throw new ConfigException("\"escape\" option accepts only 1 character.");
|
165
|
+
} else if (str.isEmpty()) {
|
166
|
+
Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly.");
|
167
|
+
return noEscape();
|
168
|
+
} else {
|
169
|
+
return new EscapeCharacter(str.charAt(0));
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
@JsonIgnore
|
174
|
+
public char getCharacter()
|
175
|
+
{
|
176
|
+
return character;
|
177
|
+
}
|
178
|
+
|
179
|
+
@JsonValue
|
180
|
+
public String getOptionalString()
|
181
|
+
{
|
182
|
+
return new String(new char[] { character });
|
183
|
+
}
|
184
|
+
|
185
|
+
@Override
|
186
|
+
public boolean equals(Object obj)
|
187
|
+
{
|
188
|
+
if (!(obj instanceof EscapeCharacter)) {
|
189
|
+
return false;
|
190
|
+
}
|
191
|
+
EscapeCharacter o = (EscapeCharacter) obj;
|
192
|
+
return character == o.character;
|
193
|
+
}
|
194
|
+
}
|
195
|
+
|
93
196
|
private final Logger log;
|
94
197
|
|
95
198
|
public CsvParserPlugin()
|
@@ -5,7 +5,9 @@ import java.util.List;
|
|
5
5
|
import java.util.ArrayList;
|
6
6
|
import java.util.Deque;
|
7
7
|
import java.util.ArrayDeque;
|
8
|
+
import org.embulk.config.ConfigException;
|
8
9
|
import org.embulk.spi.util.LineDecoder;
|
10
|
+
import org.embulk.spi.Exec;
|
9
11
|
|
10
12
|
public class CsvTokenizer
|
11
13
|
{
|
@@ -20,6 +22,8 @@ public class CsvTokenizer
|
|
20
22
|
}
|
21
23
|
|
22
24
|
private static final char END_OF_LINE = '\0';
|
25
|
+
static final char NO_QUOTE = '\0';
|
26
|
+
static final char NO_ESCAPE = '\0';
|
23
27
|
|
24
28
|
private final char delimiter;
|
25
29
|
private final char quote;
|
@@ -42,8 +46,8 @@ public class CsvTokenizer
|
|
42
46
|
public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
|
43
47
|
{
|
44
48
|
delimiter = task.getDelimiterChar();
|
45
|
-
quote = task.getQuoteChar()
|
46
|
-
escape = task.getEscapeChar();
|
49
|
+
quote = task.getQuoteChar().or(CsvParserPlugin.QuoteCharacter.noQuote()).getCharacter();
|
50
|
+
escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
|
47
51
|
newline = task.getNewline().getString();
|
48
52
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
49
53
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
@@ -354,12 +358,12 @@ public class CsvTokenizer
|
|
354
358
|
|
355
359
|
private boolean isQuote(char c)
|
356
360
|
{
|
357
|
-
return c == quote;
|
361
|
+
return quote != NO_QUOTE && c == quote;
|
358
362
|
}
|
359
363
|
|
360
364
|
private boolean isEscape(char c)
|
361
365
|
{
|
362
|
-
return c == escape;
|
366
|
+
return escape != NO_ESCAPE && c == escape;
|
363
367
|
}
|
364
368
|
|
365
369
|
public static class InvalidFormatException
|
@@ -0,0 +1,56 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import org.embulk.config.Config;
|
4
|
+
import org.embulk.config.ConfigDefault;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.config.Task;
|
7
|
+
import org.embulk.config.TaskSource;
|
8
|
+
import org.embulk.spi.Column;
|
9
|
+
import org.embulk.spi.FilterPlugin;
|
10
|
+
import org.embulk.spi.PageOutput;
|
11
|
+
import org.embulk.spi.Schema;
|
12
|
+
|
13
|
+
import java.util.Map;
|
14
|
+
|
15
|
+
public class RenameFilterPlugin
|
16
|
+
implements FilterPlugin
|
17
|
+
{
|
18
|
+
public interface PluginTask
|
19
|
+
extends Task
|
20
|
+
{
|
21
|
+
@Config("columns")
|
22
|
+
@ConfigDefault("{}")
|
23
|
+
Map<String, String> getRenameMap();
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
public void transaction(ConfigSource config, Schema inputSchema,
|
28
|
+
FilterPlugin.Control control)
|
29
|
+
{
|
30
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
31
|
+
Map<String, String> renameMap = task.getRenameMap();
|
32
|
+
|
33
|
+
// check column_options is valid or not
|
34
|
+
for (String columnName : renameMap.keySet()) {
|
35
|
+
inputSchema.lookupColumn(columnName); // throws SchemaConfigException
|
36
|
+
}
|
37
|
+
|
38
|
+
Schema.Builder builder = Schema.builder();
|
39
|
+
for (Column column : inputSchema.getColumns()) {
|
40
|
+
String name = column.getName();
|
41
|
+
if (renameMap.containsKey(name)) {
|
42
|
+
name = renameMap.get(name);
|
43
|
+
}
|
44
|
+
builder.add(name, column.getType());
|
45
|
+
}
|
46
|
+
|
47
|
+
control.run(task.dump(), builder.build());
|
48
|
+
}
|
49
|
+
|
50
|
+
@Override
|
51
|
+
public PageOutput open(TaskSource taskSource, Schema inputSchema,
|
52
|
+
Schema outputSchema, PageOutput output)
|
53
|
+
{
|
54
|
+
return output;
|
55
|
+
}
|
56
|
+
}
|
@@ -5,6 +5,7 @@ import com.google.inject.Binder;
|
|
5
5
|
import com.google.inject.Module;
|
6
6
|
import com.google.inject.name.Names;
|
7
7
|
import com.google.inject.multibindings.Multibinder;
|
8
|
+
import org.embulk.spi.FilterPlugin;
|
8
9
|
import org.embulk.spi.FormatterPlugin;
|
9
10
|
import org.embulk.spi.InputPlugin;
|
10
11
|
import org.embulk.spi.OutputPlugin;
|
@@ -44,6 +45,9 @@ public class StandardPluginModule
|
|
44
45
|
// file encoder plugins
|
45
46
|
registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
|
46
47
|
|
48
|
+
// filter plugins
|
49
|
+
registerPluginTo(binder, FilterPlugin.class, "rename", RenameFilterPlugin.class);
|
50
|
+
|
47
51
|
// default guess plugins
|
48
52
|
registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
|
49
53
|
registerDefaultGuessPluginTo(binder, new PluginType("csv"));
|
@@ -4,6 +4,7 @@ import org.junit.Rule;
|
|
4
4
|
import org.junit.Test;
|
5
5
|
import static org.junit.Assert.assertEquals;
|
6
6
|
import java.nio.charset.Charset;
|
7
|
+
import com.google.common.base.Optional;
|
7
8
|
import com.google.common.collect.ImmutableList;
|
8
9
|
import com.google.common.collect.ImmutableMap;
|
9
10
|
import org.joda.time.DateTimeZone;
|
@@ -33,7 +34,7 @@ public class TestCsvParserPlugin
|
|
33
34
|
assertEquals(Newline.CRLF, task.getNewline());
|
34
35
|
assertEquals(false, task.getHeaderLine().or(false));
|
35
36
|
assertEquals(',', task.getDelimiterChar());
|
36
|
-
assertEquals('\"', task.getQuoteChar());
|
37
|
+
assertEquals(Optional.of(new CsvParserPlugin.QuoteCharacter('\"')), task.getQuoteChar());
|
37
38
|
assertEquals(false, task.getAllowOptionalColumns());
|
38
39
|
assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
|
39
40
|
assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
|
@@ -68,7 +69,7 @@ public class TestCsvParserPlugin
|
|
68
69
|
assertEquals(Newline.LF, task.getNewline());
|
69
70
|
assertEquals(true, task.getHeaderLine().or(false));
|
70
71
|
assertEquals('\t', task.getDelimiterChar());
|
71
|
-
assertEquals('\\', task.getQuoteChar());
|
72
|
+
assertEquals(Optional.of(new CsvParserPlugin.QuoteCharacter('\\')), task.getQuoteChar());
|
72
73
|
assertEquals(true, task.getAllowOptionalColumns());
|
73
74
|
}
|
74
75
|
}
|
@@ -0,0 +1,88 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableMap;
|
4
|
+
import org.embulk.EmbulkTestRuntime;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.config.TaskSource;
|
7
|
+
import org.embulk.spi.Column;
|
8
|
+
import org.embulk.spi.FilterPlugin;
|
9
|
+
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.Schema;
|
11
|
+
import org.embulk.spi.SchemaConfigException;
|
12
|
+
import org.embulk.standards.RenameFilterPlugin.PluginTask;
|
13
|
+
import org.junit.Before;
|
14
|
+
import org.junit.Rule;
|
15
|
+
import org.junit.Test;
|
16
|
+
|
17
|
+
import static org.embulk.spi.type.Types.STRING;
|
18
|
+
import static org.embulk.spi.type.Types.TIMESTAMP;
|
19
|
+
import static org.junit.Assert.assertEquals;
|
20
|
+
import static org.junit.Assert.assertTrue;
|
21
|
+
import static org.junit.Assert.fail;
|
22
|
+
|
23
|
+
public class TestRenameFilterPlugin
|
24
|
+
{
|
25
|
+
@Rule
|
26
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
27
|
+
|
28
|
+
private final Schema SCHEMA = Schema.builder()
|
29
|
+
.add("_c0", STRING)
|
30
|
+
.add("_c1", TIMESTAMP)
|
31
|
+
.build();
|
32
|
+
|
33
|
+
private RenameFilterPlugin filter;
|
34
|
+
|
35
|
+
@Before
|
36
|
+
public void createFilter()
|
37
|
+
{
|
38
|
+
filter = new RenameFilterPlugin();
|
39
|
+
}
|
40
|
+
|
41
|
+
@Test
|
42
|
+
public void checkDefaultValues()
|
43
|
+
{
|
44
|
+
PluginTask task = Exec.newConfigSource().loadConfig(PluginTask.class);
|
45
|
+
assertTrue(task.getRenameMap().isEmpty());
|
46
|
+
}
|
47
|
+
|
48
|
+
@Test
|
49
|
+
public void throwSchemaConfigExceptionIfColumnNotFound()
|
50
|
+
{
|
51
|
+
ConfigSource pluginConfig = Exec.newConfigSource()
|
52
|
+
.set("columns", ImmutableMap.of("not_found", "any_name"));
|
53
|
+
|
54
|
+
try {
|
55
|
+
filter.transaction(pluginConfig, SCHEMA, new FilterPlugin.Control() {
|
56
|
+
public void run(TaskSource task, Schema schema) { }
|
57
|
+
});
|
58
|
+
fail();
|
59
|
+
} catch (Throwable t) {
|
60
|
+
assertTrue(t instanceof SchemaConfigException);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
@Test
|
65
|
+
public void checkRenaming()
|
66
|
+
{
|
67
|
+
ConfigSource pluginConfig = Exec.newConfigSource()
|
68
|
+
.set("columns", ImmutableMap.of("_c0", "_c0_new"));
|
69
|
+
|
70
|
+
filter.transaction(pluginConfig, SCHEMA, new FilterPlugin.Control() {
|
71
|
+
@Override
|
72
|
+
public void run(TaskSource task, Schema newSchema)
|
73
|
+
{
|
74
|
+
// _c0 -> _c0_new
|
75
|
+
Column old0 = SCHEMA.getColumn(0);
|
76
|
+
Column new0 = newSchema.getColumn(0);
|
77
|
+
assertEquals("_c0_new", new0.getName());
|
78
|
+
assertEquals(old0.getType(), new0.getType());
|
79
|
+
|
80
|
+
// _c1 is not changed
|
81
|
+
Column old1 = SCHEMA.getColumn(1);
|
82
|
+
Column new1 = newSchema.getColumn(1);
|
83
|
+
assertEquals("_c1", new1.getName());
|
84
|
+
assertEquals(old1.getType(), new1.getType());
|
85
|
+
}
|
86
|
+
});
|
87
|
+
}
|
88
|
+
}
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -14,7 +14,7 @@ module Embulk
|
|
14
14
|
]
|
15
15
|
|
16
16
|
ESCAPE_CANDIDATES = [
|
17
|
-
"\\"
|
17
|
+
"\\", '"'
|
18
18
|
]
|
19
19
|
|
20
20
|
NULL_STRING_CANDIDATES = [
|
@@ -50,12 +50,33 @@ module Embulk
|
|
50
50
|
|
51
51
|
unless parser_guessed.has_key?("quote")
|
52
52
|
quote = guess_quote(sample_lines, delim)
|
53
|
-
|
53
|
+
unless quote
|
54
|
+
if !guess_force_no_quote(sample_lines, delim, '"')
|
55
|
+
# assuming CSV follows RFC for quoting
|
56
|
+
quote = '"'
|
57
|
+
else
|
58
|
+
# disable quoting (set null)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
parser_guessed["quote"] = quote
|
54
62
|
end
|
63
|
+
parser_guessed["quote"] = '"' if parser_guessed["quote"] == '' # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.
|
55
64
|
|
56
65
|
unless parser_guessed.has_key?("escape")
|
57
|
-
|
58
|
-
|
66
|
+
if quote = parser_guessed["quote"]
|
67
|
+
escape = guess_escape(sample_lines, delim, quote)
|
68
|
+
unless escape
|
69
|
+
if quote == '"'
|
70
|
+
# assuming this CSV follows RFC for escaping
|
71
|
+
escape = '"'
|
72
|
+
else
|
73
|
+
# disable escaping (set null)
|
74
|
+
end
|
75
|
+
parser_guessed["escape"] = escape
|
76
|
+
end
|
77
|
+
else
|
78
|
+
# escape does nothing if quote is disabled
|
79
|
+
end
|
59
80
|
end
|
60
81
|
|
61
82
|
unless parser_guessed.has_key?("null_string")
|
@@ -220,13 +241,18 @@ module Embulk
|
|
220
241
|
end
|
221
242
|
end
|
222
243
|
|
223
|
-
def
|
244
|
+
def guess_force_no_quote(sample_lines, delim, quote_candidate)
|
245
|
+
delim_regexp = Regexp.escape(delim)
|
246
|
+
q_regexp = Regexp.escape(quote_candidate)
|
247
|
+
sample_lines.any? do |line|
|
248
|
+
# quoting character appear at the middle of a non-quoted value
|
249
|
+
line =~ /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
def guess_escape(sample_lines, delim, quote)
|
224
254
|
guessed = ESCAPE_CANDIDATES.map do |str|
|
225
|
-
|
226
|
-
regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(optional_quote)})/
|
227
|
-
else
|
228
|
-
regexp = /#{Regexp.quote(str)}#{Regexp.quote(delim)}/
|
229
|
-
end
|
255
|
+
regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
|
230
256
|
counts = sample_lines.map {|line| line.scan(regexp).count }
|
231
257
|
count = counts.inject(0) {|r,c| r + c }
|
232
258
|
[str, count]
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -340,6 +340,7 @@ files:
|
|
340
340
|
- embulk-docs/src/release/release-0.6.19.rst
|
341
341
|
- embulk-docs/src/release/release-0.6.2.rst
|
342
342
|
- embulk-docs/src/release/release-0.6.20.rst
|
343
|
+
- embulk-docs/src/release/release-0.6.21.rst
|
343
344
|
- embulk-docs/src/release/release-0.6.3.rst
|
344
345
|
- embulk-docs/src/release/release-0.6.4.rst
|
345
346
|
- embulk-docs/src/release/release-0.6.5.rst
|
@@ -356,6 +357,7 @@ files:
|
|
356
357
|
- embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java
|
357
358
|
- embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java
|
358
359
|
- embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java
|
360
|
+
- embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java
|
359
361
|
- embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java
|
360
362
|
- embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java
|
361
363
|
- embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java
|
@@ -363,6 +365,7 @@ files:
|
|
363
365
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java
|
364
366
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java
|
365
367
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java
|
368
|
+
- embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java
|
366
369
|
- embulk.gemspec
|
367
370
|
- gradle/wrapper/gradle-wrapper.jar
|
368
371
|
- gradle/wrapper/gradle-wrapper.properties
|
@@ -455,8 +458,8 @@ files:
|
|
455
458
|
- classpath/bval-jsr303-0.5.jar
|
456
459
|
- classpath/commons-beanutils-core-1.8.3.jar
|
457
460
|
- classpath/commons-lang3-3.1.jar
|
458
|
-
- classpath/embulk-core-0.6.
|
459
|
-
- classpath/embulk-standards-0.6.
|
461
|
+
- classpath/embulk-core-0.6.21.jar
|
462
|
+
- classpath/embulk-standards-0.6.21.jar
|
460
463
|
- classpath/guava-18.0.jar
|
461
464
|
- classpath/guice-4.0.jar
|
462
465
|
- classpath/guice-multibindings-4.0.jar
|