embulk 0.6.20 → 0.6.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +2 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +6 -0
- data/embulk-docs/src/built-in.rst +30 -4
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.6.20.rst +1 -1
- data/embulk-docs/src/release/release-0.6.21.rst +20 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +105 -2
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +8 -4
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +56 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +4 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +3 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +88 -0
- data/lib/embulk/guess/csv.rb +36 -10
- data/lib/embulk/version.rb +1 -1
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90d6280d5bdcffb92922bfd929eae4bcc6d83b92
|
4
|
+
data.tar.gz: 0e2c0d5e8a9f990cdfbb7006be196f9bfef19030
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2ccbee9c830dd29e86734c1615354083fca35068476a30d0a06944926c3eb63b0386ab5db39d0ad7ed660a2c395da84e68d8433ee222274102dc01b30312de6
|
7
|
+
data.tar.gz: 27174e4750c66516ead7b0ff90df6dc804e8d45694354cd03602654680b30dd0464fef0221aa18a9c28e6495ad6709e44e0c4469b12e55f3ca81d8b1bdb06fdc
|
data/build.gradle
CHANGED
@@ -23,7 +23,7 @@ Embulk uses a YAML file to define a bulk data loading. Here is an example of the
|
|
23
23
|
type: csv
|
24
24
|
delimiter: ','
|
25
25
|
quote: '"'
|
26
|
-
escape: ''
|
26
|
+
escape: '"'
|
27
27
|
null_string: 'NULL'
|
28
28
|
skip_header_lines: 1
|
29
29
|
columns:
|
@@ -133,9 +133,9 @@ Options
|
|
133
133
|
+============================+==========+================================================================================================================+========================+
|
134
134
|
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` or any single-byte character | ``,`` by default |
|
135
135
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
136
|
-
| quote | string | The character surrounding a quoted value
|
136
|
+
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``\"`` by default |
|
137
137
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
138
|
-
| escape | string | Escape character to escape a special character
|
138
|
+
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
|
139
139
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
140
140
|
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
|
141
141
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
@@ -203,7 +203,7 @@ Example
|
|
203
203
|
newline: CRLF
|
204
204
|
delimiter: "\t"
|
205
205
|
quote: '"'
|
206
|
-
escape: ''
|
206
|
+
escape: '"'
|
207
207
|
null_string: 'NULL'
|
208
208
|
skip_header_lines: 1
|
209
209
|
comment_line_marker: '#'
|
@@ -383,3 +383,29 @@ Example
|
|
383
383
|
- type: gzip
|
384
384
|
level: 1
|
385
385
|
|
386
|
+
Rename filter plugin
|
387
|
+
--------------------
|
388
|
+
|
389
|
+
The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
|
390
|
+
|
391
|
+
Options
|
392
|
+
~~~~~~~~~~~~~~~~~~
|
393
|
+
|
394
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
395
|
+
| name | type | description | required? |
|
396
|
+
+=========+==========+======================================================================+====================+
|
397
|
+
| columns | hash | A map whose keys are existing column names; values are new names. | ``{}`` by default |
|
398
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
399
|
+
|
400
|
+
Example
|
401
|
+
~~~~~~~~~~~~~~~~~~
|
402
|
+
|
403
|
+
.. code-block:: yaml
|
404
|
+
|
405
|
+
filters:
|
406
|
+
...
|
407
|
+
- type: rename
|
408
|
+
columns:
|
409
|
+
my_existing_column1: new_column1
|
410
|
+
my_existing_column2: new_column2
|
411
|
+
|
data/embulk-docs/src/release.rst
CHANGED
@@ -11,7 +11,7 @@ General Changes
|
|
11
11
|
------------------
|
12
12
|
|
13
13
|
* Change default size of page buffer from 8KB to 32KB.
|
14
|
-
* Size of a page buffer is configurable by system config (@sonots++).
|
14
|
+
* Size of a page buffer is configurable by system config (@sonots++). On the command line, the ``embulk`` command accepts a ``-X page_size=N[unit]`` argument (e.g. ``-X page_size=512KB``).
|
15
15
|
|
16
16
|
|
17
17
|
Release Date
|
@@ -0,0 +1,20 @@
|
|
1
|
+
Release 0.6.21
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added ``filter-rename`` plugin, which renames columns. This plugin has no impact on performance.
|
8
|
+
* ``parser-csv`` plugin accepts ``null`` for the ``quote`` and ``escape`` options to disable quoting or escaping. This is useful if a file includes ``"`` in a non-quoted value.
|
9
|
+
* ``parser-csv`` shows a warning if an empty string is set for the ``quote`` or ``escape`` option. The behavior is kept backward-compatible for now, but an empty string will be rejected in the future.
|
10
|
+
|
11
|
+
|
12
|
+
Java Plugin API
|
13
|
+
------------------
|
14
|
+
|
15
|
+
* Added ``config.DataSource.has`` method to check whether it contains a key or not.
|
16
|
+
|
17
|
+
|
18
|
+
Release Date
|
19
|
+
------------------
|
20
|
+
2015-08-05
|
@@ -2,6 +2,9 @@ package org.embulk.standards;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableSet;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
6
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
7
|
+
import com.fasterxml.jackson.annotation.JsonValue;
|
5
8
|
import org.embulk.config.Task;
|
6
9
|
import org.embulk.config.Config;
|
7
10
|
import org.embulk.config.ConfigDefault;
|
@@ -57,11 +60,11 @@ public class CsvParserPlugin
|
|
57
60
|
|
58
61
|
@Config("quote")
|
59
62
|
@ConfigDefault("\"\\\"\"")
|
60
|
-
public
|
63
|
+
public Optional<QuoteCharacter> getQuoteChar();
|
61
64
|
|
62
65
|
@Config("escape")
|
63
66
|
@ConfigDefault("\"\\\\\"")
|
64
|
-
public
|
67
|
+
public Optional<EscapeCharacter> getEscapeChar();
|
65
68
|
|
66
69
|
// Null value handling: if the CsvParser found 'non-quoted empty string's,
|
67
70
|
// it replaces them to string that users specified like "\N", "NULL".
|
@@ -90,6 +93,106 @@ public class CsvParserPlugin
|
|
90
93
|
public boolean getAllowExtraColumns();
|
91
94
|
}
|
92
95
|
|
96
|
+
public static class QuoteCharacter
|
97
|
+
{
|
98
|
+
private final char character;
|
99
|
+
|
100
|
+
public QuoteCharacter(char character)
|
101
|
+
{
|
102
|
+
this.character = character;
|
103
|
+
}
|
104
|
+
|
105
|
+
public static QuoteCharacter noQuote()
|
106
|
+
{
|
107
|
+
return new QuoteCharacter(CsvTokenizer.NO_QUOTE);
|
108
|
+
}
|
109
|
+
|
110
|
+
@JsonCreator
|
111
|
+
public static QuoteCharacter ofString(String str)
|
112
|
+
{
|
113
|
+
if (str.length() >= 2) {
|
114
|
+
throw new ConfigException("\"quote\" option accepts only 1 character.");
|
115
|
+
} else if (str.isEmpty()) {
|
116
|
+
Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly.");
|
117
|
+
return new QuoteCharacter('"');
|
118
|
+
} else {
|
119
|
+
return new QuoteCharacter(str.charAt(0));
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
@JsonIgnore
|
124
|
+
public char getCharacter()
|
125
|
+
{
|
126
|
+
return character;
|
127
|
+
}
|
128
|
+
|
129
|
+
@JsonValue
|
130
|
+
public String getOptionalString()
|
131
|
+
{
|
132
|
+
return new String(new char[] { character });
|
133
|
+
}
|
134
|
+
|
135
|
+
@Override
|
136
|
+
public boolean equals(Object obj)
|
137
|
+
{
|
138
|
+
if (!(obj instanceof QuoteCharacter)) {
|
139
|
+
return false;
|
140
|
+
}
|
141
|
+
QuoteCharacter o = (QuoteCharacter) obj;
|
142
|
+
return character == o.character;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
public static class EscapeCharacter
|
147
|
+
{
|
148
|
+
private final char character;
|
149
|
+
|
150
|
+
public EscapeCharacter(char character)
|
151
|
+
{
|
152
|
+
this.character = character;
|
153
|
+
}
|
154
|
+
|
155
|
+
public static EscapeCharacter noEscape()
|
156
|
+
{
|
157
|
+
return new EscapeCharacter(CsvTokenizer.NO_ESCAPE);
|
158
|
+
}
|
159
|
+
|
160
|
+
@JsonCreator
|
161
|
+
public static EscapeCharacter ofString(String str)
|
162
|
+
{
|
163
|
+
if (str.length() >= 2) {
|
164
|
+
throw new ConfigException("\"escape\" option accepts only 1 character.");
|
165
|
+
} else if (str.isEmpty()) {
|
166
|
+
Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly.");
|
167
|
+
return noEscape();
|
168
|
+
} else {
|
169
|
+
return new EscapeCharacter(str.charAt(0));
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
@JsonIgnore
|
174
|
+
public char getCharacter()
|
175
|
+
{
|
176
|
+
return character;
|
177
|
+
}
|
178
|
+
|
179
|
+
@JsonValue
|
180
|
+
public String getOptionalString()
|
181
|
+
{
|
182
|
+
return new String(new char[] { character });
|
183
|
+
}
|
184
|
+
|
185
|
+
@Override
|
186
|
+
public boolean equals(Object obj)
|
187
|
+
{
|
188
|
+
if (!(obj instanceof EscapeCharacter)) {
|
189
|
+
return false;
|
190
|
+
}
|
191
|
+
EscapeCharacter o = (EscapeCharacter) obj;
|
192
|
+
return character == o.character;
|
193
|
+
}
|
194
|
+
}
|
195
|
+
|
93
196
|
private final Logger log;
|
94
197
|
|
95
198
|
public CsvParserPlugin()
|
@@ -5,7 +5,9 @@ import java.util.List;
|
|
5
5
|
import java.util.ArrayList;
|
6
6
|
import java.util.Deque;
|
7
7
|
import java.util.ArrayDeque;
|
8
|
+
import org.embulk.config.ConfigException;
|
8
9
|
import org.embulk.spi.util.LineDecoder;
|
10
|
+
import org.embulk.spi.Exec;
|
9
11
|
|
10
12
|
public class CsvTokenizer
|
11
13
|
{
|
@@ -20,6 +22,8 @@ public class CsvTokenizer
|
|
20
22
|
}
|
21
23
|
|
22
24
|
private static final char END_OF_LINE = '\0';
|
25
|
+
static final char NO_QUOTE = '\0';
|
26
|
+
static final char NO_ESCAPE = '\0';
|
23
27
|
|
24
28
|
private final char delimiter;
|
25
29
|
private final char quote;
|
@@ -42,8 +46,8 @@ public class CsvTokenizer
|
|
42
46
|
public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
|
43
47
|
{
|
44
48
|
delimiter = task.getDelimiterChar();
|
45
|
-
quote = task.getQuoteChar()
|
46
|
-
escape = task.getEscapeChar();
|
49
|
+
quote = task.getQuoteChar().or(CsvParserPlugin.QuoteCharacter.noQuote()).getCharacter();
|
50
|
+
escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
|
47
51
|
newline = task.getNewline().getString();
|
48
52
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
49
53
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
@@ -354,12 +358,12 @@ public class CsvTokenizer
|
|
354
358
|
|
355
359
|
private boolean isQuote(char c)
|
356
360
|
{
|
357
|
-
return c == quote;
|
361
|
+
return quote != NO_QUOTE && c == quote;
|
358
362
|
}
|
359
363
|
|
360
364
|
private boolean isEscape(char c)
|
361
365
|
{
|
362
|
-
return c == escape;
|
366
|
+
return escape != NO_ESCAPE && c == escape;
|
363
367
|
}
|
364
368
|
|
365
369
|
public static class InvalidFormatException
|
@@ -0,0 +1,56 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import org.embulk.config.Config;
|
4
|
+
import org.embulk.config.ConfigDefault;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.config.Task;
|
7
|
+
import org.embulk.config.TaskSource;
|
8
|
+
import org.embulk.spi.Column;
|
9
|
+
import org.embulk.spi.FilterPlugin;
|
10
|
+
import org.embulk.spi.PageOutput;
|
11
|
+
import org.embulk.spi.Schema;
|
12
|
+
|
13
|
+
import java.util.Map;
|
14
|
+
|
15
|
+
public class RenameFilterPlugin
|
16
|
+
implements FilterPlugin
|
17
|
+
{
|
18
|
+
public interface PluginTask
|
19
|
+
extends Task
|
20
|
+
{
|
21
|
+
@Config("columns")
|
22
|
+
@ConfigDefault("{}")
|
23
|
+
Map<String, String> getRenameMap();
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
public void transaction(ConfigSource config, Schema inputSchema,
|
28
|
+
FilterPlugin.Control control)
|
29
|
+
{
|
30
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
31
|
+
Map<String, String> renameMap = task.getRenameMap();
|
32
|
+
|
33
|
+
// check column_options is valid or not
|
34
|
+
for (String columnName : renameMap.keySet()) {
|
35
|
+
inputSchema.lookupColumn(columnName); // throws SchemaConfigException
|
36
|
+
}
|
37
|
+
|
38
|
+
Schema.Builder builder = Schema.builder();
|
39
|
+
for (Column column : inputSchema.getColumns()) {
|
40
|
+
String name = column.getName();
|
41
|
+
if (renameMap.containsKey(name)) {
|
42
|
+
name = renameMap.get(name);
|
43
|
+
}
|
44
|
+
builder.add(name, column.getType());
|
45
|
+
}
|
46
|
+
|
47
|
+
control.run(task.dump(), builder.build());
|
48
|
+
}
|
49
|
+
|
50
|
+
@Override
|
51
|
+
public PageOutput open(TaskSource taskSource, Schema inputSchema,
|
52
|
+
Schema outputSchema, PageOutput output)
|
53
|
+
{
|
54
|
+
return output;
|
55
|
+
}
|
56
|
+
}
|
@@ -5,6 +5,7 @@ import com.google.inject.Binder;
|
|
5
5
|
import com.google.inject.Module;
|
6
6
|
import com.google.inject.name.Names;
|
7
7
|
import com.google.inject.multibindings.Multibinder;
|
8
|
+
import org.embulk.spi.FilterPlugin;
|
8
9
|
import org.embulk.spi.FormatterPlugin;
|
9
10
|
import org.embulk.spi.InputPlugin;
|
10
11
|
import org.embulk.spi.OutputPlugin;
|
@@ -44,6 +45,9 @@ public class StandardPluginModule
|
|
44
45
|
// file encoder plugins
|
45
46
|
registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
|
46
47
|
|
48
|
+
// filter plugins
|
49
|
+
registerPluginTo(binder, FilterPlugin.class, "rename", RenameFilterPlugin.class);
|
50
|
+
|
47
51
|
// default guess plugins
|
48
52
|
registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
|
49
53
|
registerDefaultGuessPluginTo(binder, new PluginType("csv"));
|
@@ -4,6 +4,7 @@ import org.junit.Rule;
|
|
4
4
|
import org.junit.Test;
|
5
5
|
import static org.junit.Assert.assertEquals;
|
6
6
|
import java.nio.charset.Charset;
|
7
|
+
import com.google.common.base.Optional;
|
7
8
|
import com.google.common.collect.ImmutableList;
|
8
9
|
import com.google.common.collect.ImmutableMap;
|
9
10
|
import org.joda.time.DateTimeZone;
|
@@ -33,7 +34,7 @@ public class TestCsvParserPlugin
|
|
33
34
|
assertEquals(Newline.CRLF, task.getNewline());
|
34
35
|
assertEquals(false, task.getHeaderLine().or(false));
|
35
36
|
assertEquals(',', task.getDelimiterChar());
|
36
|
-
assertEquals('\"', task.getQuoteChar());
|
37
|
+
assertEquals(Optional.of(new CsvParserPlugin.QuoteCharacter('\"')), task.getQuoteChar());
|
37
38
|
assertEquals(false, task.getAllowOptionalColumns());
|
38
39
|
assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
|
39
40
|
assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
|
@@ -68,7 +69,7 @@ public class TestCsvParserPlugin
|
|
68
69
|
assertEquals(Newline.LF, task.getNewline());
|
69
70
|
assertEquals(true, task.getHeaderLine().or(false));
|
70
71
|
assertEquals('\t', task.getDelimiterChar());
|
71
|
-
assertEquals('\\', task.getQuoteChar());
|
72
|
+
assertEquals(Optional.of(new CsvParserPlugin.QuoteCharacter('\\')), task.getQuoteChar());
|
72
73
|
assertEquals(true, task.getAllowOptionalColumns());
|
73
74
|
}
|
74
75
|
}
|
@@ -0,0 +1,88 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableMap;
|
4
|
+
import org.embulk.EmbulkTestRuntime;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.config.TaskSource;
|
7
|
+
import org.embulk.spi.Column;
|
8
|
+
import org.embulk.spi.FilterPlugin;
|
9
|
+
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.Schema;
|
11
|
+
import org.embulk.spi.SchemaConfigException;
|
12
|
+
import org.embulk.standards.RenameFilterPlugin.PluginTask;
|
13
|
+
import org.junit.Before;
|
14
|
+
import org.junit.Rule;
|
15
|
+
import org.junit.Test;
|
16
|
+
|
17
|
+
import static org.embulk.spi.type.Types.STRING;
|
18
|
+
import static org.embulk.spi.type.Types.TIMESTAMP;
|
19
|
+
import static org.junit.Assert.assertEquals;
|
20
|
+
import static org.junit.Assert.assertTrue;
|
21
|
+
import static org.junit.Assert.fail;
|
22
|
+
|
23
|
+
public class TestRenameFilterPlugin
|
24
|
+
{
|
25
|
+
@Rule
|
26
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
27
|
+
|
28
|
+
private final Schema SCHEMA = Schema.builder()
|
29
|
+
.add("_c0", STRING)
|
30
|
+
.add("_c1", TIMESTAMP)
|
31
|
+
.build();
|
32
|
+
|
33
|
+
private RenameFilterPlugin filter;
|
34
|
+
|
35
|
+
@Before
|
36
|
+
public void createFilter()
|
37
|
+
{
|
38
|
+
filter = new RenameFilterPlugin();
|
39
|
+
}
|
40
|
+
|
41
|
+
@Test
|
42
|
+
public void checkDefaultValues()
|
43
|
+
{
|
44
|
+
PluginTask task = Exec.newConfigSource().loadConfig(PluginTask.class);
|
45
|
+
assertTrue(task.getRenameMap().isEmpty());
|
46
|
+
}
|
47
|
+
|
48
|
+
@Test
|
49
|
+
public void throwSchemaConfigExceptionIfColumnNotFound()
|
50
|
+
{
|
51
|
+
ConfigSource pluginConfig = Exec.newConfigSource()
|
52
|
+
.set("columns", ImmutableMap.of("not_found", "any_name"));
|
53
|
+
|
54
|
+
try {
|
55
|
+
filter.transaction(pluginConfig, SCHEMA, new FilterPlugin.Control() {
|
56
|
+
public void run(TaskSource task, Schema schema) { }
|
57
|
+
});
|
58
|
+
fail();
|
59
|
+
} catch (Throwable t) {
|
60
|
+
assertTrue(t instanceof SchemaConfigException);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
@Test
|
65
|
+
public void checkRenaming()
|
66
|
+
{
|
67
|
+
ConfigSource pluginConfig = Exec.newConfigSource()
|
68
|
+
.set("columns", ImmutableMap.of("_c0", "_c0_new"));
|
69
|
+
|
70
|
+
filter.transaction(pluginConfig, SCHEMA, new FilterPlugin.Control() {
|
71
|
+
@Override
|
72
|
+
public void run(TaskSource task, Schema newSchema)
|
73
|
+
{
|
74
|
+
// _c0 -> _c0_new
|
75
|
+
Column old0 = SCHEMA.getColumn(0);
|
76
|
+
Column new0 = newSchema.getColumn(0);
|
77
|
+
assertEquals("_c0_new", new0.getName());
|
78
|
+
assertEquals(old0.getType(), new0.getType());
|
79
|
+
|
80
|
+
// _c1 is not changed
|
81
|
+
Column old1 = SCHEMA.getColumn(1);
|
82
|
+
Column new1 = newSchema.getColumn(1);
|
83
|
+
assertEquals("_c1", new1.getName());
|
84
|
+
assertEquals(old1.getType(), new1.getType());
|
85
|
+
}
|
86
|
+
});
|
87
|
+
}
|
88
|
+
}
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -14,7 +14,7 @@ module Embulk
|
|
14
14
|
]
|
15
15
|
|
16
16
|
ESCAPE_CANDIDATES = [
|
17
|
-
"\\"
|
17
|
+
"\\", '"'
|
18
18
|
]
|
19
19
|
|
20
20
|
NULL_STRING_CANDIDATES = [
|
@@ -50,12 +50,33 @@ module Embulk
|
|
50
50
|
|
51
51
|
unless parser_guessed.has_key?("quote")
|
52
52
|
quote = guess_quote(sample_lines, delim)
|
53
|
-
|
53
|
+
unless quote
|
54
|
+
if !guess_force_no_quote(sample_lines, delim, '"')
|
55
|
+
# assuming CSV follows RFC for quoting
|
56
|
+
quote = '"'
|
57
|
+
else
|
58
|
+
# disable quoting (set null)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
parser_guessed["quote"] = quote
|
54
62
|
end
|
63
|
+
parser_guessed["quote"] = '"' if parser_guessed["quote"] == '' # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.
|
55
64
|
|
56
65
|
unless parser_guessed.has_key?("escape")
|
57
|
-
|
58
|
-
|
66
|
+
if quote = parser_guessed["quote"]
|
67
|
+
escape = guess_escape(sample_lines, delim, quote)
|
68
|
+
unless escape
|
69
|
+
if quote == '"'
|
70
|
+
# assuming this CSV follows RFC for escaping
|
71
|
+
escape = '"'
|
72
|
+
else
|
73
|
+
# disable escaping (set null)
|
74
|
+
end
|
75
|
+
parser_guessed["escape"] = escape
|
76
|
+
end
|
77
|
+
else
|
78
|
+
# escape does nothing if quote is disabled
|
79
|
+
end
|
59
80
|
end
|
60
81
|
|
61
82
|
unless parser_guessed.has_key?("null_string")
|
@@ -220,13 +241,18 @@ module Embulk
|
|
220
241
|
end
|
221
242
|
end
|
222
243
|
|
223
|
-
def
|
244
|
+
def guess_force_no_quote(sample_lines, delim, quote_candidate)
|
245
|
+
delim_regexp = Regexp.escape(delim)
|
246
|
+
q_regexp = Regexp.escape(quote_candidate)
|
247
|
+
sample_lines.any? do |line|
|
248
|
+
# quoting character appear at the middle of a non-quoted value
|
249
|
+
line =~ /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
def guess_escape(sample_lines, delim, quote)
|
224
254
|
guessed = ESCAPE_CANDIDATES.map do |str|
|
225
|
-
|
226
|
-
regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(optional_quote)})/
|
227
|
-
else
|
228
|
-
regexp = /#{Regexp.quote(str)}#{Regexp.quote(delim)}/
|
229
|
-
end
|
255
|
+
regexp = /#{Regexp.quote(str)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
|
230
256
|
counts = sample_lines.map {|line| line.scan(regexp).count }
|
231
257
|
count = counts.inject(0) {|r,c| r + c }
|
232
258
|
[str, count]
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -340,6 +340,7 @@ files:
|
|
340
340
|
- embulk-docs/src/release/release-0.6.19.rst
|
341
341
|
- embulk-docs/src/release/release-0.6.2.rst
|
342
342
|
- embulk-docs/src/release/release-0.6.20.rst
|
343
|
+
- embulk-docs/src/release/release-0.6.21.rst
|
343
344
|
- embulk-docs/src/release/release-0.6.3.rst
|
344
345
|
- embulk-docs/src/release/release-0.6.4.rst
|
345
346
|
- embulk-docs/src/release/release-0.6.5.rst
|
@@ -356,6 +357,7 @@ files:
|
|
356
357
|
- embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java
|
357
358
|
- embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java
|
358
359
|
- embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java
|
360
|
+
- embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java
|
359
361
|
- embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java
|
360
362
|
- embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java
|
361
363
|
- embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java
|
@@ -363,6 +365,7 @@ files:
|
|
363
365
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java
|
364
366
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java
|
365
367
|
- embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java
|
368
|
+
- embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java
|
366
369
|
- embulk.gemspec
|
367
370
|
- gradle/wrapper/gradle-wrapper.jar
|
368
371
|
- gradle/wrapper/gradle-wrapper.properties
|
@@ -455,8 +458,8 @@ files:
|
|
455
458
|
- classpath/bval-jsr303-0.5.jar
|
456
459
|
- classpath/commons-beanutils-core-1.8.3.jar
|
457
460
|
- classpath/commons-lang3-3.1.jar
|
458
|
-
- classpath/embulk-core-0.6.
|
459
|
-
- classpath/embulk-standards-0.6.
|
461
|
+
- classpath/embulk-core-0.6.21.jar
|
462
|
+
- classpath/embulk-standards-0.6.21.jar
|
460
463
|
- classpath/guava-18.0.jar
|
461
464
|
- classpath/guice-4.0.jar
|
462
465
|
- classpath/guice-multibindings-4.0.jar
|