embulk 0.6.12 → 0.6.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/spi/Exec.java +0 -1
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +1 -2
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnNotFoundException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetter.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +94 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +161 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +80 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/BooleanColumnSetter.java +64 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DefaultValueSetter.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DoubleColumnSetter.java +61 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/LongColumnSetter.java +69 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/NullDefaultValueSetter.java +34 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +54 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/StringColumnSetter.java +56 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/TimestampColumnSetter.java +64 -0
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.6.13.rst +23 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +7 -1
- data/lib/embulk/command/embulk.rb +5 -1
- data/lib/embulk/command/embulk_run.rb +13 -1
- data/lib/embulk/guess/csv.rb +52 -22
- data/lib/embulk/java/imports.rb +2 -0
- data/lib/embulk/page_builder.rb +56 -2
- data/lib/embulk/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98a09da0f4c6425b69f3094d7c5b105eea0e7d36
|
4
|
+
data.tar.gz: 26383243a036baa447c9bf3ab28d554e2a5fc9e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: acc2663c2fc200aeb2aeffc5e3156cdffbfbd8321a87bfbac1fa125e46f5c2006bdc9cdf63484b0157c622c32cf8be81bec336a029bd6c655e866537bc689578
|
7
|
+
data.tar.gz: 41696a15d2d1cd12efcd1d35574b19ddeeaa931f386f8c7093ab123328d460d5b1d82a94880f38c679b8f690edd43c7d5fcf679cee08b9ba019236b36460a9c7
|
data/Gemfile.lock
CHANGED
data/build.gradle
CHANGED
@@ -35,8 +35,7 @@ public class TimestampParser
|
|
35
35
|
return defaultTimeZone;
|
36
36
|
}
|
37
37
|
|
38
|
-
|
39
|
-
private TimestampParser(ScriptingContainer jruby, String format, DateTimeZone defaultTimeZone)
|
38
|
+
public TimestampParser(ScriptingContainer jruby, String format, DateTimeZone defaultTimeZone)
|
40
39
|
{
|
41
40
|
JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
|
42
41
|
// TODO get default current time from ExecTask.getExecTimestamp
|
@@ -0,0 +1,18 @@
|
|
1
|
+
package org.embulk.spi.util;
|
2
|
+
|
3
|
+
import org.embulk.spi.time.Timestamp;
|
4
|
+
|
5
|
+
public interface DynamicColumnSetter
|
6
|
+
{
|
7
|
+
public void setNull();
|
8
|
+
|
9
|
+
public void set(boolean value);
|
10
|
+
|
11
|
+
public void set(long value);
|
12
|
+
|
13
|
+
public void set(double value);
|
14
|
+
|
15
|
+
public void set(String value);
|
16
|
+
|
17
|
+
public void set(Timestamp value);
|
18
|
+
}
|
@@ -0,0 +1,94 @@
|
|
1
|
+
package org.embulk.spi.util;
|
2
|
+
|
3
|
+
import org.joda.time.DateTimeZone;
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.embulk.spi.type.Type;
|
6
|
+
import org.embulk.spi.type.BooleanType;
|
7
|
+
import org.embulk.spi.type.LongType;
|
8
|
+
import org.embulk.spi.type.DoubleType;
|
9
|
+
import org.embulk.spi.type.StringType;
|
10
|
+
import org.embulk.spi.type.TimestampType;
|
11
|
+
import org.embulk.spi.util.dynamic.BooleanColumnSetter;
|
12
|
+
import org.embulk.spi.util.dynamic.LongColumnSetter;
|
13
|
+
import org.embulk.spi.util.dynamic.DoubleColumnSetter;
|
14
|
+
import org.embulk.spi.util.dynamic.StringColumnSetter;
|
15
|
+
import org.embulk.spi.util.dynamic.TimestampColumnSetter;
|
16
|
+
import org.embulk.spi.util.dynamic.DefaultValueSetter;
|
17
|
+
import org.embulk.spi.util.dynamic.NullDefaultValueSetter;
|
18
|
+
import org.embulk.spi.time.TimestampFormatter;
|
19
|
+
import org.embulk.spi.time.TimestampParser;
|
20
|
+
import org.embulk.spi.time.TimestampFormat;
|
21
|
+
import org.embulk.spi.Column;
|
22
|
+
import org.embulk.spi.PageBuilder;
|
23
|
+
import org.embulk.config.ConfigException;
|
24
|
+
|
25
|
+
public class DynamicColumnSetterFactory
|
26
|
+
{
|
27
|
+
private final DefaultValueSetter defaultValue;
|
28
|
+
private final DynamicPageBuilder.BuilderTask task;
|
29
|
+
|
30
|
+
DynamicColumnSetterFactory(
|
31
|
+
DynamicPageBuilder.BuilderTask task,
|
32
|
+
DefaultValueSetter defaultValue)
|
33
|
+
{
|
34
|
+
this.defaultValue = defaultValue;
|
35
|
+
this.task = task;
|
36
|
+
}
|
37
|
+
|
38
|
+
public static DefaultValueSetter nullDefaultValue()
|
39
|
+
{
|
40
|
+
return new NullDefaultValueSetter();
|
41
|
+
}
|
42
|
+
|
43
|
+
public DynamicColumnSetter newColumnSetter(PageBuilder pageBuilder, Column column)
|
44
|
+
{
|
45
|
+
Type type = column.getType();
|
46
|
+
if (type instanceof BooleanType) {
|
47
|
+
return new BooleanColumnSetter(pageBuilder, column, defaultValue);
|
48
|
+
} else if (type instanceof LongType) {
|
49
|
+
return new LongColumnSetter(pageBuilder, column, defaultValue);
|
50
|
+
} else if (type instanceof DoubleType) {
|
51
|
+
return new DoubleColumnSetter(pageBuilder, column, defaultValue);
|
52
|
+
} else if (type instanceof StringType) {
|
53
|
+
TimestampFormatter formatter = new TimestampFormatter(task.getJRuby(),
|
54
|
+
getTimestampFormat(column).getFormat(), getTimeZone(column));
|
55
|
+
return new StringColumnSetter(pageBuilder, column, defaultValue, formatter);
|
56
|
+
} else if (type instanceof TimestampType) {
|
57
|
+
// TODO use flexible time format like Ruby's Time.parse
|
58
|
+
TimestampParser parser = new TimestampParser(task.getJRuby(),
|
59
|
+
getTimestampFormat(column).getFormat(), getTimeZone(column));
|
60
|
+
return new TimestampColumnSetter(pageBuilder, column, defaultValue, parser);
|
61
|
+
}
|
62
|
+
throw new ConfigException("Unknown column type: "+type);
|
63
|
+
}
|
64
|
+
|
65
|
+
private TimestampFormat getTimestampFormat(Column column)
|
66
|
+
{
|
67
|
+
DynamicPageBuilder.ColumnOption option = getColumnOption(column);
|
68
|
+
if (option != null) {
|
69
|
+
return option.getTimestampFormat();
|
70
|
+
} else {
|
71
|
+
return new TimestampFormat("%Y-%m-%d %H:%M:%S.%6N");
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
private DateTimeZone getTimeZone(Column column)
|
76
|
+
{
|
77
|
+
DynamicPageBuilder.ColumnOption option = getColumnOption(column);
|
78
|
+
if (option != null) {
|
79
|
+
return option.getTimeZone().or(task.getDefaultTimeZone());
|
80
|
+
} else {
|
81
|
+
return task.getDefaultTimeZone();
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
private DynamicPageBuilder.ColumnOption getColumnOption(Column column)
|
86
|
+
{
|
87
|
+
ConfigSource option = task.getColumnOptions().get(column.getName());
|
88
|
+
if (option != null) {
|
89
|
+
return option.loadConfig(DynamicPageBuilder.ColumnOption.class);
|
90
|
+
} else {
|
91
|
+
return null;
|
92
|
+
}
|
93
|
+
}
|
94
|
+
}
|
@@ -0,0 +1,161 @@
|
|
1
|
+
package org.embulk.spi.util;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.Map;
|
5
|
+
import com.google.common.base.Optional;
|
6
|
+
import org.joda.time.DateTimeZone;
|
7
|
+
import org.jruby.embed.ScriptingContainer;
|
8
|
+
import com.google.common.collect.ImmutableList;
|
9
|
+
import com.google.common.collect.ImmutableMap;
|
10
|
+
import org.embulk.config.Config;
|
11
|
+
import org.embulk.config.ConfigDefault;
|
12
|
+
import org.embulk.config.ConfigInject;
|
13
|
+
import org.embulk.config.ConfigSource;
|
14
|
+
import org.embulk.config.Task;
|
15
|
+
import org.embulk.spi.Schema;
|
16
|
+
import org.embulk.spi.Column;
|
17
|
+
import org.embulk.spi.BufferAllocator;
|
18
|
+
import org.embulk.spi.PageBuilder;
|
19
|
+
import org.embulk.spi.PageOutput;
|
20
|
+
import org.embulk.spi.time.TimestampFormatter;
|
21
|
+
import org.embulk.spi.time.TimestampParser;
|
22
|
+
import org.embulk.spi.time.TimestampFormat;
|
23
|
+
import org.embulk.spi.util.dynamic.SkipColumnSetter;
|
24
|
+
|
25
|
+
public class DynamicPageBuilder
|
26
|
+
implements AutoCloseable
|
27
|
+
{
|
28
|
+
private final PageBuilder pageBuilder;
|
29
|
+
private final Schema schema;
|
30
|
+
private final List<DynamicColumnSetter> setters;
|
31
|
+
private final Map<String, DynamicColumnSetter> columnLookup;
|
32
|
+
|
33
|
+
public static interface BuilderTask
|
34
|
+
extends Task
|
35
|
+
{
|
36
|
+
@Config("default_timezone")
|
37
|
+
@ConfigDefault("\"UTC\"")
|
38
|
+
public DateTimeZone getDefaultTimeZone();
|
39
|
+
|
40
|
+
@Config("column_options")
|
41
|
+
@ConfigDefault("{}")
|
42
|
+
public Map<String, ConfigSource> getColumnOptions();
|
43
|
+
|
44
|
+
// required by TimestampFormatter
|
45
|
+
@ConfigInject
|
46
|
+
public ScriptingContainer getJRuby();
|
47
|
+
}
|
48
|
+
|
49
|
+
public static interface ColumnOption
|
50
|
+
extends Task
|
51
|
+
{
|
52
|
+
@Config("timestamp_format")
|
53
|
+
@ConfigDefault("\"%Y-%m-%d %H:%M:%S.%6N\"")
|
54
|
+
public TimestampFormat getTimestampFormat();
|
55
|
+
|
56
|
+
@Config("timezone")
|
57
|
+
@ConfigDefault("null")
|
58
|
+
public Optional<DateTimeZone> getTimeZone();
|
59
|
+
}
|
60
|
+
|
61
|
+
public DynamicPageBuilder(BuilderTask task,
|
62
|
+
BufferAllocator allocator, Schema schema, PageOutput output)
|
63
|
+
{
|
64
|
+
this.pageBuilder = new PageBuilder(allocator, schema, output);
|
65
|
+
this.schema = schema;
|
66
|
+
|
67
|
+
// TODO configurable default value
|
68
|
+
DynamicColumnSetterFactory factory = new DynamicColumnSetterFactory(task,
|
69
|
+
DynamicColumnSetterFactory.nullDefaultValue());
|
70
|
+
|
71
|
+
ImmutableList.Builder<DynamicColumnSetter> setters = ImmutableList.builder();
|
72
|
+
ImmutableMap.Builder<String, DynamicColumnSetter> lookup = ImmutableMap.builder();
|
73
|
+
for (Column c : schema.getColumns()) {
|
74
|
+
DynamicColumnSetter setter = factory.newColumnSetter(pageBuilder, c);
|
75
|
+
setters.add(setter);
|
76
|
+
lookup.put(c.getName(), setter);
|
77
|
+
}
|
78
|
+
this.setters = setters.build();
|
79
|
+
this.columnLookup = lookup.build();
|
80
|
+
}
|
81
|
+
|
82
|
+
public List<Column> getColumns()
|
83
|
+
{
|
84
|
+
return schema.getColumns();
|
85
|
+
}
|
86
|
+
|
87
|
+
public DynamicColumnSetter column(Column c)
|
88
|
+
{
|
89
|
+
return setters.get(c.getIndex());
|
90
|
+
}
|
91
|
+
|
92
|
+
public DynamicColumnSetter column(int index)
|
93
|
+
{
|
94
|
+
if (index < 0 || setters.size() <= index) {
|
95
|
+
throw new DynamicColumnNotFoundException("Column index '"+index+"' is not exist");
|
96
|
+
}
|
97
|
+
return setters.get(index);
|
98
|
+
}
|
99
|
+
|
100
|
+
public DynamicColumnSetter lookupColumn(String columnName)
|
101
|
+
{
|
102
|
+
DynamicColumnSetter setter = columnLookup.get(columnName);
|
103
|
+
if (setter == null) {
|
104
|
+
throw new DynamicColumnNotFoundException("Column '"+columnName+"' is not exist");
|
105
|
+
}
|
106
|
+
return setter;
|
107
|
+
}
|
108
|
+
|
109
|
+
public DynamicColumnSetter columnOrSkip(int index)
|
110
|
+
{
|
111
|
+
if (index < 0 || setters.size() <= index) {
|
112
|
+
return SkipColumnSetter.get();
|
113
|
+
}
|
114
|
+
return setters.get(index);
|
115
|
+
}
|
116
|
+
|
117
|
+
public DynamicColumnSetter columnOrSkip(String columnName)
|
118
|
+
{
|
119
|
+
DynamicColumnSetter setter = columnLookup.get(columnName);
|
120
|
+
if (setter == null) {
|
121
|
+
return SkipColumnSetter.get();
|
122
|
+
}
|
123
|
+
return setter;
|
124
|
+
}
|
125
|
+
|
126
|
+
// for jruby
|
127
|
+
protected DynamicColumnSetter columnOrNull(int index)
|
128
|
+
{
|
129
|
+
if (index < 0 || setters.size() <= index) {
|
130
|
+
return null;
|
131
|
+
}
|
132
|
+
return setters.get(index);
|
133
|
+
}
|
134
|
+
|
135
|
+
// for jruby
|
136
|
+
protected DynamicColumnSetter columnOrNull(String columnName)
|
137
|
+
{
|
138
|
+
return columnLookup.get(columnName);
|
139
|
+
}
|
140
|
+
|
141
|
+
public void addRecord()
|
142
|
+
{
|
143
|
+
pageBuilder.addRecord();
|
144
|
+
}
|
145
|
+
|
146
|
+
public void flush()
|
147
|
+
{
|
148
|
+
pageBuilder.flush();
|
149
|
+
}
|
150
|
+
|
151
|
+
public void finish()
|
152
|
+
{
|
153
|
+
pageBuilder.finish();
|
154
|
+
}
|
155
|
+
|
156
|
+
@Override
|
157
|
+
public void close()
|
158
|
+
{
|
159
|
+
pageBuilder.close();
|
160
|
+
}
|
161
|
+
}
|
@@ -0,0 +1,80 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
4
|
+
import org.jruby.RubyNil;
|
5
|
+
import org.jruby.RubyBoolean;
|
6
|
+
import org.jruby.RubyInteger;
|
7
|
+
import org.jruby.RubyFloat;
|
8
|
+
import org.jruby.RubyString;
|
9
|
+
import org.jruby.RubyTime;
|
10
|
+
import org.jruby.exceptions.RaiseException;
|
11
|
+
import org.embulk.spi.PageBuilder;
|
12
|
+
import org.embulk.spi.Column;
|
13
|
+
import org.embulk.spi.util.DynamicColumnSetter;
|
14
|
+
import org.embulk.spi.time.Timestamp;
|
15
|
+
|
16
|
+
public abstract class AbstractDynamicColumnSetter
|
17
|
+
implements DynamicColumnSetter
|
18
|
+
{
|
19
|
+
protected final PageBuilder pageBuilder;
|
20
|
+
protected final Column column;
|
21
|
+
protected final DefaultValueSetter defaultValue;
|
22
|
+
|
23
|
+
protected AbstractDynamicColumnSetter(PageBuilder pageBuilder, Column column,
|
24
|
+
DefaultValueSetter defaultValue)
|
25
|
+
{
|
26
|
+
this.pageBuilder = pageBuilder;
|
27
|
+
this.column = column;
|
28
|
+
this.defaultValue = defaultValue;
|
29
|
+
}
|
30
|
+
|
31
|
+
public abstract void setNull();
|
32
|
+
|
33
|
+
public abstract void set(boolean value);
|
34
|
+
|
35
|
+
public abstract void set(long value);
|
36
|
+
|
37
|
+
public abstract void set(double value);
|
38
|
+
|
39
|
+
public abstract void set(String value);
|
40
|
+
|
41
|
+
public abstract void set(Timestamp value);
|
42
|
+
|
43
|
+
public IRubyObject setRubyObject(IRubyObject rubyObject)
|
44
|
+
{
|
45
|
+
if (rubyObject instanceof RubyNil) {
|
46
|
+
setNull();
|
47
|
+
} else if (rubyObject instanceof RubyBoolean) {
|
48
|
+
RubyBoolean b = (RubyBoolean) rubyObject;
|
49
|
+
set(b.isTrue());
|
50
|
+
} else if (rubyObject instanceof RubyInteger) {
|
51
|
+
RubyInteger i = (RubyInteger) rubyObject;
|
52
|
+
try {
|
53
|
+
set(i.getLongValue());
|
54
|
+
} catch (RaiseException ex) {
|
55
|
+
if ("RangeError".equals(ex.getException().getMetaClass().getBaseName())) {
|
56
|
+
// integer is too large
|
57
|
+
throw ex; //TODO setDefaultValue();
|
58
|
+
} else {
|
59
|
+
throw ex;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
} else if (rubyObject instanceof RubyFloat) {
|
63
|
+
RubyFloat f = (RubyFloat) rubyObject;
|
64
|
+
set(f.getDoubleValue());
|
65
|
+
} else if (rubyObject instanceof RubyString) {
|
66
|
+
RubyString s = (RubyString) rubyObject;
|
67
|
+
set(s.asJavaString());
|
68
|
+
} else if (rubyObject instanceof RubyTime) {
|
69
|
+
RubyTime time = (RubyTime) rubyObject;
|
70
|
+
long msec = time.getDateTime().getMillis();
|
71
|
+
long nsec = time.getNSec();
|
72
|
+
long sec = msec / 1000 + nsec / 1000000000;
|
73
|
+
int nano = (int) ((msec % 1000) * 1000000 + nsec % 1000000000);
|
74
|
+
set(Timestamp.ofEpochSecond(sec, nano));
|
75
|
+
} else {
|
76
|
+
throw rubyObject.getRuntime().newTypeError("cannot convert instance of " + rubyObject.getMetaClass() + " to nil, true, false, Integer, Float, String, or Time");
|
77
|
+
}
|
78
|
+
return rubyObject.getRuntime().getNil();
|
79
|
+
}
|
80
|
+
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableSet;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.PageBuilder;
|
6
|
+
import org.embulk.spi.time.Timestamp;
|
7
|
+
|
8
|
+
public class BooleanColumnSetter
|
9
|
+
extends AbstractDynamicColumnSetter
|
10
|
+
{
|
11
|
+
private static final ImmutableSet<String> TRUE_STRINGS =
|
12
|
+
ImmutableSet.of(
|
13
|
+
"true", "True", "TRUE",
|
14
|
+
"yes", "Yes", "YES",
|
15
|
+
"t", "T", "y", "Y",
|
16
|
+
"on", "On", "ON",
|
17
|
+
"1");
|
18
|
+
|
19
|
+
public BooleanColumnSetter(PageBuilder pageBuilder, Column column,
|
20
|
+
DefaultValueSetter defaultValue)
|
21
|
+
{
|
22
|
+
super(pageBuilder, column, defaultValue);
|
23
|
+
}
|
24
|
+
|
25
|
+
@Override
|
26
|
+
public void setNull()
|
27
|
+
{
|
28
|
+
pageBuilder.setNull(column);
|
29
|
+
}
|
30
|
+
|
31
|
+
@Override
|
32
|
+
public void set(boolean v)
|
33
|
+
{
|
34
|
+
pageBuilder.setBoolean(column, v);
|
35
|
+
}
|
36
|
+
|
37
|
+
@Override
|
38
|
+
public void set(long v)
|
39
|
+
{
|
40
|
+
pageBuilder.setBoolean(column, v > 0);
|
41
|
+
}
|
42
|
+
|
43
|
+
@Override
|
44
|
+
public void set(double v)
|
45
|
+
{
|
46
|
+
pageBuilder.setBoolean(column, v > 0.0);
|
47
|
+
}
|
48
|
+
|
49
|
+
@Override
|
50
|
+
public void set(String v)
|
51
|
+
{
|
52
|
+
if (TRUE_STRINGS.contains(v)) {
|
53
|
+
pageBuilder.setBoolean(column, true);
|
54
|
+
} else {
|
55
|
+
defaultValue.setDouble(pageBuilder, column);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
@Override
|
60
|
+
public void set(Timestamp v)
|
61
|
+
{
|
62
|
+
defaultValue.setBoolean(pageBuilder, column);
|
63
|
+
}
|
64
|
+
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.embulk.spi.PageBuilder;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
|
7
|
+
public interface DefaultValueSetter
|
8
|
+
{
|
9
|
+
public void setBoolean(PageBuilder pageBuilder, Column c);
|
10
|
+
|
11
|
+
public void setLong(PageBuilder pageBuilder, Column c);
|
12
|
+
|
13
|
+
public void setDouble(PageBuilder pageBuilder, Column c);
|
14
|
+
|
15
|
+
public void setString(PageBuilder pageBuilder, Column c);
|
16
|
+
|
17
|
+
public void setTimestamp(PageBuilder pageBuilder, Column c);
|
18
|
+
}
|
@@ -0,0 +1,61 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import java.math.RoundingMode;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.PageBuilder;
|
6
|
+
import org.embulk.spi.time.Timestamp;
|
7
|
+
|
8
|
+
public class DoubleColumnSetter
|
9
|
+
extends AbstractDynamicColumnSetter
|
10
|
+
{
|
11
|
+
public DoubleColumnSetter(PageBuilder pageBuilder, Column column,
|
12
|
+
DefaultValueSetter defaultValue)
|
13
|
+
{
|
14
|
+
super(pageBuilder, column, defaultValue);
|
15
|
+
}
|
16
|
+
|
17
|
+
@Override
|
18
|
+
public void setNull()
|
19
|
+
{
|
20
|
+
pageBuilder.setNull(column);
|
21
|
+
}
|
22
|
+
|
23
|
+
@Override
|
24
|
+
public void set(boolean v)
|
25
|
+
{
|
26
|
+
pageBuilder.setDouble(column, v ? 1.0 : 0.0);
|
27
|
+
}
|
28
|
+
|
29
|
+
@Override
|
30
|
+
public void set(long v)
|
31
|
+
{
|
32
|
+
pageBuilder.setDouble(column, (double) v);
|
33
|
+
}
|
34
|
+
|
35
|
+
@Override
|
36
|
+
public void set(double v)
|
37
|
+
{
|
38
|
+
pageBuilder.setDouble(column, v);
|
39
|
+
}
|
40
|
+
|
41
|
+
@Override
|
42
|
+
public void set(String v)
|
43
|
+
{
|
44
|
+
double dv;
|
45
|
+
try {
|
46
|
+
dv = Double.parseDouble(v);
|
47
|
+
} catch (NumberFormatException e) {
|
48
|
+
defaultValue.setDouble(pageBuilder, column);
|
49
|
+
return;
|
50
|
+
}
|
51
|
+
pageBuilder.setDouble(column, dv);
|
52
|
+
}
|
53
|
+
|
54
|
+
@Override
|
55
|
+
public void set(Timestamp v)
|
56
|
+
{
|
57
|
+
double sec = (double) v.getEpochSecond();
|
58
|
+
double frac = v.getNano() / 1000000000.0;
|
59
|
+
pageBuilder.setDouble(column, sec + frac);
|
60
|
+
}
|
61
|
+
}
|
@@ -0,0 +1,69 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import java.math.RoundingMode;
|
4
|
+
import com.google.common.math.DoubleMath;
|
5
|
+
import org.embulk.spi.Column;
|
6
|
+
import org.embulk.spi.PageBuilder;
|
7
|
+
import org.embulk.spi.time.Timestamp;
|
8
|
+
|
9
|
+
public class LongColumnSetter
|
10
|
+
extends AbstractDynamicColumnSetter
|
11
|
+
{
|
12
|
+
public LongColumnSetter(PageBuilder pageBuilder, Column column,
|
13
|
+
DefaultValueSetter defaultValue)
|
14
|
+
{
|
15
|
+
super(pageBuilder, column, defaultValue);
|
16
|
+
}
|
17
|
+
|
18
|
+
@Override
|
19
|
+
public void setNull()
|
20
|
+
{
|
21
|
+
pageBuilder.setNull(column);
|
22
|
+
}
|
23
|
+
|
24
|
+
@Override
|
25
|
+
public void set(boolean v)
|
26
|
+
{
|
27
|
+
pageBuilder.setLong(column, v ? 1L : 0L);
|
28
|
+
}
|
29
|
+
|
30
|
+
@Override
|
31
|
+
public void set(long v)
|
32
|
+
{
|
33
|
+
pageBuilder.setLong(column, v);
|
34
|
+
}
|
35
|
+
|
36
|
+
@Override
|
37
|
+
public void set(double v)
|
38
|
+
{
|
39
|
+
long lv;
|
40
|
+
try {
|
41
|
+
// TODO configurable rounding mode
|
42
|
+
lv = DoubleMath.roundToLong(v, RoundingMode.HALF_UP);
|
43
|
+
} catch (ArithmeticException ex) {
|
44
|
+
// NaN / Infinite / -Infinite
|
45
|
+
defaultValue.setLong(pageBuilder, column);
|
46
|
+
return;
|
47
|
+
}
|
48
|
+
pageBuilder.setLong(column, lv);
|
49
|
+
}
|
50
|
+
|
51
|
+
@Override
|
52
|
+
public void set(String v)
|
53
|
+
{
|
54
|
+
long lv;
|
55
|
+
try {
|
56
|
+
lv = Long.parseLong(v);
|
57
|
+
} catch (NumberFormatException e) {
|
58
|
+
defaultValue.setLong(pageBuilder, column);
|
59
|
+
return;
|
60
|
+
}
|
61
|
+
pageBuilder.setLong(column, lv);
|
62
|
+
}
|
63
|
+
|
64
|
+
@Override
|
65
|
+
public void set(Timestamp v)
|
66
|
+
{
|
67
|
+
pageBuilder.setDouble(column, v.getEpochSecond());
|
68
|
+
}
|
69
|
+
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.embulk.spi.PageBuilder;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
|
7
|
+
public class NullDefaultValueSetter
|
8
|
+
implements DefaultValueSetter
|
9
|
+
{
|
10
|
+
public void setBoolean(PageBuilder pageBuilder, Column c)
|
11
|
+
{
|
12
|
+
pageBuilder.setNull(c);
|
13
|
+
}
|
14
|
+
|
15
|
+
public void setLong(PageBuilder pageBuilder, Column c)
|
16
|
+
{
|
17
|
+
pageBuilder.setNull(c);
|
18
|
+
}
|
19
|
+
|
20
|
+
public void setDouble(PageBuilder pageBuilder, Column c)
|
21
|
+
{
|
22
|
+
pageBuilder.setNull(c);
|
23
|
+
}
|
24
|
+
|
25
|
+
public void setString(PageBuilder pageBuilder, Column c)
|
26
|
+
{
|
27
|
+
pageBuilder.setNull(c);
|
28
|
+
}
|
29
|
+
|
30
|
+
public void setTimestamp(PageBuilder pageBuilder, Column c)
|
31
|
+
{
|
32
|
+
pageBuilder.setNull(c);
|
33
|
+
}
|
34
|
+
}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
4
|
+
import org.embulk.spi.PageBuilder;
|
5
|
+
import org.embulk.spi.Column;
|
6
|
+
import org.embulk.spi.time.Timestamp;
|
7
|
+
import org.embulk.spi.time.TimestampParser;
|
8
|
+
import org.embulk.spi.time.TimestampParseException;
|
9
|
+
|
10
|
+
public class SkipColumnSetter
|
11
|
+
extends AbstractDynamicColumnSetter
|
12
|
+
{
|
13
|
+
private static final SkipColumnSetter instance = new SkipColumnSetter();
|
14
|
+
|
15
|
+
public static SkipColumnSetter get()
|
16
|
+
{
|
17
|
+
return instance;
|
18
|
+
}
|
19
|
+
|
20
|
+
private SkipColumnSetter()
|
21
|
+
{
|
22
|
+
super(null, null, null);
|
23
|
+
}
|
24
|
+
|
25
|
+
@Override
|
26
|
+
public void setNull()
|
27
|
+
{ }
|
28
|
+
|
29
|
+
@Override
|
30
|
+
public void set(boolean v)
|
31
|
+
{ }
|
32
|
+
|
33
|
+
@Override
|
34
|
+
public void set(long v)
|
35
|
+
{ }
|
36
|
+
|
37
|
+
@Override
|
38
|
+
public void set(double v)
|
39
|
+
{ }
|
40
|
+
|
41
|
+
@Override
|
42
|
+
public void set(String v)
|
43
|
+
{ }
|
44
|
+
|
45
|
+
@Override
|
46
|
+
public void set(Timestamp v)
|
47
|
+
{ }
|
48
|
+
|
49
|
+
@Override
|
50
|
+
public IRubyObject setRubyObject(IRubyObject rubyObject)
|
51
|
+
{
|
52
|
+
return rubyObject.getRuntime().getNil();
|
53
|
+
}
|
54
|
+
}
|
@@ -0,0 +1,56 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.embulk.spi.PageBuilder;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampFormatter;
|
7
|
+
|
8
|
+
public class StringColumnSetter
|
9
|
+
extends AbstractDynamicColumnSetter
|
10
|
+
{
|
11
|
+
private final TimestampFormatter timestampFormatter;
|
12
|
+
|
13
|
+
public StringColumnSetter(PageBuilder pageBuilder, Column column,
|
14
|
+
DefaultValueSetter defaultValue,
|
15
|
+
TimestampFormatter timestampFormatter)
|
16
|
+
{
|
17
|
+
super(pageBuilder, column, defaultValue);
|
18
|
+
this.timestampFormatter = timestampFormatter;
|
19
|
+
}
|
20
|
+
|
21
|
+
@Override
|
22
|
+
public void setNull()
|
23
|
+
{
|
24
|
+
pageBuilder.setNull(column);
|
25
|
+
}
|
26
|
+
|
27
|
+
@Override
|
28
|
+
public void set(boolean v)
|
29
|
+
{
|
30
|
+
pageBuilder.setString(column, Boolean.toString(v));
|
31
|
+
}
|
32
|
+
|
33
|
+
@Override
|
34
|
+
public void set(long v)
|
35
|
+
{
|
36
|
+
pageBuilder.setString(column, Long.toString(v));
|
37
|
+
}
|
38
|
+
|
39
|
+
@Override
|
40
|
+
public void set(double v)
|
41
|
+
{
|
42
|
+
pageBuilder.setString(column, Double.toString(v));
|
43
|
+
}
|
44
|
+
|
45
|
+
@Override
|
46
|
+
public void set(String v)
|
47
|
+
{
|
48
|
+
pageBuilder.setString(column, v);
|
49
|
+
}
|
50
|
+
|
51
|
+
@Override
|
52
|
+
public void set(Timestamp v)
|
53
|
+
{
|
54
|
+
pageBuilder.setString(column, timestampFormatter.format(v));
|
55
|
+
}
|
56
|
+
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
package org.embulk.spi.util.dynamic;
|
2
|
+
|
3
|
+
import org.embulk.spi.PageBuilder;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampParser;
|
7
|
+
import org.embulk.spi.time.TimestampParseException;
|
8
|
+
|
9
|
+
public class TimestampColumnSetter
|
10
|
+
extends AbstractDynamicColumnSetter
|
11
|
+
{
|
12
|
+
private final TimestampParser timestampParser;
|
13
|
+
|
14
|
+
public TimestampColumnSetter(PageBuilder pageBuilder, Column column,
|
15
|
+
DefaultValueSetter defaultValue,
|
16
|
+
TimestampParser timestampParser)
|
17
|
+
{
|
18
|
+
super(pageBuilder, column, defaultValue);
|
19
|
+
this.timestampParser = timestampParser;
|
20
|
+
}
|
21
|
+
|
22
|
+
@Override
|
23
|
+
public void setNull()
|
24
|
+
{
|
25
|
+
pageBuilder.setNull(column);
|
26
|
+
}
|
27
|
+
|
28
|
+
@Override
|
29
|
+
public void set(boolean v)
|
30
|
+
{
|
31
|
+
defaultValue.setTimestamp(pageBuilder, column);
|
32
|
+
}
|
33
|
+
|
34
|
+
@Override
|
35
|
+
public void set(long v)
|
36
|
+
{
|
37
|
+
pageBuilder.setTimestamp(column, Timestamp.ofEpochSecond(v));
|
38
|
+
}
|
39
|
+
|
40
|
+
@Override
|
41
|
+
public void set(double v)
|
42
|
+
{
|
43
|
+
long sec = (long) v;
|
44
|
+
int nsec = (int) ((v - (double) sec) * 1000000000);
|
45
|
+
pageBuilder.setTimestamp(column, Timestamp.ofEpochSecond(sec, nsec));
|
46
|
+
defaultValue.setTimestamp(pageBuilder, column);
|
47
|
+
}
|
48
|
+
|
49
|
+
@Override
|
50
|
+
public void set(String v)
|
51
|
+
{
|
52
|
+
try {
|
53
|
+
pageBuilder.setTimestamp(column, timestampParser.parse(v));
|
54
|
+
} catch (TimestampParseException e) {
|
55
|
+
defaultValue.setTimestamp(pageBuilder, column);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
@Override
|
60
|
+
public void set(Timestamp v)
|
61
|
+
{
|
62
|
+
pageBuilder.setTimestamp(column, v);
|
63
|
+
}
|
64
|
+
}
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,23 @@
|
|
1
|
+
Release 0.6.13
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Plugin API
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added spi.util.DynamicPageBuilder utility class. This is a wrapper of PageBuilder which converts types automatically.
|
8
|
+
* ``page_builder`` used by Ruby plugins is now backed by DynamicPageBuilder. This means that Ruby plugins don't need type conversion code such as to_i or to_s any more.
|
9
|
+
|
10
|
+
Built-in plugins
|
11
|
+
------------------
|
12
|
+
|
13
|
+
* ``guess-csv`` plugin guesses ``trim_if_not_quoted`` option. This works well if a CSV file includes integers with extra spaces (such as CSV files exported by Excel).
|
14
|
+
* Fixed a problem where ``guess-csv`` plugin guessed type of a column as "string" if it includes non-empty null_string.
|
15
|
+
|
16
|
+
General Changes
|
17
|
+
------------------
|
18
|
+
|
19
|
+
* "-b" option installs bundler automatically if it is not installed rather than showing "no such file to load -- bundler" error message.
|
20
|
+
|
21
|
+
Release Date
|
22
|
+
------------------
|
23
|
+
2015-06-25
|
@@ -80,14 +80,20 @@ public class CsvTokenizer
|
|
80
80
|
return input.nextFile();
|
81
81
|
}
|
82
82
|
|
83
|
+
// used by guess-csv
|
83
84
|
public boolean nextRecord()
|
85
|
+
{
|
86
|
+
return nextRecord(true);
|
87
|
+
}
|
88
|
+
|
89
|
+
public boolean nextRecord(boolean skipEmptyLine)
|
84
90
|
{
|
85
91
|
// If at the end of record, read the next line and initialize the state
|
86
92
|
if (recordState != RecordState.END) {
|
87
93
|
throw new TooManyColumnsException("Too many columns");
|
88
94
|
}
|
89
95
|
|
90
|
-
boolean hasNext = nextLine(
|
96
|
+
boolean hasNext = nextLine(skipEmptyLine);
|
91
97
|
if (hasNext) {
|
92
98
|
recordState = RecordState.NOT_END;
|
93
99
|
return true;
|
@@ -18,7 +18,11 @@ if bundle_path
|
|
18
18
|
Gem.clear_paths # force rubygems to reload GEM_HOME
|
19
19
|
|
20
20
|
ENV['BUNDLE_GEMFILE'] = File.expand_path File.join(bundle_path, "Gemfile")
|
21
|
-
|
21
|
+
begin
|
22
|
+
require 'bundler'
|
23
|
+
rescue LoadError => e
|
24
|
+
raise "#{e}\nBundler is not installed. Did you run \`$ embulk bundle #{bundle_path}\` ?"
|
25
|
+
end
|
22
26
|
Bundler.load.setup_environment
|
23
27
|
require 'bundler/setup'
|
24
28
|
# since here, `require` may load files of different (newer) embulk versions
|
@@ -261,7 +261,19 @@ examples:
|
|
261
261
|
|
262
262
|
ENV['BUNDLE_GEMFILE'] = File.expand_path File.join(path, "Gemfile")
|
263
263
|
Dir.chdir(path) do
|
264
|
-
|
264
|
+
begin
|
265
|
+
require 'bundler'
|
266
|
+
rescue LoadError
|
267
|
+
require 'rubygems/gem_runner'
|
268
|
+
begin
|
269
|
+
puts "#{Time.now.strftime("%Y-%m-%d %H:%M:%S.%3N %z")}: installing bundler..."
|
270
|
+
Gem::GemRunner.new.run %w[install bundler]
|
271
|
+
rescue Gem::SystemExitException => e
|
272
|
+
raise e if e.exit_code != 0
|
273
|
+
end
|
274
|
+
Gem.clear_paths
|
275
|
+
require 'bundler'
|
276
|
+
end
|
265
277
|
require 'bundler/friendly_errors'
|
266
278
|
require 'bundler/cli'
|
267
279
|
Bundler.with_friendly_errors do
|
data/lib/embulk/guess/csv.rb
CHANGED
@@ -60,12 +60,24 @@ module Embulk
|
|
60
60
|
# don't even set null_string to avoid confusion of null and 'null' in YAML format
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
|
63
|
+
# guessing skip_header_lines should be before guessing guess_comment_line_marker
|
64
|
+
# because lines supplied to CsvTokenizer already don't include skipped header lines.
|
65
|
+
# skipping empty lines is also disabled here because skipping header lines is done by
|
66
|
+
# CsvParser which doesn't skip empty lines automatically
|
67
|
+
sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
|
65
68
|
skip_header_lines = guess_skip_header_lines(sample_records)
|
69
|
+
sample_lines = sample_lines[skip_header_lines..-1]
|
66
70
|
sample_records = sample_records[skip_header_lines..-1]
|
67
71
|
|
68
|
-
|
72
|
+
unless parser_guessed.has_key?("comment_line_marker")
|
73
|
+
comment_line_marker, sample_lines =
|
74
|
+
guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
|
75
|
+
if comment_line_marker
|
76
|
+
parser_guessed["comment_line_marker"] = comment_line_marker
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})
|
69
81
|
|
70
82
|
first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
|
71
83
|
other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])
|
@@ -75,6 +87,17 @@ module Embulk
|
|
75
87
|
return {}
|
76
88
|
end
|
77
89
|
|
90
|
+
unless parser_guessed.has_key?("trim_if_not_quoted")
|
91
|
+
sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
|
92
|
+
other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
|
93
|
+
if other_types != other_types_trimmed
|
94
|
+
parser_guessed["trim_if_not_quoted"] = true
|
95
|
+
other_types = other_types_trimmed
|
96
|
+
else
|
97
|
+
parser_guessed["trim_if_not_quoted"] = false
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
78
101
|
header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) })
|
79
102
|
|
80
103
|
if header_line
|
@@ -83,10 +106,8 @@ module Embulk
|
|
83
106
|
parser_guessed["skip_header_lines"] = skip_header_lines
|
84
107
|
end
|
85
108
|
|
86
|
-
parser_guessed["
|
87
|
-
|
88
|
-
parser_guessed["allow_extra_columns"] = false
|
89
|
-
parser_guessed["allow_optional_columns"] = false
|
109
|
+
parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
|
110
|
+
parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")
|
90
111
|
|
91
112
|
if header_line
|
92
113
|
column_names = sample_records.first
|
@@ -110,22 +131,26 @@ module Embulk
|
|
110
131
|
|
111
132
|
private
|
112
133
|
|
113
|
-
def split_lines(parser_config, sample_lines, delim)
|
114
|
-
|
115
|
-
|
134
|
+
def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
|
135
|
+
null_string = parser_config["null_string"]
|
136
|
+
config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
|
137
|
+
parser_task = config.load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
|
138
|
+
data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
|
116
139
|
sample = Buffer.from_ruby_string(data)
|
117
140
|
decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
|
118
141
|
tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
|
119
142
|
rows = []
|
120
143
|
while tokenizer.nextFile
|
121
|
-
while tokenizer.nextRecord
|
144
|
+
while tokenizer.nextRecord(skip_empty_lines)
|
122
145
|
begin
|
123
146
|
columns = []
|
124
147
|
while true
|
125
148
|
begin
|
126
149
|
column = tokenizer.nextColumn
|
127
150
|
quoted = tokenizer.wasQuotedColumn
|
128
|
-
|
151
|
+
if null_string && !quoted && column == null_string
|
152
|
+
column = nil
|
153
|
+
end
|
129
154
|
columns << column
|
130
155
|
rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
|
131
156
|
rows << columns
|
@@ -228,20 +253,25 @@ module Embulk
|
|
228
253
|
return 0
|
229
254
|
end
|
230
255
|
|
231
|
-
def guess_comment_line_marker(
|
256
|
+
def guess_comment_line_marker(sample_lines, delim, quote, null_string)
|
257
|
+
exclude = []
|
258
|
+
exclude << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
|
259
|
+
exclude << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string
|
260
|
+
|
232
261
|
guessed = COMMENT_LINE_MARKER_CANDIDATES.map do |str|
|
233
262
|
regexp = /^#{Regexp.quote(str)}/
|
234
|
-
|
235
|
-
|
263
|
+
unmatch_lines = sample_lines.reject do |line|
|
264
|
+
exclude.all? {|ex| line !~ ex } && line =~ regexp
|
236
265
|
end
|
237
|
-
|
238
|
-
[str,
|
239
|
-
end.select {|str,
|
240
|
-
|
241
|
-
|
242
|
-
|
266
|
+
match_count = sample_lines.size - unmatch_lines.size
|
267
|
+
[str, match_count, unmatch_lines]
|
268
|
+
end.select {|str,match_count,unmatch_lines| match_count > 0 }.sort_by {|str,match_count,unmatch_lines| -match_count }
|
269
|
+
|
270
|
+
str, match_count, unmatch_lines = guessed.first
|
271
|
+
if str
|
272
|
+
return str, unmatch_lines
|
243
273
|
else
|
244
|
-
return nil,
|
274
|
+
return nil, sample_lines
|
245
275
|
end
|
246
276
|
end
|
247
277
|
|
data/lib/embulk/java/imports.rb
CHANGED
@@ -21,6 +21,7 @@ module Embulk::Java
|
|
21
21
|
java_import 'org.embulk.spi.TransactionalPageOutput'
|
22
22
|
java_import 'org.embulk.spi.PageReader'
|
23
23
|
java_import 'org.embulk.spi.PageBuilder'
|
24
|
+
java_import 'org.embulk.spi.util.DynamicPageBuilder'
|
24
25
|
java_import 'org.embulk.spi.util.LineDecoder'
|
25
26
|
java_import 'org.embulk.spi.util.ListFileInput'
|
26
27
|
java_import 'org.embulk.spi.Schema'
|
@@ -30,6 +31,7 @@ module Embulk::Java
|
|
30
31
|
java_import 'org.embulk.spi.PluginClassLoader'
|
31
32
|
java_import 'org.embulk.spi.FileInputRunner'
|
32
33
|
java_import 'org.embulk.spi.FileOutputRunner'
|
34
|
+
java_import 'org.embulk.spi.Exec'
|
33
35
|
|
34
36
|
# TODO
|
35
37
|
end
|
data/lib/embulk/page_builder.rb
CHANGED
@@ -1,13 +1,67 @@
|
|
1
1
|
module Embulk
|
2
2
|
|
3
|
+
org.embulk.spi.util.dynamic.AbstractDynamicColumnSetter.module_eval do
|
4
|
+
alias_method(:set, :setRubyObject)
|
5
|
+
end
|
6
|
+
|
3
7
|
class PageBuilder
|
4
8
|
def initialize(schema, java_page_output)
|
5
|
-
|
9
|
+
# TODO get task as an argument
|
10
|
+
task = Java::Exec.newConfigSource.load_config(Java::DynamicPageBuilder::BuilderTask.java_class)
|
11
|
+
@page_builder = Java::DynamicPageBuilder.new(task, Java::Injected::BufferAllocator, schema.to_java, java_page_output)
|
6
12
|
@schema = schema
|
7
13
|
end
|
8
14
|
|
9
15
|
def add(record)
|
10
|
-
|
16
|
+
i = 0
|
17
|
+
m = record.size
|
18
|
+
while i < m
|
19
|
+
@page_builder.column(i).set(record[i])
|
20
|
+
i += 1
|
21
|
+
end
|
22
|
+
@page_builder.addRecord
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
|
26
|
+
def [](index_or_column)
|
27
|
+
case index_or_column
|
28
|
+
when Integer
|
29
|
+
@page_builder.column_or_null(index_or_column)
|
30
|
+
when Column
|
31
|
+
@page_builder.column_or_null(index_or_column.index)
|
32
|
+
else
|
33
|
+
@page_builder.column_or_null(index_or_column)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def column(index_or_column)
|
38
|
+
case index_or_column
|
39
|
+
when Integer
|
40
|
+
@page_builder.column(index_or_column)
|
41
|
+
when Column
|
42
|
+
@page_builder.column(index_or_column.index)
|
43
|
+
else
|
44
|
+
@page_builder.lookupColumn(index_or_column)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def column_or_skip(index_or_column)
|
49
|
+
case index_or_column
|
50
|
+
when Integer
|
51
|
+
@page_builder.column_or_skip(index_or_column)
|
52
|
+
when Column
|
53
|
+
@page_builder.column_or_skip(index_or_column.index)
|
54
|
+
else
|
55
|
+
@page_builder.column_or_skip(index_or_column)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def add!
|
60
|
+
@page_builder.add_record
|
61
|
+
end
|
62
|
+
|
63
|
+
def flush
|
64
|
+
@page_builder.flush
|
11
65
|
end
|
12
66
|
|
13
67
|
def finish
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -216,6 +216,10 @@ files:
|
|
216
216
|
- embulk-core/src/main/java/org/embulk/spi/unit/ByteSize.java
|
217
217
|
- embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java
|
218
218
|
- embulk-core/src/main/java/org/embulk/spi/util/Decoders.java
|
219
|
+
- embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnNotFoundException.java
|
220
|
+
- embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetter.java
|
221
|
+
- embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java
|
222
|
+
- embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java
|
219
223
|
- embulk-core/src/main/java/org/embulk/spi/util/Encoders.java
|
220
224
|
- embulk-core/src/main/java/org/embulk/spi/util/Executors.java
|
221
225
|
- embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java
|
@@ -232,6 +236,15 @@ files:
|
|
232
236
|
- embulk-core/src/main/java/org/embulk/spi/util/Pages.java
|
233
237
|
- embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java
|
234
238
|
- embulk-core/src/main/java/org/embulk/spi/util/RetryExecutor.java
|
239
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java
|
240
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/BooleanColumnSetter.java
|
241
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/DefaultValueSetter.java
|
242
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/DoubleColumnSetter.java
|
243
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/LongColumnSetter.java
|
244
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/NullDefaultValueSetter.java
|
245
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java
|
246
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/StringColumnSetter.java
|
247
|
+
- embulk-core/src/main/java/org/embulk/spi/util/dynamic/TimestampColumnSetter.java
|
235
248
|
- embulk-core/src/main/resources/embulk/logback-color.xml
|
236
249
|
- embulk-core/src/main/resources/embulk/logback-console.xml
|
237
250
|
- embulk-core/src/main/resources/embulk/logback-file.xml
|
@@ -301,6 +314,7 @@ files:
|
|
301
314
|
- embulk-docs/src/release/release-0.6.10.rst
|
302
315
|
- embulk-docs/src/release/release-0.6.11.rst
|
303
316
|
- embulk-docs/src/release/release-0.6.12.rst
|
317
|
+
- embulk-docs/src/release/release-0.6.13.rst
|
304
318
|
- embulk-docs/src/release/release-0.6.2.rst
|
305
319
|
- embulk-docs/src/release/release-0.6.3.rst
|
306
320
|
- embulk-docs/src/release/release-0.6.4.rst
|
@@ -417,8 +431,8 @@ files:
|
|
417
431
|
- classpath/bval-jsr303-0.5.jar
|
418
432
|
- classpath/commons-beanutils-core-1.8.3.jar
|
419
433
|
- classpath/commons-lang3-3.1.jar
|
420
|
-
- classpath/embulk-core-0.6.
|
421
|
-
- classpath/embulk-standards-0.6.
|
434
|
+
- classpath/embulk-core-0.6.13.jar
|
435
|
+
- classpath/embulk-standards-0.6.13.jar
|
422
436
|
- classpath/guava-18.0.jar
|
423
437
|
- classpath/guice-4.0.jar
|
424
438
|
- classpath/guice-multibindings-4.0.jar
|