embulk-filter-timestamp_format 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/CHANGELOG.md +7 -0
  4. data/README.md +46 -1
  5. data/bench/config_java.yml +14 -0
  6. data/bench/config_jruby.yml +14 -0
  7. data/bench/gen_dummy.rb +5 -0
  8. data/build.gradle +1 -1
  9. data/example/double.csv +2 -0
  10. data/example/double.yml +20 -0
  11. data/example/{json_example.jsonl → example.jsonl} +0 -0
  12. data/example/example.yml +4 -12
  13. data/example/example2.csv +2 -0
  14. data/example/example2.yml +14 -0
  15. data/example/json_double.jsonl +1 -0
  16. data/example/json_double.yml +14 -0
  17. data/example/json_long.jsonl +1 -0
  18. data/example/json_long.yml +14 -0
  19. data/example/json_string.jsonl +2 -0
  20. data/example/json_string.yml +14 -0
  21. data/example/long.csv +1 -0
  22. data/example/long.yml +20 -0
  23. data/example/string.csv +4 -0
  24. data/example/{string_example.yml → string.yml} +6 -5
  25. data/example/string_java.yml +23 -0
  26. data/example/timestamp.csv +2 -0
  27. data/example/{timestamp_example.yml → timestamp.yml} +4 -4
  28. data/src/main/java/org/embulk/filter/timestamp_format/ColumnCaster.java +107 -14
  29. data/src/main/java/org/embulk/filter/timestamp_format/ColumnVisitorImpl.java +104 -33
  30. data/src/main/java/org/embulk/filter/timestamp_format/JsonCaster.java +61 -4
  31. data/src/main/java/org/embulk/filter/timestamp_format/JsonVisitor.java +8 -0
  32. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatFilterPlugin.java +28 -17
  33. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java +36 -5
  34. data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +57 -26
  35. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnit.java +112 -0
  36. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnitDeserializer.java +54 -0
  37. data/src/main/java/org/embulk/filter/timestamp_format/cast/DoubleCast.java +32 -0
  38. data/src/main/java/org/embulk/filter/timestamp_format/cast/LongCast.java +32 -0
  39. data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +20 -4
  40. data/src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java +5 -6
  41. data/src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java +192 -0
  42. metadata +29 -8
  43. data/example/json_example.yml +0 -14
  44. data/src/test/java/org/embulk/filter/TestTimestampFormatFilterPlugin.java +0 -5
@@ -4,6 +4,7 @@ import org.embulk.spi.DataException;
4
4
  import org.embulk.spi.PageReader;
5
5
  import org.embulk.spi.Schema;
6
6
 
7
+ import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
7
8
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
8
9
 
9
10
  import org.embulk.spi.Column;
@@ -13,6 +14,7 @@ import org.embulk.spi.PageBuilder;
13
14
  import org.slf4j.Logger;
14
15
 
15
16
  import java.util.HashMap;
17
+ import java.util.HashSet;
16
18
 
17
19
  public class ColumnVisitorImpl
18
20
  implements ColumnVisitor
@@ -23,6 +25,7 @@ public class ColumnVisitorImpl
23
25
  private final Schema outputSchema;
24
26
  private final PageReader pageReader;
25
27
  private final PageBuilder pageBuilder;
28
+ private final HashSet<String> shouldCastSet = new HashSet<>();
26
29
  private final HashMap<String, Column> outputColumnMap = new HashMap<>();
27
30
  private final ColumnCaster columnCaster;
28
31
 
@@ -35,10 +38,30 @@ public class ColumnVisitorImpl
35
38
  this.pageReader = pageReader;
36
39
  this.pageBuilder = pageBuilder;
37
40
 
41
+ buildShouldCastSet();
38
42
  buildOutputColumnMap();
39
43
  this.columnCaster = new ColumnCaster(task, inputSchema, outputSchema, pageReader, pageBuilder);
40
44
  }
41
45
 
46
+ private void buildShouldCastSet()
47
+ {
48
+ // columnName => Boolean to avoid unnecessary cast
49
+ for (ColumnConfig columnConfig : task.getColumns()) {
50
+ String name = columnConfig.getName();
51
+ if (name.startsWith("$.")) {
52
+ String firstName = name.split("\\.", 3)[1]; // check only top level column name
53
+ shouldCastSet.add(firstName);
54
+ continue;
55
+ }
56
+ shouldCastSet.add(name);
57
+ }
58
+ }
59
+
60
+ private boolean shouldCast(String name)
61
+ {
62
+ return shouldCastSet.contains(name);
63
+ }
64
+
42
65
  private void buildOutputColumnMap()
43
66
  {
44
67
  // columnName => outputColumn
@@ -88,67 +111,115 @@ public class ColumnVisitorImpl
88
111
  @Override
89
112
  public void longColumn(final Column inputColumn)
90
113
  {
91
- if (pageReader.isNull(inputColumn)) {
92
- pageBuilder.setNull(inputColumn);
114
+ String name = inputColumn.getName();
115
+ if (! shouldCast(name)){
116
+ if (pageReader.isNull(inputColumn)) {
117
+ pageBuilder.setNull(inputColumn);
118
+ }
119
+ else {
120
+ pageBuilder.setLong(inputColumn, pageReader.getLong(inputColumn));
121
+ }
93
122
  }
94
123
  else {
95
- pageBuilder.setLong(inputColumn, pageReader.getLong(inputColumn));
124
+ final Column outputColumn = outputColumnMap.get(name);
125
+ PageBuildable op = new PageBuildable() {
126
+ public void run() throws DataException {
127
+ columnCaster.setFromLong(outputColumn, pageReader.getLong(inputColumn));
128
+ }
129
+ };
130
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
96
131
  }
97
132
  }
98
133
 
99
134
  @Override
100
135
  public void doubleColumn(final Column inputColumn)
101
136
  {
102
- if (pageReader.isNull(inputColumn)) {
103
- pageBuilder.setNull(inputColumn);
137
+ String name = inputColumn.getName();
138
+ if (! shouldCast(name)){
139
+ if (pageReader.isNull(inputColumn)) {
140
+ pageBuilder.setNull(inputColumn);
141
+ }
142
+ else {
143
+ pageBuilder.setDouble(inputColumn, pageReader.getDouble(inputColumn));
144
+ }
104
145
  }
105
146
  else {
106
- pageBuilder.setDouble(inputColumn, pageReader.getDouble(inputColumn));
147
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
148
+ PageBuildable op = new PageBuildable() {
149
+ public void run() throws DataException {
150
+ columnCaster.setFromDouble(outputColumn, pageReader.getDouble(inputColumn));
151
+ }
152
+ };
153
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
107
154
  }
108
155
  }
109
156
 
110
157
  @Override
111
158
  public void stringColumn(final Column inputColumn)
112
159
  {
113
- if (pageReader.isNull(inputColumn)) {
114
- pageBuilder.setNull(inputColumn);
115
- return;
116
- }
117
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
118
- PageBuildable op = new PageBuildable() {
119
- public void run() throws DataException
120
- {
121
- columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn));
160
+ String name = inputColumn.getName();
161
+ if (! shouldCast(name)){
162
+ if (pageReader.isNull(inputColumn)) {
163
+ pageBuilder.setNull(inputColumn);
122
164
  }
123
- };
124
- withStopOnInvalidRecord(op, inputColumn, outputColumn);
165
+ else {
166
+ pageBuilder.setString(inputColumn, pageReader.getString(inputColumn));
167
+ }
168
+ }
169
+ else {
170
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
171
+ PageBuildable op = new PageBuildable() {
172
+ public void run() throws DataException {
173
+ columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn));
174
+ }
175
+ };
176
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
177
+ }
125
178
  }
126
179
 
127
180
  @Override
128
181
  public void timestampColumn(final Column inputColumn)
129
182
  {
130
- if (pageReader.isNull(inputColumn)) {
131
- pageBuilder.setNull(inputColumn);
132
- return;
133
- }
134
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
135
- PageBuildable op = new PageBuildable() {
136
- public void run() throws DataException
137
- {
138
- columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
183
+ String name = inputColumn.getName();
184
+ if (! shouldCast(name)){
185
+ if (pageReader.isNull(inputColumn)) {
186
+ pageBuilder.setNull(inputColumn);
139
187
  }
140
- };
141
- withStopOnInvalidRecord(op, inputColumn, outputColumn);
188
+ else {
189
+ pageBuilder.setTimestamp(inputColumn, pageReader.getTimestamp(inputColumn));
190
+ }
191
+ }
192
+ else {
193
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
194
+ PageBuildable op = new PageBuildable() {
195
+ public void run() throws DataException {
196
+ columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
197
+ }
198
+ };
199
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
200
+ }
142
201
  }
143
202
 
144
203
  @Override
145
204
  public void jsonColumn(final Column inputColumn)
146
205
  {
147
- if (pageReader.isNull(inputColumn)) {
148
- pageBuilder.setNull(inputColumn);
149
- return;
206
+ String name = inputColumn.getName();
207
+ if (! shouldCast(name)){
208
+ if (pageReader.isNull(inputColumn)) {
209
+ pageBuilder.setNull(inputColumn);
210
+ }
211
+ else {
212
+ pageBuilder.setJson(inputColumn, pageReader.getJson(inputColumn));
213
+ }
214
+ }
215
+ else {
216
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
217
+ PageBuildable op = new PageBuildable() {
218
+ public void run() throws DataException {
219
+ columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
220
+ }
221
+ };
222
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
150
223
  }
151
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
152
- columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
153
224
  }
154
225
  }
@@ -1,5 +1,8 @@
1
1
  package org.embulk.filter.timestamp_format;
2
2
 
3
+ import org.embulk.config.ConfigException;
4
+ import org.embulk.filter.timestamp_format.cast.DoubleCast;
5
+ import org.embulk.filter.timestamp_format.cast.LongCast;
3
6
  import org.embulk.filter.timestamp_format.cast.StringCast;
4
7
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
5
8
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
@@ -8,6 +11,8 @@ import org.embulk.spi.type.DoubleType;
8
11
  import org.embulk.spi.type.LongType;
9
12
  import org.embulk.spi.type.StringType;
10
13
  import org.embulk.spi.type.Type;
14
+ import org.msgpack.value.FloatValue;
15
+ import org.msgpack.value.IntegerValue;
11
16
  import org.msgpack.value.StringValue;
12
17
  import org.msgpack.value.Value;
13
18
  import org.msgpack.value.ValueFactory;
@@ -22,14 +27,64 @@ class JsonCaster
22
27
  private final PluginTask task;
23
28
  private final HashMap<String, TimestampParser> timestampParserMap;
24
29
  private final HashMap<String, TimestampFormatter> timestampFormatterMap;
30
+ private final HashMap<String, TimestampUnit> fromTimestampUnitMap;
31
+ private final HashMap<String, TimestampUnit> toTimestampUnitMap;
25
32
 
26
33
  JsonCaster(PluginTask task,
27
34
  HashMap<String, TimestampParser> timestampParserMap,
28
- HashMap<String, TimestampFormatter> timestampFormatterMap)
35
+ HashMap<String, TimestampFormatter> timestampFormatterMap,
36
+ HashMap<String, TimestampUnit> fromTimestampUnitMap,
37
+ HashMap<String, TimestampUnit> toTimestampUnitMap)
29
38
  {
30
39
  this.task = task;
31
40
  this.timestampParserMap = timestampParserMap;
32
41
  this.timestampFormatterMap = timestampFormatterMap;
42
+ this.fromTimestampUnitMap = fromTimestampUnitMap;
43
+ this.toTimestampUnitMap = toTimestampUnitMap;
44
+ }
45
+
46
+ public Value fromLong(ColumnConfig columnConfig, IntegerValue value)
47
+ {
48
+ Type outputType = columnConfig.getType();
49
+ TimestampUnit fromUnit = fromTimestampUnitMap.get(columnConfig.getName());
50
+ if (outputType instanceof StringType) {
51
+ TimestampFormatter formatter = timestampFormatterMap.get(columnConfig.getName());
52
+ return ValueFactory.newString(LongCast.asString(value.asLong(), fromUnit, formatter));
53
+ }
54
+ else if (outputType instanceof LongType) {
55
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
56
+ return ValueFactory.newInteger(LongCast.asLong(value.asLong(), fromUnit, toUnit));
57
+ }
58
+ else if (outputType instanceof DoubleType) {
59
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
60
+ return ValueFactory.newFloat(LongCast.asDouble(value.asLong(), fromUnit, toUnit));
61
+ }
62
+ else {
63
+ assert false;
64
+ throw new RuntimeException();
65
+ }
66
+ }
67
+
68
+ public Value fromDouble(ColumnConfig columnConfig, FloatValue value)
69
+ {
70
+ Type outputType = columnConfig.getType();
71
+ TimestampUnit fromUnit = fromTimestampUnitMap.get(columnConfig.getName());
72
+ if (outputType instanceof StringType) {
73
+ TimestampFormatter formatter = timestampFormatterMap.get(columnConfig.getName());
74
+ return ValueFactory.newString(DoubleCast.asString(value.toDouble(), fromUnit, formatter));
75
+ }
76
+ else if (outputType instanceof LongType) {
77
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
78
+ return ValueFactory.newInteger(DoubleCast.asLong(value.toDouble(), fromUnit, toUnit));
79
+ }
80
+ else if (outputType instanceof DoubleType) {
81
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
82
+ return ValueFactory.newFloat(DoubleCast.asDouble(value.toDouble(), fromUnit, toUnit));
83
+ }
84
+ else {
85
+ assert false;
86
+ throw new RuntimeException();
87
+ }
33
88
  }
34
89
 
35
90
  public Value fromString(ColumnConfig columnConfig, StringValue value)
@@ -41,14 +96,16 @@ class JsonCaster
41
96
  return ValueFactory.newString(StringCast.asString(value.asString(), parser, formatter));
42
97
  }
43
98
  else if (outputType instanceof LongType) {
44
- return ValueFactory.newInteger(StringCast.asLong(value.asString(), parser));
99
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
100
+ return ValueFactory.newInteger(StringCast.asLong(value.asString(), parser, toUnit));
45
101
  }
46
102
  else if (outputType instanceof DoubleType) {
47
- return ValueFactory.newFloat(StringCast.asDouble(value.asString(), parser));
103
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
104
+ return ValueFactory.newFloat(StringCast.asDouble(value.asString(), parser, toUnit));
48
105
  }
49
106
  else {
50
107
  assert false;
51
- return null;
108
+ throw new RuntimeException();
52
109
  }
53
110
  }
54
111
  }
@@ -108,6 +108,14 @@ public class JsonVisitor
108
108
  }
109
109
  return ValueFactory.newMap(newValue, true);
110
110
  }
111
+ else if (value.isIntegerValue()) {
112
+ ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
113
+ return jsonCaster.fromLong(columnConfig, value.asIntegerValue());
114
+ }
115
+ else if (value.isFloatValue()) {
116
+ ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
117
+ return jsonCaster.fromDouble(columnConfig, value.asFloatValue());
118
+ }
111
119
  else if (value.isStringValue()) {
112
120
  ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
113
121
  return jsonCaster.fromString(columnConfig, value.asStringValue());
@@ -1,5 +1,6 @@
1
1
  package org.embulk.filter.timestamp_format;
2
2
 
3
+ import com.google.common.base.Optional;
3
4
  import com.google.common.collect.ImmutableList;
4
5
  import org.embulk.config.Config;
5
6
  import org.embulk.config.ConfigDefault;
@@ -18,9 +19,9 @@ import org.embulk.spi.PageOutput;
18
19
  import org.embulk.spi.PageReader;
19
20
  import org.embulk.spi.Schema;
20
21
 
21
- import org.embulk.spi.type.DoubleType;
22
- import org.embulk.spi.type.LongType;
23
- import org.embulk.spi.type.StringType;
22
+ import org.embulk.spi.time.Timestamp;
23
+ import org.embulk.spi.type.BooleanType;
24
+ import org.embulk.spi.type.JsonType;
24
25
  import org.embulk.spi.type.TimestampType;
25
26
  import org.embulk.spi.type.Type;
26
27
  import org.jruby.embed.ScriptingContainer;
@@ -44,6 +45,14 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
44
45
  @Config("type")
45
46
  @ConfigDefault("\"string\"")
46
47
  Type getType();
48
+
49
+ @Config("from_unit")
50
+ @ConfigDefault("null")
51
+ Optional<TimestampUnit> getFromUnit();
52
+
53
+ @Config("to_unit")
54
+ @ConfigDefault("null")
55
+ Optional<TimestampUnit> getToUnit();
47
56
  }
48
57
 
49
58
  interface PluginTask extends Task,
@@ -57,6 +66,14 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
57
66
  @ConfigDefault("false")
58
67
  Boolean getStopOnInvalidRecord();
59
68
 
69
+ @Config("default_from_timestamp_unit")
70
+ @ConfigDefault("\"second\"")
71
+ TimestampUnit getDefaultFromTimestampUnit();
72
+
73
+ @Config("default_to_timestamp_unit")
74
+ @ConfigDefault("\"second\"")
75
+ TimestampUnit getDefaultToTimestampUnit();
76
+
60
77
  @ConfigInject
61
78
  ScriptingContainer getJRuby();
62
79
  }
@@ -88,24 +105,18 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
88
105
  }
89
106
  }
90
107
 
91
- // throw if column type is not string or timestamp
108
+ // throw if column type is not supported
92
109
  for (ColumnConfig columnConfig : columns) {
110
+ String name = columnConfig.getName();
93
111
  Type type = columnConfig.getType();
94
- boolean acceptable = false;
95
- if (type instanceof StringType) {
96
- continue;
97
- }
98
- else if (type instanceof TimestampType) {
99
- continue;
112
+ if (type instanceof BooleanType) {
113
+ throw new ConfigException(String.format("casting to boolean is not available: \"%s\"", name));
100
114
  }
101
- else if (type instanceof LongType) {
102
- continue;
115
+ if (type instanceof JsonType) {
116
+ throw new ConfigException(String.format("casting to json is not available: \"%s\"", name));
103
117
  }
104
- else if (type instanceof DoubleType) {
105
- continue;
106
- }
107
- else {
108
- throw new ConfigException("column type must be string, timestamp, long, or double");
118
+ if (name.startsWith("$.") && type instanceof TimestampType) {
119
+ throw new ConfigException(String.format("casting a json path into timestamp is not available: \"%s\"", name));
109
120
  }
110
121
  }
111
122
  }
@@ -15,6 +15,8 @@ import org.joda.time.DateTimeZone;
15
15
  import org.jruby.embed.ScriptingContainer;
16
16
  import org.jruby.util.RubyDateFormat;
17
17
 
18
+ import java.text.SimpleDateFormat;
19
+ import java.util.Date;
18
20
  import java.util.Locale;
19
21
 
20
22
  public class TimestampFormatter
@@ -41,7 +43,8 @@ public class TimestampFormatter
41
43
  Optional<String> getToFormat();
42
44
  }
43
45
 
44
- private final RubyDateFormat toDateFormat;
46
+ private final RubyDateFormat jrubyFormatter;
47
+ private final SimpleDateFormat javaFormatter;
45
48
  private final DateTimeZone toTimeZone;
46
49
 
47
50
  public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
@@ -58,7 +61,15 @@ public class TimestampFormatter
58
61
  public TimestampFormatter(ScriptingContainer jruby, String format, DateTimeZone toTimeZone)
59
62
  {
60
63
  this.toTimeZone = toTimeZone;
61
- this.toDateFormat = new RubyDateFormat(format, Locale.ENGLISH, true);
64
+ if (format.contains("%")) {
65
+ this.jrubyFormatter = new RubyDateFormat(format, Locale.ENGLISH, true);
66
+ this.javaFormatter = null;
67
+ }
68
+ else {
69
+ this.jrubyFormatter = null;
70
+ this.javaFormatter = new SimpleDateFormat(format, Locale.ENGLISH);
71
+ javaFormatter.setTimeZone(toTimeZone.toTimeZone());
72
+ }
62
73
  }
63
74
 
64
75
  public DateTimeZone getToTimeZone()
@@ -73,10 +84,30 @@ public class TimestampFormatter
73
84
  }
74
85
 
75
86
  public String format(Timestamp value)
87
+ {
88
+ if (jrubyFormatter != null) {
89
+ return jrubyFormat(value);
90
+ }
91
+ else if (javaFormatter != null) {
92
+ return javaFormat(value);
93
+ }
94
+ else {
95
+ assert false;
96
+ throw new RuntimeException();
97
+ }
98
+ }
99
+
100
+ private String jrubyFormat(Timestamp value)
76
101
  {
77
102
  // TODO optimize by using reused StringBuilder
78
- toDateFormat.setDateTime(new DateTime(value.getEpochSecond() * 1000, toTimeZone));
79
- toDateFormat.setNSec(value.getNano());
80
- return toDateFormat.format(null);
103
+ jrubyFormatter.setDateTime(new DateTime(value.getEpochSecond() * 1000, toTimeZone));
104
+ jrubyFormatter.setNSec(value.getNano());
105
+ return jrubyFormatter.format(null);
106
+ }
107
+
108
+ private String javaFormat(Timestamp value)
109
+ {
110
+ long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
111
+ return javaFormatter.format(milliSecond);
81
112
  }
82
113
  }