embulk-filter-timestamp_format 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/CHANGELOG.md +7 -0
  4. data/README.md +46 -1
  5. data/bench/config_java.yml +14 -0
  6. data/bench/config_jruby.yml +14 -0
  7. data/bench/gen_dummy.rb +5 -0
  8. data/build.gradle +1 -1
  9. data/example/double.csv +2 -0
  10. data/example/double.yml +20 -0
  11. data/example/{json_example.jsonl → example.jsonl} +0 -0
  12. data/example/example.yml +4 -12
  13. data/example/example2.csv +2 -0
  14. data/example/example2.yml +14 -0
  15. data/example/json_double.jsonl +1 -0
  16. data/example/json_double.yml +14 -0
  17. data/example/json_long.jsonl +1 -0
  18. data/example/json_long.yml +14 -0
  19. data/example/json_string.jsonl +2 -0
  20. data/example/json_string.yml +14 -0
  21. data/example/long.csv +1 -0
  22. data/example/long.yml +20 -0
  23. data/example/string.csv +4 -0
  24. data/example/{string_example.yml → string.yml} +6 -5
  25. data/example/string_java.yml +23 -0
  26. data/example/timestamp.csv +2 -0
  27. data/example/{timestamp_example.yml → timestamp.yml} +4 -4
  28. data/src/main/java/org/embulk/filter/timestamp_format/ColumnCaster.java +107 -14
  29. data/src/main/java/org/embulk/filter/timestamp_format/ColumnVisitorImpl.java +104 -33
  30. data/src/main/java/org/embulk/filter/timestamp_format/JsonCaster.java +61 -4
  31. data/src/main/java/org/embulk/filter/timestamp_format/JsonVisitor.java +8 -0
  32. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatFilterPlugin.java +28 -17
  33. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java +36 -5
  34. data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +57 -26
  35. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnit.java +112 -0
  36. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnitDeserializer.java +54 -0
  37. data/src/main/java/org/embulk/filter/timestamp_format/cast/DoubleCast.java +32 -0
  38. data/src/main/java/org/embulk/filter/timestamp_format/cast/LongCast.java +32 -0
  39. data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +20 -4
  40. data/src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java +5 -6
  41. data/src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java +192 -0
  42. metadata +29 -8
  43. data/example/json_example.yml +0 -14
  44. data/src/test/java/org/embulk/filter/TestTimestampFormatFilterPlugin.java +0 -5
@@ -4,6 +4,7 @@ import org.embulk.spi.DataException;
4
4
  import org.embulk.spi.PageReader;
5
5
  import org.embulk.spi.Schema;
6
6
 
7
+ import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
7
8
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
8
9
 
9
10
  import org.embulk.spi.Column;
@@ -13,6 +14,7 @@ import org.embulk.spi.PageBuilder;
13
14
  import org.slf4j.Logger;
14
15
 
15
16
  import java.util.HashMap;
17
+ import java.util.HashSet;
16
18
 
17
19
  public class ColumnVisitorImpl
18
20
  implements ColumnVisitor
@@ -23,6 +25,7 @@ public class ColumnVisitorImpl
23
25
  private final Schema outputSchema;
24
26
  private final PageReader pageReader;
25
27
  private final PageBuilder pageBuilder;
28
+ private final HashSet<String> shouldCastSet = new HashSet<>();
26
29
  private final HashMap<String, Column> outputColumnMap = new HashMap<>();
27
30
  private final ColumnCaster columnCaster;
28
31
 
@@ -35,10 +38,30 @@ public class ColumnVisitorImpl
35
38
  this.pageReader = pageReader;
36
39
  this.pageBuilder = pageBuilder;
37
40
 
41
+ buildShouldCastSet();
38
42
  buildOutputColumnMap();
39
43
  this.columnCaster = new ColumnCaster(task, inputSchema, outputSchema, pageReader, pageBuilder);
40
44
  }
41
45
 
46
+ private void buildShouldCastSet()
47
+ {
48
+ // columnName => Boolean to avoid unnecessary cast
49
+ for (ColumnConfig columnConfig : task.getColumns()) {
50
+ String name = columnConfig.getName();
51
+ if (name.startsWith("$.")) {
52
+ String firstName = name.split("\\.", 3)[1]; // check only top level column name
53
+ shouldCastSet.add(firstName);
54
+ continue;
55
+ }
56
+ shouldCastSet.add(name);
57
+ }
58
+ }
59
+
60
+ private boolean shouldCast(String name)
61
+ {
62
+ return shouldCastSet.contains(name);
63
+ }
64
+
42
65
  private void buildOutputColumnMap()
43
66
  {
44
67
  // columnName => outputColumn
@@ -88,67 +111,115 @@ public class ColumnVisitorImpl
88
111
  @Override
89
112
  public void longColumn(final Column inputColumn)
90
113
  {
91
- if (pageReader.isNull(inputColumn)) {
92
- pageBuilder.setNull(inputColumn);
114
+ String name = inputColumn.getName();
115
+ if (! shouldCast(name)){
116
+ if (pageReader.isNull(inputColumn)) {
117
+ pageBuilder.setNull(inputColumn);
118
+ }
119
+ else {
120
+ pageBuilder.setLong(inputColumn, pageReader.getLong(inputColumn));
121
+ }
93
122
  }
94
123
  else {
95
- pageBuilder.setLong(inputColumn, pageReader.getLong(inputColumn));
124
+ final Column outputColumn = outputColumnMap.get(name);
125
+ PageBuildable op = new PageBuildable() {
126
+ public void run() throws DataException {
127
+ columnCaster.setFromLong(outputColumn, pageReader.getLong(inputColumn));
128
+ }
129
+ };
130
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
96
131
  }
97
132
  }
98
133
 
99
134
  @Override
100
135
  public void doubleColumn(final Column inputColumn)
101
136
  {
102
- if (pageReader.isNull(inputColumn)) {
103
- pageBuilder.setNull(inputColumn);
137
+ String name = inputColumn.getName();
138
+ if (! shouldCast(name)){
139
+ if (pageReader.isNull(inputColumn)) {
140
+ pageBuilder.setNull(inputColumn);
141
+ }
142
+ else {
143
+ pageBuilder.setDouble(inputColumn, pageReader.getDouble(inputColumn));
144
+ }
104
145
  }
105
146
  else {
106
- pageBuilder.setDouble(inputColumn, pageReader.getDouble(inputColumn));
147
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
148
+ PageBuildable op = new PageBuildable() {
149
+ public void run() throws DataException {
150
+ columnCaster.setFromDouble(outputColumn, pageReader.getDouble(inputColumn));
151
+ }
152
+ };
153
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
107
154
  }
108
155
  }
109
156
 
110
157
  @Override
111
158
  public void stringColumn(final Column inputColumn)
112
159
  {
113
- if (pageReader.isNull(inputColumn)) {
114
- pageBuilder.setNull(inputColumn);
115
- return;
116
- }
117
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
118
- PageBuildable op = new PageBuildable() {
119
- public void run() throws DataException
120
- {
121
- columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn));
160
+ String name = inputColumn.getName();
161
+ if (! shouldCast(name)){
162
+ if (pageReader.isNull(inputColumn)) {
163
+ pageBuilder.setNull(inputColumn);
122
164
  }
123
- };
124
- withStopOnInvalidRecord(op, inputColumn, outputColumn);
165
+ else {
166
+ pageBuilder.setString(inputColumn, pageReader.getString(inputColumn));
167
+ }
168
+ }
169
+ else {
170
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
171
+ PageBuildable op = new PageBuildable() {
172
+ public void run() throws DataException {
173
+ columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn));
174
+ }
175
+ };
176
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
177
+ }
125
178
  }
126
179
 
127
180
  @Override
128
181
  public void timestampColumn(final Column inputColumn)
129
182
  {
130
- if (pageReader.isNull(inputColumn)) {
131
- pageBuilder.setNull(inputColumn);
132
- return;
133
- }
134
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
135
- PageBuildable op = new PageBuildable() {
136
- public void run() throws DataException
137
- {
138
- columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
183
+ String name = inputColumn.getName();
184
+ if (! shouldCast(name)){
185
+ if (pageReader.isNull(inputColumn)) {
186
+ pageBuilder.setNull(inputColumn);
139
187
  }
140
- };
141
- withStopOnInvalidRecord(op, inputColumn, outputColumn);
188
+ else {
189
+ pageBuilder.setTimestamp(inputColumn, pageReader.getTimestamp(inputColumn));
190
+ }
191
+ }
192
+ else {
193
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
194
+ PageBuildable op = new PageBuildable() {
195
+ public void run() throws DataException {
196
+ columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
197
+ }
198
+ };
199
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
200
+ }
142
201
  }
143
202
 
144
203
  @Override
145
204
  public void jsonColumn(final Column inputColumn)
146
205
  {
147
- if (pageReader.isNull(inputColumn)) {
148
- pageBuilder.setNull(inputColumn);
149
- return;
206
+ String name = inputColumn.getName();
207
+ if (! shouldCast(name)){
208
+ if (pageReader.isNull(inputColumn)) {
209
+ pageBuilder.setNull(inputColumn);
210
+ }
211
+ else {
212
+ pageBuilder.setJson(inputColumn, pageReader.getJson(inputColumn));
213
+ }
214
+ }
215
+ else {
216
+ final Column outputColumn = outputColumnMap.get(inputColumn.getName());
217
+ PageBuildable op = new PageBuildable() {
218
+ public void run() throws DataException {
219
+ columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
220
+ }
221
+ };
222
+ withStopOnInvalidRecord(op, inputColumn, outputColumn);
150
223
  }
151
- final Column outputColumn = outputColumnMap.get(inputColumn.getName());
152
- columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
153
224
  }
154
225
  }
@@ -1,5 +1,8 @@
1
1
  package org.embulk.filter.timestamp_format;
2
2
 
3
+ import org.embulk.config.ConfigException;
4
+ import org.embulk.filter.timestamp_format.cast.DoubleCast;
5
+ import org.embulk.filter.timestamp_format.cast.LongCast;
3
6
  import org.embulk.filter.timestamp_format.cast.StringCast;
4
7
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
5
8
  import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
@@ -8,6 +11,8 @@ import org.embulk.spi.type.DoubleType;
8
11
  import org.embulk.spi.type.LongType;
9
12
  import org.embulk.spi.type.StringType;
10
13
  import org.embulk.spi.type.Type;
14
+ import org.msgpack.value.FloatValue;
15
+ import org.msgpack.value.IntegerValue;
11
16
  import org.msgpack.value.StringValue;
12
17
  import org.msgpack.value.Value;
13
18
  import org.msgpack.value.ValueFactory;
@@ -22,14 +27,64 @@ class JsonCaster
22
27
  private final PluginTask task;
23
28
  private final HashMap<String, TimestampParser> timestampParserMap;
24
29
  private final HashMap<String, TimestampFormatter> timestampFormatterMap;
30
+ private final HashMap<String, TimestampUnit> fromTimestampUnitMap;
31
+ private final HashMap<String, TimestampUnit> toTimestampUnitMap;
25
32
 
26
33
  JsonCaster(PluginTask task,
27
34
  HashMap<String, TimestampParser> timestampParserMap,
28
- HashMap<String, TimestampFormatter> timestampFormatterMap)
35
+ HashMap<String, TimestampFormatter> timestampFormatterMap,
36
+ HashMap<String, TimestampUnit> fromTimestampUnitMap,
37
+ HashMap<String, TimestampUnit> toTimestampUnitMap)
29
38
  {
30
39
  this.task = task;
31
40
  this.timestampParserMap = timestampParserMap;
32
41
  this.timestampFormatterMap = timestampFormatterMap;
42
+ this.fromTimestampUnitMap = fromTimestampUnitMap;
43
+ this.toTimestampUnitMap = toTimestampUnitMap;
44
+ }
45
+
46
+ public Value fromLong(ColumnConfig columnConfig, IntegerValue value)
47
+ {
48
+ Type outputType = columnConfig.getType();
49
+ TimestampUnit fromUnit = fromTimestampUnitMap.get(columnConfig.getName());
50
+ if (outputType instanceof StringType) {
51
+ TimestampFormatter formatter = timestampFormatterMap.get(columnConfig.getName());
52
+ return ValueFactory.newString(LongCast.asString(value.asLong(), fromUnit, formatter));
53
+ }
54
+ else if (outputType instanceof LongType) {
55
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
56
+ return ValueFactory.newInteger(LongCast.asLong(value.asLong(), fromUnit, toUnit));
57
+ }
58
+ else if (outputType instanceof DoubleType) {
59
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
60
+ return ValueFactory.newFloat(LongCast.asDouble(value.asLong(), fromUnit, toUnit));
61
+ }
62
+ else {
63
+ assert false;
64
+ throw new RuntimeException();
65
+ }
66
+ }
67
+
68
+ public Value fromDouble(ColumnConfig columnConfig, FloatValue value)
69
+ {
70
+ Type outputType = columnConfig.getType();
71
+ TimestampUnit fromUnit = fromTimestampUnitMap.get(columnConfig.getName());
72
+ if (outputType instanceof StringType) {
73
+ TimestampFormatter formatter = timestampFormatterMap.get(columnConfig.getName());
74
+ return ValueFactory.newString(DoubleCast.asString(value.toDouble(), fromUnit, formatter));
75
+ }
76
+ else if (outputType instanceof LongType) {
77
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
78
+ return ValueFactory.newInteger(DoubleCast.asLong(value.toDouble(), fromUnit, toUnit));
79
+ }
80
+ else if (outputType instanceof DoubleType) {
81
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
82
+ return ValueFactory.newFloat(DoubleCast.asDouble(value.toDouble(), fromUnit, toUnit));
83
+ }
84
+ else {
85
+ assert false;
86
+ throw new RuntimeException();
87
+ }
33
88
  }
34
89
 
35
90
  public Value fromString(ColumnConfig columnConfig, StringValue value)
@@ -41,14 +96,16 @@ class JsonCaster
41
96
  return ValueFactory.newString(StringCast.asString(value.asString(), parser, formatter));
42
97
  }
43
98
  else if (outputType instanceof LongType) {
44
- return ValueFactory.newInteger(StringCast.asLong(value.asString(), parser));
99
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
100
+ return ValueFactory.newInteger(StringCast.asLong(value.asString(), parser, toUnit));
45
101
  }
46
102
  else if (outputType instanceof DoubleType) {
47
- return ValueFactory.newFloat(StringCast.asDouble(value.asString(), parser));
103
+ TimestampUnit toUnit = toTimestampUnitMap.get(columnConfig.getName());
104
+ return ValueFactory.newFloat(StringCast.asDouble(value.asString(), parser, toUnit));
48
105
  }
49
106
  else {
50
107
  assert false;
51
- return null;
108
+ throw new RuntimeException();
52
109
  }
53
110
  }
54
111
  }
@@ -108,6 +108,14 @@ public class JsonVisitor
108
108
  }
109
109
  return ValueFactory.newMap(newValue, true);
110
110
  }
111
+ else if (value.isIntegerValue()) {
112
+ ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
113
+ return jsonCaster.fromLong(columnConfig, value.asIntegerValue());
114
+ }
115
+ else if (value.isFloatValue()) {
116
+ ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
117
+ return jsonCaster.fromDouble(columnConfig, value.asFloatValue());
118
+ }
111
119
  else if (value.isStringValue()) {
112
120
  ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
113
121
  return jsonCaster.fromString(columnConfig, value.asStringValue());
@@ -1,5 +1,6 @@
1
1
  package org.embulk.filter.timestamp_format;
2
2
 
3
+ import com.google.common.base.Optional;
3
4
  import com.google.common.collect.ImmutableList;
4
5
  import org.embulk.config.Config;
5
6
  import org.embulk.config.ConfigDefault;
@@ -18,9 +19,9 @@ import org.embulk.spi.PageOutput;
18
19
  import org.embulk.spi.PageReader;
19
20
  import org.embulk.spi.Schema;
20
21
 
21
- import org.embulk.spi.type.DoubleType;
22
- import org.embulk.spi.type.LongType;
23
- import org.embulk.spi.type.StringType;
22
+ import org.embulk.spi.time.Timestamp;
23
+ import org.embulk.spi.type.BooleanType;
24
+ import org.embulk.spi.type.JsonType;
24
25
  import org.embulk.spi.type.TimestampType;
25
26
  import org.embulk.spi.type.Type;
26
27
  import org.jruby.embed.ScriptingContainer;
@@ -44,6 +45,14 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
44
45
  @Config("type")
45
46
  @ConfigDefault("\"string\"")
46
47
  Type getType();
48
+
49
+ @Config("from_unit")
50
+ @ConfigDefault("null")
51
+ Optional<TimestampUnit> getFromUnit();
52
+
53
+ @Config("to_unit")
54
+ @ConfigDefault("null")
55
+ Optional<TimestampUnit> getToUnit();
47
56
  }
48
57
 
49
58
  interface PluginTask extends Task,
@@ -57,6 +66,14 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
57
66
  @ConfigDefault("false")
58
67
  Boolean getStopOnInvalidRecord();
59
68
 
69
+ @Config("default_from_timestamp_unit")
70
+ @ConfigDefault("\"second\"")
71
+ TimestampUnit getDefaultFromTimestampUnit();
72
+
73
+ @Config("default_to_timestamp_unit")
74
+ @ConfigDefault("\"second\"")
75
+ TimestampUnit getDefaultToTimestampUnit();
76
+
60
77
  @ConfigInject
61
78
  ScriptingContainer getJRuby();
62
79
  }
@@ -88,24 +105,18 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
88
105
  }
89
106
  }
90
107
 
91
- // throw if column type is not string or timestamp
108
+ // throw if column type is not supported
92
109
  for (ColumnConfig columnConfig : columns) {
110
+ String name = columnConfig.getName();
93
111
  Type type = columnConfig.getType();
94
- boolean acceptable = false;
95
- if (type instanceof StringType) {
96
- continue;
97
- }
98
- else if (type instanceof TimestampType) {
99
- continue;
112
+ if (type instanceof BooleanType) {
113
+ throw new ConfigException(String.format("casting to boolean is not available: \"%s\"", name));
100
114
  }
101
- else if (type instanceof LongType) {
102
- continue;
115
+ if (type instanceof JsonType) {
116
+ throw new ConfigException(String.format("casting to json is not available: \"%s\"", name));
103
117
  }
104
- else if (type instanceof DoubleType) {
105
- continue;
106
- }
107
- else {
108
- throw new ConfigException("column type must be string, timestamp, long, or double");
118
+ if (name.startsWith("$.") && type instanceof TimestampType) {
119
+ throw new ConfigException(String.format("casting a json path into timestamp is not available: \"%s\"", name));
109
120
  }
110
121
  }
111
122
  }
@@ -15,6 +15,8 @@ import org.joda.time.DateTimeZone;
15
15
  import org.jruby.embed.ScriptingContainer;
16
16
  import org.jruby.util.RubyDateFormat;
17
17
 
18
+ import java.text.SimpleDateFormat;
19
+ import java.util.Date;
18
20
  import java.util.Locale;
19
21
 
20
22
  public class TimestampFormatter
@@ -41,7 +43,8 @@ public class TimestampFormatter
41
43
  Optional<String> getToFormat();
42
44
  }
43
45
 
44
- private final RubyDateFormat toDateFormat;
46
+ private final RubyDateFormat jrubyFormatter;
47
+ private final SimpleDateFormat javaFormatter;
45
48
  private final DateTimeZone toTimeZone;
46
49
 
47
50
  public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
@@ -58,7 +61,15 @@ public class TimestampFormatter
58
61
  public TimestampFormatter(ScriptingContainer jruby, String format, DateTimeZone toTimeZone)
59
62
  {
60
63
  this.toTimeZone = toTimeZone;
61
- this.toDateFormat = new RubyDateFormat(format, Locale.ENGLISH, true);
64
+ if (format.contains("%")) {
65
+ this.jrubyFormatter = new RubyDateFormat(format, Locale.ENGLISH, true);
66
+ this.javaFormatter = null;
67
+ }
68
+ else {
69
+ this.jrubyFormatter = null;
70
+ this.javaFormatter = new SimpleDateFormat(format, Locale.ENGLISH);
71
+ javaFormatter.setTimeZone(toTimeZone.toTimeZone());
72
+ }
62
73
  }
63
74
 
64
75
  public DateTimeZone getToTimeZone()
@@ -73,10 +84,30 @@ public class TimestampFormatter
73
84
  }
74
85
 
75
86
  public String format(Timestamp value)
87
+ {
88
+ if (jrubyFormatter != null) {
89
+ return jrubyFormat(value);
90
+ }
91
+ else if (javaFormatter != null) {
92
+ return javaFormat(value);
93
+ }
94
+ else {
95
+ assert false;
96
+ throw new RuntimeException();
97
+ }
98
+ }
99
+
100
+ private String jrubyFormat(Timestamp value)
76
101
  {
77
102
  // TODO optimize by using reused StringBuilder
78
- toDateFormat.setDateTime(new DateTime(value.getEpochSecond() * 1000, toTimeZone));
79
- toDateFormat.setNSec(value.getNano());
80
- return toDateFormat.format(null);
103
+ jrubyFormatter.setDateTime(new DateTime(value.getEpochSecond() * 1000, toTimeZone));
104
+ jrubyFormatter.setNSec(value.getNano());
105
+ return jrubyFormatter.format(null);
106
+ }
107
+
108
+ private String javaFormat(Timestamp value)
109
+ {
110
+ long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
111
+ return javaFormatter.format(milliSecond);
81
112
  }
82
113
  }