embulk-filter-timestamp_format 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/CHANGELOG.md +7 -0
  4. data/README.md +46 -1
  5. data/bench/config_java.yml +14 -0
  6. data/bench/config_jruby.yml +14 -0
  7. data/bench/gen_dummy.rb +5 -0
  8. data/build.gradle +1 -1
  9. data/example/double.csv +2 -0
  10. data/example/double.yml +20 -0
  11. data/example/{json_example.jsonl → example.jsonl} +0 -0
  12. data/example/example.yml +4 -12
  13. data/example/example2.csv +2 -0
  14. data/example/example2.yml +14 -0
  15. data/example/json_double.jsonl +1 -0
  16. data/example/json_double.yml +14 -0
  17. data/example/json_long.jsonl +1 -0
  18. data/example/json_long.yml +14 -0
  19. data/example/json_string.jsonl +2 -0
  20. data/example/json_string.yml +14 -0
  21. data/example/long.csv +1 -0
  22. data/example/long.yml +20 -0
  23. data/example/string.csv +4 -0
  24. data/example/{string_example.yml → string.yml} +6 -5
  25. data/example/string_java.yml +23 -0
  26. data/example/timestamp.csv +2 -0
  27. data/example/{timestamp_example.yml → timestamp.yml} +4 -4
  28. data/src/main/java/org/embulk/filter/timestamp_format/ColumnCaster.java +107 -14
  29. data/src/main/java/org/embulk/filter/timestamp_format/ColumnVisitorImpl.java +104 -33
  30. data/src/main/java/org/embulk/filter/timestamp_format/JsonCaster.java +61 -4
  31. data/src/main/java/org/embulk/filter/timestamp_format/JsonVisitor.java +8 -0
  32. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatFilterPlugin.java +28 -17
  33. data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java +36 -5
  34. data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +57 -26
  35. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnit.java +112 -0
  36. data/src/main/java/org/embulk/filter/timestamp_format/TimestampUnitDeserializer.java +54 -0
  37. data/src/main/java/org/embulk/filter/timestamp_format/cast/DoubleCast.java +32 -0
  38. data/src/main/java/org/embulk/filter/timestamp_format/cast/LongCast.java +32 -0
  39. data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +20 -4
  40. data/src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java +5 -6
  41. data/src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java +192 -0
  42. metadata +29 -8
  43. data/example/json_example.yml +0 -14
  44. data/src/test/java/org/embulk/filter/TestTimestampFormatFilterPlugin.java +0 -5
@@ -17,13 +17,14 @@ import org.embulk.spi.time.TimestampParseException;
17
17
  import org.joda.time.DateTimeZone;
18
18
  import org.jruby.embed.ScriptingContainer;
19
19
 
20
+ import java.text.ParseException;
21
+ import java.text.SimpleDateFormat;
20
22
  import java.util.ArrayList;
21
23
  import java.util.List;
24
+ import java.util.TimeZone;
22
25
 
23
- public class TimestampParser
24
- {
25
- public interface Task
26
- {
26
+ public class TimestampParser {
27
+ public interface Task {
27
28
  @Config("default_from_timezone")
28
29
  @ConfigDefault("\"UTC\"")
29
30
  DateTimeZone getDefaultFromTimeZone();
@@ -33,8 +34,7 @@ public class TimestampParser
33
34
  List<String> getDefaultFromTimestampFormat();
34
35
  }
35
36
 
36
- public interface TimestampColumnOption
37
- {
37
+ public interface TimestampColumnOption {
38
38
  @Config("from_timezone")
39
39
  @ConfigDefault("null")
40
40
  Optional<DateTimeZone> getFromTimeZone();
@@ -44,60 +44,71 @@ public class TimestampParser
44
44
  Optional<List<String>> getFromFormat();
45
45
  }
46
46
 
47
- private final List<JRubyTimeParserHelper> helperList;
47
+ private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<JRubyTimeParserHelper>();
48
+ private final List<SimpleDateFormat> javaParserList = new ArrayList<SimpleDateFormat>();
48
49
  private final DateTimeZone defaultFromTimeZone;
49
50
 
50
- TimestampParser(PluginTask task)
51
- {
51
+ TimestampParser(PluginTask task) {
52
52
  this(task.getJRuby(), task.getDefaultFromTimestampFormat(), task.getDefaultFromTimeZone());
53
53
  }
54
54
 
55
- public TimestampParser(PluginTask task, TimestampColumnOption columnOption)
56
- {
55
+ public TimestampParser(PluginTask task, TimestampColumnOption columnOption) {
57
56
  this(task.getJRuby(),
58
57
  columnOption.getFromFormat().or(task.getDefaultFromTimestampFormat()),
59
58
  columnOption.getFromTimeZone().or(task.getDefaultFromTimeZone()));
60
59
  }
61
60
 
62
- public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone)
63
- {
61
+ public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone) {
64
62
  JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
65
63
  // TODO get default current time from ExecTask.getExecTimestamp
66
- this.helperList = new ArrayList<JRubyTimeParserHelper>();
67
64
  for (String format : formatList) {
68
- JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
69
- this.helperList.add(helper);
65
+ if (format.contains("%")) {
66
+ JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
67
+ this.jrubyParserList.add(helper);
68
+ } else {
69
+ SimpleDateFormat parser = new SimpleDateFormat(format);
70
+ parser.setTimeZone(defaultFromTimeZone.toTimeZone());
71
+ this.javaParserList.add(parser);
72
+ }
70
73
  }
71
74
  this.defaultFromTimeZone = defaultFromTimeZone;
72
75
  }
73
76
 
74
- public DateTimeZone getDefaultFromTimeZone()
75
- {
77
+ public DateTimeZone getDefaultFromTimeZone() {
76
78
  return defaultFromTimeZone;
77
79
  }
78
80
 
79
- public Timestamp parse(String text) throws TimestampParseException
80
- {
81
- JRubyTimeParserHelper helper = null;
81
+ public Timestamp parse(String text) throws TimestampParseException, ParseException {
82
+ if (!jrubyParserList.isEmpty()) {
83
+ return jrubyParse(text);
84
+ } else if (!javaParserList.isEmpty()) {
85
+ return javaParse(text);
86
+ } else {
87
+ assert false;
88
+ throw new RuntimeException();
89
+ }
90
+ }
91
+
92
+ private Timestamp jrubyParse(String text) throws TimestampParseException {
82
93
  long localUsec = -1;
83
94
  TimestampParseException exception = null;
84
95
 
85
- for (JRubyTimeParserHelper h : helperList) {
96
+ JRubyTimeParserHelper helper = null;
97
+ for (JRubyTimeParserHelper h : jrubyParserList) {
86
98
  helper = h;
87
99
  try {
88
- localUsec = helper.strptimeUsec(text);
100
+ localUsec = helper.strptimeUsec(text); // NOTE: micro second resolution
89
101
  break;
90
- }
91
- catch (TimestampParseException ex) {
102
+ } catch (TimestampParseException ex) {
92
103
  exception = ex;
93
104
  }
94
105
  }
95
106
  if (localUsec == -1) {
96
107
  throw exception;
97
108
  }
109
+ DateTimeZone timeZone = defaultFromTimeZone;
98
110
  String zone = helper.getZone();
99
111
 
100
- DateTimeZone timeZone = defaultFromTimeZone;
101
112
  if (zone != null) {
102
113
  // TODO cache parsed zone?
103
114
  timeZone = parseDateTimeZone(zone);
@@ -112,4 +123,24 @@ public class TimestampParser
112
123
 
113
124
  return Timestamp.ofEpochSecond(sec, usec * 1000);
114
125
  }
126
+
127
+ private Timestamp javaParse(String text) throws ParseException {
128
+ long msec = -1;
129
+ ParseException exception = null;
130
+
131
+ for (SimpleDateFormat parser : javaParserList) {
132
+ try {
133
+ msec = parser.parse(text).getTime(); // NOTE: milli second resolution
134
+ break;
135
+ } catch (ParseException ex) {
136
+ exception = ex;
137
+ }
138
+ }
139
+ if (msec == -1) {
140
+ throw exception;
141
+ }
142
+
143
+ long nanoAdjustment = msec * 1000000;
144
+ return Timestamp.ofEpochSecond(0, nanoAdjustment);
145
+ }
115
146
  }
@@ -0,0 +1,112 @@
1
+ package org.embulk.filter.timestamp_format;
2
+
3
+ import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ @JsonDeserialize(using=TimestampUnitDeserializer.class)
7
+ public enum TimestampUnit {
8
+ Second {
9
+ @Override
10
+ public int scale() {
11
+ return 1;
12
+ }
13
+
14
+ @Override
15
+ public int scaleToNano() {
16
+ return 1000000000;
17
+ }
18
+ },
19
+ MilliSecond {
20
+ @Override
21
+ public int scale() {
22
+ return 1000;
23
+ }
24
+
25
+ @Override
26
+ public int scaleToNano() {
27
+ return 1000000;
28
+ }
29
+ },
30
+ MicroSecond {
31
+ @Override
32
+ public int scale() {
33
+ return 1000000;
34
+ }
35
+
36
+ @Override
37
+ public int scaleToNano() {
38
+ return 1000;
39
+ }
40
+ },
41
+ NanoSecond {
42
+ @Override
43
+ public int scale() {
44
+ return 1000000000;
45
+ }
46
+
47
+ @Override
48
+ public int scaleToNano() {
49
+ return 1;
50
+ }
51
+ };
52
+
53
+ public abstract int scale();
54
+ public abstract int scaleToNano();
55
+
56
+ public static Timestamp toTimestamp(long value, TimestampUnit fromUnit)
57
+ {
58
+ long nanoAdjustment = value * fromUnit.scaleToNano();
59
+ return Timestamp.ofEpochSecond(0, nanoAdjustment);
60
+ }
61
+
62
+ public static Timestamp toTimestamp(double value, TimestampUnit fromUnit)
63
+ {
64
+ long nanoAdjustment = (long) (value * fromUnit.scaleToNano());
65
+ return Timestamp.ofEpochSecond(0, nanoAdjustment);
66
+ }
67
+
68
+ public static long toLong(Timestamp value, TimestampUnit toUnit)
69
+ {
70
+ long epochSecond = value.getEpochSecond() * toUnit.scale();
71
+ long nanoIntegerPart = value.getNano() / toUnit.scaleToNano();
72
+ return epochSecond + nanoIntegerPart;
73
+
74
+ }
75
+ public static double toDouble(Timestamp value, TimestampUnit toUnit)
76
+ {
77
+ long epochSecond = value.getEpochSecond() * toUnit.scale();
78
+ long nanoIntegerPart = value.getNano() / toUnit.scaleToNano();
79
+ long nanoDecimalPart = value.getNano() - (nanoIntegerPart * toUnit.scaleToNano());
80
+ return epochSecond + nanoIntegerPart + (nanoDecimalPart / (double) toUnit.scaleToNano());
81
+ }
82
+
83
+ public static long changeUnit(long value, TimestampUnit fromUnit, TimestampUnit toUnit)
84
+ {
85
+ if (fromUnit.scale() == toUnit.scale()) {
86
+ return value;
87
+ }
88
+ else if (fromUnit.scale() < toUnit.scale()) {
89
+ long factor = toUnit.scale() / fromUnit.scale();
90
+ return value * factor;
91
+ }
92
+ else {
93
+ long divideFactor = fromUnit.scale() / toUnit.scale();
94
+ return value / divideFactor;
95
+ }
96
+ }
97
+
98
+ public static double changeUnit(double value, TimestampUnit fromUnit, TimestampUnit toUnit)
99
+ {
100
+ if (fromUnit.scale() == toUnit.scale()) {
101
+ return value;
102
+ }
103
+ else if (fromUnit.scale() < toUnit.scale()) {
104
+ long factor = toUnit.scale() / fromUnit.scale();
105
+ return value * factor;
106
+ }
107
+ else {
108
+ long divideFactor = fromUnit.scale() / toUnit.scale();
109
+ return value / (double)divideFactor;
110
+ }
111
+ }
112
+ }
@@ -0,0 +1,54 @@
1
+ package org.embulk.filter.timestamp_format;
2
+
3
+ import java.util.Map;
4
+ import java.io.IOException;
5
+ import com.google.common.base.Joiner;
6
+ import com.google.common.collect.ImmutableMap;
7
+ import com.fasterxml.jackson.databind.DeserializationContext;
8
+ import com.fasterxml.jackson.databind.deser.std.FromStringDeserializer;
9
+ import com.fasterxml.jackson.databind.JsonMappingException;
10
+
11
+ public class TimestampUnitDeserializer
12
+ extends FromStringDeserializer<TimestampUnit>
13
+ {
14
+ private static final Map<String, TimestampUnit> stringToTimestampUnitMap;
15
+
16
+ static {
17
+ ImmutableMap.Builder<String, TimestampUnit> builder = ImmutableMap.builder();
18
+ builder.put("Second", TimestampUnit.Second);
19
+ builder.put("second", TimestampUnit.Second);
20
+ builder.put("sec", TimestampUnit.Second);
21
+ builder.put("MilliSecond", TimestampUnit.MilliSecond);
22
+ builder.put("millisecond", TimestampUnit.MilliSecond);
23
+ builder.put("milli_second", TimestampUnit.MilliSecond);
24
+ builder.put("ms", TimestampUnit.MilliSecond);
25
+ builder.put("MicroSecond", TimestampUnit.MicroSecond);
26
+ builder.put("microsecond", TimestampUnit.MicroSecond);
27
+ builder.put("micro_second", TimestampUnit.MicroSecond);
28
+ builder.put("us", TimestampUnit.MicroSecond);
29
+ builder.put("NanoSecond", TimestampUnit.NanoSecond);
30
+ builder.put("nanosecond", TimestampUnit.NanoSecond);
31
+ builder.put("nano_second", TimestampUnit.NanoSecond);
32
+ builder.put("ns", TimestampUnit.NanoSecond);
33
+ stringToTimestampUnitMap = builder.build();
34
+ }
35
+
36
+ public TimestampUnitDeserializer()
37
+ {
38
+ super(TimestampUnit.class);
39
+ }
40
+
41
+ @Override
42
+ protected TimestampUnit _deserialize(String value, DeserializationContext context)
43
+ throws IOException
44
+ {
45
+ TimestampUnit t = stringToTimestampUnitMap.get(value);
46
+ if (t == null) {
47
+ throw new JsonMappingException(
48
+ String.format("Unknown type name '%s'. Supported types are: %s",
49
+ value,
50
+ Joiner.on(", ").join(stringToTimestampUnitMap.keySet())));
51
+ }
52
+ return t;
53
+ }
54
+ }
@@ -0,0 +1,32 @@
1
+ package org.embulk.filter.timestamp_format.cast;
2
+
3
+ import org.embulk.filter.timestamp_format.TimestampFormatter;
4
+ import org.embulk.filter.timestamp_format.TimestampUnit;
5
+ import org.embulk.spi.DataException;
6
+ import org.embulk.spi.time.Timestamp;
7
+
8
+ public class DoubleCast
9
+ {
10
+ private DoubleCast() {}
11
+
12
+ public static String asString(double value, TimestampUnit fromUnit, TimestampFormatter formatter) throws DataException
13
+ {
14
+ Timestamp timestamp = TimestampUnit.toTimestamp(value, fromUnit);
15
+ return formatter.format(timestamp);
16
+ }
17
+
18
+ public static Timestamp asTimestamp(double value, TimestampUnit fromUnit) throws DataException
19
+ {
20
+ return TimestampUnit.toTimestamp(value, fromUnit);
21
+ }
22
+
23
+ public static long asLong(double value, TimestampUnit fromUnit, TimestampUnit toUnit) throws DataException
24
+ {
25
+ return (long) TimestampUnit.changeUnit(value, fromUnit, toUnit);
26
+ }
27
+
28
+ public static double asDouble(double value, TimestampUnit fromUnit, TimestampUnit toUnit) throws DataException
29
+ {
30
+ return TimestampUnit.changeUnit(value, fromUnit, toUnit);
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ package org.embulk.filter.timestamp_format.cast;
2
+
3
+ import org.embulk.filter.timestamp_format.TimestampFormatter;
4
+ import org.embulk.filter.timestamp_format.TimestampUnit;
5
+ import org.embulk.spi.DataException;
6
+ import org.embulk.spi.time.Timestamp;
7
+
8
+ public class LongCast
9
+ {
10
+ private LongCast() {}
11
+
12
+ public static String asString(long value, TimestampUnit fromUnit, TimestampFormatter formatter) throws DataException
13
+ {
14
+ Timestamp timestamp = TimestampUnit.toTimestamp(value, fromUnit);
15
+ return formatter.format(timestamp);
16
+ }
17
+
18
+ public static Timestamp asTimestamp(long value, TimestampUnit fromUnit) throws DataException
19
+ {
20
+ return TimestampUnit.toTimestamp(value, fromUnit);
21
+ }
22
+
23
+ public static long asLong(long value, TimestampUnit fromUnit, TimestampUnit toUnit) throws DataException
24
+ {
25
+ return TimestampUnit.changeUnit(value, fromUnit, toUnit);
26
+ }
27
+
28
+ public static double asDouble(long value, TimestampUnit fromUnit, TimestampUnit toUnit) throws DataException
29
+ {
30
+ return (double) TimestampUnit.changeUnit(value, fromUnit, toUnit);
31
+ }
32
+ }
@@ -2,10 +2,13 @@ package org.embulk.filter.timestamp_format.cast;
2
2
 
3
3
  import org.embulk.filter.timestamp_format.TimestampFormatter;
4
4
  import org.embulk.filter.timestamp_format.TimestampParser;
5
+ import org.embulk.filter.timestamp_format.TimestampUnit;
5
6
  import org.embulk.spi.DataException;
6
7
  import org.embulk.spi.time.Timestamp;
7
8
  import org.embulk.spi.time.TimestampParseException;
8
9
 
10
+ import java.text.ParseException;
11
+
9
12
  public class StringCast
10
13
  {
11
14
  private StringCast() {}
@@ -24,6 +27,9 @@ public class StringCast
24
27
  catch (TimestampParseException ex) {
25
28
  throw new DataException(buildErrorMessage(value), ex);
26
29
  }
30
+ catch (ParseException ex) {
31
+ throw new DataException(buildErrorMessage(value), ex);
32
+ }
27
33
  }
28
34
 
29
35
  public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
@@ -34,26 +40,36 @@ public class StringCast
34
40
  catch (TimestampParseException ex) {
35
41
  throw new DataException(buildErrorMessage(value), ex);
36
42
  }
43
+ catch (ParseException ex) {
44
+ throw new DataException(buildErrorMessage(value), ex);
45
+ }
37
46
  }
38
47
 
39
- public static long asLong(String value, TimestampParser parser) throws DataException
48
+ public static long asLong(String value, TimestampParser parser, TimestampUnit toUnit) throws DataException
40
49
  {
41
50
  try {
42
51
  Timestamp timestamp = parser.parse(value);
43
- return timestamp.getEpochSecond();
52
+ return TimestampUnit.toLong(timestamp, toUnit);
44
53
  }
45
54
  catch (TimestampParseException ex) {
46
55
  throw new DataException(buildErrorMessage(value), ex);
47
56
  }
57
+ catch (ParseException ex) {
58
+ throw new DataException(buildErrorMessage(value), ex);
59
+ }
48
60
  }
49
- public static double asDouble(String value, TimestampParser parser) throws DataException
61
+
62
+ public static double asDouble(String value, TimestampParser parser, TimestampUnit toUnit) throws DataException
50
63
  {
51
64
  try {
52
65
  Timestamp timestamp = parser.parse(value);
53
- return TimestampCast.asDouble(timestamp);
66
+ return TimestampUnit.toDouble(timestamp, toUnit);
54
67
  }
55
68
  catch (TimestampParseException ex) {
56
69
  throw new DataException(buildErrorMessage(value), ex);
57
70
  }
71
+ catch (ParseException ex) {
72
+ throw new DataException(buildErrorMessage(value), ex);
73
+ }
58
74
  }
59
75
  }
@@ -1,6 +1,7 @@
1
1
  package org.embulk.filter.timestamp_format.cast;
2
2
 
3
3
  import org.embulk.filter.timestamp_format.TimestampFormatter;
4
+ import org.embulk.filter.timestamp_format.TimestampUnit;
4
5
  import org.embulk.spi.DataException;
5
6
  import org.embulk.spi.time.Timestamp;
6
7
 
@@ -18,15 +19,13 @@ public class TimestampCast
18
19
  return value;
19
20
  }
20
21
 
21
- public static long asLong(Timestamp value) throws DataException
22
+ public static long asLong(Timestamp value, TimestampUnit toUnit) throws DataException
22
23
  {
23
- return value.getEpochSecond();
24
+ return TimestampUnit.toLong(value, toUnit);
24
25
  }
25
26
 
26
- public static double asDouble(Timestamp value) throws DataException
27
+ public static double asDouble(Timestamp value, TimestampUnit toUnit) throws DataException
27
28
  {
28
- long epoch = value.getEpochSecond();
29
- int nano = value.getNano();
30
- return (double) epoch + ((double) nano / 1000000000.0);
29
+ return TimestampUnit.toDouble(value, toUnit);
31
30
  }
32
31
  }