embulk-filter-timestamp_format 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cb84426fbf7dfbaac21925ece7475f322c7c2010
4
- data.tar.gz: 2c5accec8470864588c20d62c42626a89a584bab
3
+ metadata.gz: 0086188ce09565733308b14cebcfd784df0a6a4e
4
+ data.tar.gz: 8d974fbc9276c45f07541f62f79f725b375f2d89
5
5
  SHA512:
6
- metadata.gz: d77f18cde0d2a3626198d1e94aa1c7e1b6c47d3c8572c9397cf1c5ec8a4ec808fcb9dea636339c0e838e00f418cefd290d92d77b44669a6df31c1c3e2fd8b9bb
7
- data.tar.gz: c3464272e96b244c782b65e8e536a1f64fc488646958c58d04a068550c85d718f137081662ce5fd7fe1370aa6788bda4b8598a7e0e606b7a00e6811e096177a5
6
+ metadata.gz: fcbe6f58659fa857ddf7237aa9534bcb392ff21d60e1e31abe267414b570a93b44bd347839361a207add780306ceb682e6e3f14e93181b3b1741fd497078bf86
7
+ data.tar.gz: 2b13d3aab703ba5dfd53f41587d20322595fee6b3af5603f8744c4955b1b32ce2525d022a46bda85d5567de8a687ef1718e3aa064094d278a20406fdf94db6c0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ # 0.1.7 (2016-05-09)
2
+
3
+ Enhancements:
4
+
5
+ * Use Joda-Time DateTimeFormat instead of SimpleDateFormat for Java timestamp parser/formatter
6
+ * to be thread-safe
7
+ * to fix ss.SSS wrongly resolving 1.1 as 1.001 seconds
8
+
1
9
  # 0.1.6 (2016-05-01)
2
10
 
3
11
  Enhancements:
data/README.md CHANGED
@@ -8,7 +8,7 @@ A filter plugin for Embulk to change timestamp format
8
8
 
9
9
  - **columns**: columns to retain (array of hash)
10
10
  - **name**: name of column (required)
11
- - **type**: type to cast (string, timestamp, long (unixtimestamp), double (unixtimestamp), default is string)
11
+ - **type**: type to cast, choose one of `string`, `timestamp`, `long` (unixtimestamp), `double` (unixtimestamp) (string, default is `string`)
12
12
  - **from_format**: specify the format of the input string (array of strings, default is default_from_timestamp_format)
13
13
  - **from_timezone**: specify the timezone of the input string (string, default is default_from_timezone)
14
14
  - **to_format**: specify the format of the output string (string, default is default_to_timestamp_format)
@@ -19,8 +19,8 @@ A filter plugin for Embulk to change timestamp format
19
19
  - **default_from_timezone**: default timezone for the input string (string, default is `UTC`)
20
20
  - **default_to_timestamp_format**: default timestamp format for the output string (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
21
21
  - **default_to_timezone**: default timezone for the output string (string, default is `UTC`)
22
- - **default_from_timetamp_unit**: default time unit such as second, ms, us, ns for the input unixtimestamp (string, default is `second`)
23
- - **default_to_timetamp_unit**: default time unit such as second, ms, us, ns for the output unixtimestamp (string, default is `second`)
22
+ - **default_from_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the input unixtimestamp (string, default is `second`)
23
+ - **default_to_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the output unixtimestamp (string, default is `second`)
24
24
  - **stop_on_invalid_record**: stop bulk load transaction if an invalid record is found (boolean, default is `false`)
25
25
 
26
26
  ## Example
@@ -37,33 +37,35 @@ in:
37
37
  type: file
38
38
  path_prefix: example/example.jsonl
39
39
  parser:
40
- type: jsonl
40
+ type: jsonl # not json parser
41
41
  columns:
42
42
  - {name: timestamp, type: string}
43
43
  - {name: nested, type: json}
44
44
  filters:
45
45
  - type: timestamp_format
46
+ default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
46
47
  default_to_timezone: "Asia/Tokyo"
47
48
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
48
49
  columns:
49
- - {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
50
- - {name: $.nested.timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
50
+ - {name: timestamp, type: long, to_unit: ms}
51
+ - {name: $.nested.timestamp}
52
+ out:
51
53
  type: stdout
52
54
  ```
53
55
 
54
56
  Output will be as:
55
57
 
56
58
  ```
57
- {"timestamp":"2015-07-13 00:00:00.0","nested":{"timestamp":"2015-07-13 00:00:00.0}}
58
- {"timestamp":"2015-07-13 00:00:00.1","nested":{"timestamp":"2015-07-13 00:00:00.1}}
59
+ {"timestamp":1436713200000,"nested":{"timestamp":"2015-07-13 00:00:00.0"}}
60
+ {"timestamp":1436713200100,"nested":{"timestamp":"2015-07-13 00:00:00.1"}}
59
61
  ```
60
62
 
61
63
  See [./example](./example) for more examples.
62
64
 
63
- ## Timestamp Parser/Formatter Performance Issue
65
+ ## JRuby Timestamp Parser/Formatter Performance Issue
64
66
 
65
67
  Embulk's timestamp parser/formatter originally uses a JRuby implementation, which is slow.
66
- To improve performance, this plugin also supports Java's [SimpleDateFormat](https://docs.oracle.com/javase/jp/6/api/java/text/SimpleDateFormat.html) format as:
68
+ To improve performance, this plugin also supports Java's Joda-Time [DateTimeFormat](http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) format as:
67
69
 
68
70
  ```yaml
69
71
  in:
@@ -76,12 +78,11 @@ in:
76
78
  - {name: nested, type: json}
77
79
  filters:
78
80
  - type: timestamp_format
79
- default_from_timezone: "Asia/Taipei"
80
81
  default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.SSS z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
81
82
  default_to_timezone: "Asia/Taipei"
82
83
  default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
83
84
  columns:
84
- - {name: timestamp}
85
+ - {name: timestamp, type: long, to_unit: ms}
85
86
  - {name: $.nested.timestamp}
86
87
  out:
87
88
  type: stdout
@@ -93,13 +94,14 @@ If format strings contain `%`, jruby parser/formatter is used. Otherwirse, java
93
94
 
94
95
  Benchmark test sets are available at [./bench](./bench). In my environment (MacBook Pro), for 1000000 timestamps:
95
96
 
96
- * jruby parser/formatter: 65.06s
97
- * java parser/formatter: 1.3s
97
+ * java parser / java formatter: 1.3s
98
+ * java parser / jruby formatter: 1.4s
99
+ * jruby parser / java formatter: 64.52s
100
+ * jruby parser / jruby formatter: 65.06s
98
101
 
99
102
  **NOTICE:**
100
103
 
101
- * JRuby parser has micro second resolution, but Java parser (SimpleDateFormat) has only milli second resolution
102
- * `S` requires three digits always. For example, `yyyy-MM-dd HH:mm::ss.S` for `2015-12-17 01:02:03.1` gives 001 milli seconds wrongly, but it is the specification of SimpleDateFormat.
104
+ * JRuby parser has microsecond resolution, but Java parser (Joda-Time) has only millisecond resolution
103
105
 
104
106
  ## ToDo
105
107
 
@@ -9,6 +9,5 @@ filters:
9
9
  - type: timestamp_format
10
10
  columns:
11
11
  - {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "yyyy-MM-dd"}
12
-
13
12
  out:
14
13
  type: "null"
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: bench/dummy
4
+ parser:
5
+ type: csv
6
+ columns:
7
+ - {name: timestamp, type: string}
8
+ filters:
9
+ - type: timestamp_format
10
+ columns:
11
+ - {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "%Y-%m-%d"}
12
+ out:
13
+ type: "null"
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: bench/dummy
4
+ parser:
5
+ type: csv
6
+ columns:
7
+ - {name: timestamp, type: string}
8
+ filters:
9
+ - type: timestamp_format
10
+ columns:
11
+ - {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N"], to_format: "yyyy-MM-dd"}
12
+ out:
13
+ type: "null"
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.6"
16
+ version = "0.1.7"
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
data/example/example.yml CHANGED
@@ -8,7 +8,7 @@ filters:
8
8
  default_to_timezone: "Asia/Tokyo"
9
9
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
10
10
  columns:
11
- - {name: "$.record.timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
11
+ - {name: "$.record.timestamp", type: long, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"], to_unit: ms}
12
12
  - {name: "$.record.nested.nested[0].timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
13
13
  out:
14
14
  type: "null"
data/example/string.csv CHANGED
@@ -1,3 +1,5 @@
1
+ 2015-07-13,2015-07-13,2015-07-13,2015-07-13
2
+ 2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC
1
3
  2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00
2
4
  2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC
3
5
  2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC
data/example/string.yml CHANGED
@@ -11,7 +11,7 @@ in:
11
11
  filters:
12
12
  - type: timestamp_format
13
13
  default_from_timezone: "Asia/Taipei"
14
- default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
14
+ default_from_timestamp_format: ["%Y-%m-%d", "%Y-%m-%d %z", "%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
15
15
  default_to_timezone: "Asia/Taipei"
16
16
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
17
17
  columns:
@@ -11,7 +11,7 @@ in:
11
11
  filters:
12
12
  - type: timestamp_format
13
13
  default_from_timezone: "Asia/Taipei"
14
- default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"] # SSS must be three digit ...
14
+ default_from_timestamp_format: ["yyyy-MM-dd", "yyyy-MM-dd z", "yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
15
15
  default_to_timezone: "Asia/Taipei"
16
16
  default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
17
17
  columns:
@@ -15,9 +15,9 @@ import org.joda.time.DateTimeZone;
15
15
  import org.jruby.embed.ScriptingContainer;
16
16
  import org.jruby.util.RubyDateFormat;
17
17
 
18
- import java.text.SimpleDateFormat;
19
- import java.util.Date;
20
18
  import java.util.Locale;
19
+ import org.joda.time.format.DateTimeFormat;
20
+ import org.joda.time.format.DateTimeFormatter;
21
21
 
22
22
  public class TimestampFormatter
23
23
  {
@@ -44,7 +44,7 @@ public class TimestampFormatter
44
44
  }
45
45
 
46
46
  private final RubyDateFormat jrubyFormatter;
47
- private final SimpleDateFormat javaFormatter;
47
+ private final DateTimeFormatter javaFormatter;
48
48
  private final DateTimeZone toTimeZone;
49
49
 
50
50
  public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
@@ -67,8 +67,7 @@ public class TimestampFormatter
67
67
  }
68
68
  else {
69
69
  this.jrubyFormatter = null;
70
- this.javaFormatter = new SimpleDateFormat(format, Locale.ENGLISH);
71
- javaFormatter.setTimeZone(toTimeZone.toTimeZone());
70
+ this.javaFormatter = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(toTimeZone);
72
71
  }
73
72
  }
74
73
 
@@ -108,6 +107,6 @@ public class TimestampFormatter
108
107
  private String javaFormat(Timestamp value)
109
108
  {
110
109
  long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
111
- return javaFormatter.format(milliSecond);
110
+ return javaFormatter.print(milliSecond);
112
111
  }
113
112
  }
@@ -14,14 +14,16 @@ import org.embulk.spi.time.Timestamp;
14
14
  import static org.embulk.spi.time.TimestampFormat.parseDateTimeZone;
15
15
 
16
16
  import org.embulk.spi.time.TimestampParseException;
17
+ import org.joda.time.DateTime;
17
18
  import org.joda.time.DateTimeZone;
19
+ import org.joda.time.format.DateTimeFormatter;
18
20
  import org.jruby.embed.ScriptingContainer;
19
21
 
20
- import java.text.ParseException;
21
- import java.text.SimpleDateFormat;
22
22
  import java.util.ArrayList;
23
23
  import java.util.List;
24
- import java.util.TimeZone;
24
+ import java.util.Locale;
25
+
26
+ import org.joda.time.format.DateTimeFormat;
25
27
 
26
28
  public class TimestampParser {
27
29
  public interface Task {
@@ -44,8 +46,8 @@ public class TimestampParser {
44
46
  Optional<List<String>> getFromFormat();
45
47
  }
46
48
 
47
- private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<JRubyTimeParserHelper>();
48
- private final List<SimpleDateFormat> javaParserList = new ArrayList<SimpleDateFormat>();
49
+ private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<>();
50
+ private final List<DateTimeFormatter> javaParserList = new ArrayList<>();
49
51
  private final DateTimeZone defaultFromTimeZone;
50
52
 
51
53
  TimestampParser(PluginTask task) {
@@ -60,14 +62,14 @@ public class TimestampParser {
60
62
 
61
63
  public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone) {
62
64
  JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
65
+
63
66
  // TODO get default current time from ExecTask.getExecTimestamp
64
67
  for (String format : formatList) {
65
68
  if (format.contains("%")) {
66
69
  JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
67
70
  this.jrubyParserList.add(helper);
68
71
  } else {
69
- SimpleDateFormat parser = new SimpleDateFormat(format);
70
- parser.setTimeZone(defaultFromTimeZone.toTimeZone());
72
+ DateTimeFormatter parser = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(defaultFromTimeZone);
71
73
  this.javaParserList.add(parser);
72
74
  }
73
75
  }
@@ -78,7 +80,7 @@ public class TimestampParser {
78
80
  return defaultFromTimeZone;
79
81
  }
80
82
 
81
- public Timestamp parse(String text) throws TimestampParseException, ParseException {
83
+ public Timestamp parse(String text) throws TimestampParseException, IllegalArgumentException {
82
84
  if (!jrubyParserList.isEmpty()) {
83
85
  return jrubyParse(text);
84
86
  } else if (!javaParserList.isEmpty()) {
@@ -124,21 +126,22 @@ public class TimestampParser {
124
126
  return Timestamp.ofEpochSecond(sec, usec * 1000);
125
127
  }
126
128
 
127
- private Timestamp javaParse(String text) throws ParseException {
128
- long msec = -1;
129
- ParseException exception = null;
129
+ private Timestamp javaParse(String text) throws IllegalArgumentException {
130
+ DateTime dateTime = null;
131
+ IllegalArgumentException exception = null;
130
132
 
131
- for (SimpleDateFormat parser : javaParserList) {
133
+ for (DateTimeFormatter parser : javaParserList) {
132
134
  try {
133
- msec = parser.parse(text).getTime(); // NOTE: milli second resolution
135
+ dateTime = parser.parseDateTime(text);
134
136
  break;
135
- } catch (ParseException ex) {
137
+ } catch (IllegalArgumentException ex) {
136
138
  exception = ex;
137
139
  }
138
140
  }
139
- if (msec == -1) {
141
+ if (dateTime == null) {
140
142
  throw exception;
141
143
  }
144
+ long msec = dateTime.getMillis(); // NOTE: milli second resolution
142
145
 
143
146
  long nanoAdjustment = msec * 1000000;
144
147
  return Timestamp.ofEpochSecond(0, nanoAdjustment);
@@ -7,8 +7,6 @@ import org.embulk.spi.DataException;
7
7
  import org.embulk.spi.time.Timestamp;
8
8
  import org.embulk.spi.time.TimestampParseException;
9
9
 
10
- import java.text.ParseException;
11
-
12
10
  public class StringCast
13
11
  {
14
12
  private StringCast() {}
@@ -27,7 +25,7 @@ public class StringCast
27
25
  catch (TimestampParseException ex) {
28
26
  throw new DataException(buildErrorMessage(value), ex);
29
27
  }
30
- catch (ParseException ex) {
28
+ catch (IllegalArgumentException ex) {
31
29
  throw new DataException(buildErrorMessage(value), ex);
32
30
  }
33
31
  }
@@ -40,7 +38,7 @@ public class StringCast
40
38
  catch (TimestampParseException ex) {
41
39
  throw new DataException(buildErrorMessage(value), ex);
42
40
  }
43
- catch (ParseException ex) {
41
+ catch (IllegalArgumentException ex) {
44
42
  throw new DataException(buildErrorMessage(value), ex);
45
43
  }
46
44
  }
@@ -54,7 +52,7 @@ public class StringCast
54
52
  catch (TimestampParseException ex) {
55
53
  throw new DataException(buildErrorMessage(value), ex);
56
54
  }
57
- catch (ParseException ex) {
55
+ catch (IllegalArgumentException ex) {
58
56
  throw new DataException(buildErrorMessage(value), ex);
59
57
  }
60
58
  }
@@ -68,7 +66,7 @@ public class StringCast
68
66
  catch (TimestampParseException ex) {
69
67
  throw new DataException(buildErrorMessage(value), ex);
70
68
  }
71
- catch (ParseException ex) {
69
+ catch (IllegalArgumentException ex) {
72
70
  throw new DataException(buildErrorMessage(value), ex);
73
71
  }
74
72
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-timestamp_format
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naotoshi Seo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-01 00:00:00.000000000 Z
11
+ date: 2016-05-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,8 @@ files:
52
52
  - README.md
53
53
  - bench/config_java.yml
54
54
  - bench/config_jruby.yml
55
+ - bench/config_jruby_formatter.yml
56
+ - bench/config_jruby_parser.yml
55
57
  - bench/gen_dummy.rb
56
58
  - build.gradle
57
59
  - config/checkstyle/checkstyle.xml
@@ -93,7 +95,7 @@ files:
93
95
  - src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java
94
96
  - src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java
95
97
  - src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java
96
- - classpath/embulk-filter-timestamp_format-0.1.6.jar
98
+ - classpath/embulk-filter-timestamp_format-0.1.7.jar
97
99
  homepage: https://github.com/sonots/embulk-filter-timestamp_format
98
100
  licenses:
99
101
  - MIT