embulk-filter-timestamp_format 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cb84426fbf7dfbaac21925ece7475f322c7c2010
4
- data.tar.gz: 2c5accec8470864588c20d62c42626a89a584bab
3
+ metadata.gz: 0086188ce09565733308b14cebcfd784df0a6a4e
4
+ data.tar.gz: 8d974fbc9276c45f07541f62f79f725b375f2d89
5
5
  SHA512:
6
- metadata.gz: d77f18cde0d2a3626198d1e94aa1c7e1b6c47d3c8572c9397cf1c5ec8a4ec808fcb9dea636339c0e838e00f418cefd290d92d77b44669a6df31c1c3e2fd8b9bb
7
- data.tar.gz: c3464272e96b244c782b65e8e536a1f64fc488646958c58d04a068550c85d718f137081662ce5fd7fe1370aa6788bda4b8598a7e0e606b7a00e6811e096177a5
6
+ metadata.gz: fcbe6f58659fa857ddf7237aa9534bcb392ff21d60e1e31abe267414b570a93b44bd347839361a207add780306ceb682e6e3f14e93181b3b1741fd497078bf86
7
+ data.tar.gz: 2b13d3aab703ba5dfd53f41587d20322595fee6b3af5603f8744c4955b1b32ce2525d022a46bda85d5567de8a687ef1718e3aa064094d278a20406fdf94db6c0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ # 0.1.7 (2016-05-09)
2
+
3
+ Enhancements:
4
+
5
+ * Use Joda-Time DateTimeFormat instead of SimpleDateFormat for Java timestamp parser/formatter
6
+ * to be thread-safe
7
+ * to fix ss.SSS wrongly resolving 1.1 as 1.001 seconds
8
+
1
9
  # 0.1.6 (2016-05-01)
2
10
 
3
11
  Enhancements:
data/README.md CHANGED
@@ -8,7 +8,7 @@ A filter plugin for Embulk to change timestamp format
8
8
 
9
9
  - **columns**: columns to retain (array of hash)
10
10
  - **name**: name of column (required)
11
- - **type**: type to cast (string, timestamp, long (unixtimestamp), double (unixtimestamp), default is string)
11
+ - **type**: type to cast, choose one of `string`, `timestamp`, `long` (unixtimestamp), `double` (unixtimestamp) (string, default is `string`)
12
12
  - **from_format**: specify the format of the input string (array of strings, default is default_from_timestamp_format)
13
13
  - **from_timezone**: specify the timezone of the input string (string, default is default_from_timezone)
14
14
  - **to_format**: specify the format of the output string (string, default is default_to_timestamp_format)
@@ -19,8 +19,8 @@ A filter plugin for Embulk to change timestamp format
19
19
  - **default_from_timezone**: default timezone for the input string (string, default is `UTC`)
20
20
  - **default_to_timestamp_format**: default timestamp format for the output string (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
21
21
  - **default_to_timezone**: default timezone for the output string (string, default is `UTC`)
22
- - **default_from_timetamp_unit**: default time unit such as second, ms, us, ns for the input unixtimestamp (string, default is `second`)
23
- - **default_to_timetamp_unit**: default time unit such as second, ms, us, ns for the output unixtimestamp (string, default is `second`)
22
+ - **default_from_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the input unixtimestamp (string, default is `second`)
23
+ - **default_to_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the output unixtimestamp (string, default is `second`)
24
24
  - **stop_on_invalid_record**: stop bulk load transaction if an invalid record is found (boolean, default is `false`)
25
25
 
26
26
  ## Example
@@ -37,33 +37,35 @@ in:
37
37
  type: file
38
38
  path_prefix: example/example.jsonl
39
39
  parser:
40
- type: jsonl
40
+ type: jsonl # not json parser
41
41
  columns:
42
42
  - {name: timestamp, type: string}
43
43
  - {name: nested, type: json}
44
44
  filters:
45
45
  - type: timestamp_format
46
+ default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
46
47
  default_to_timezone: "Asia/Tokyo"
47
48
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
48
49
  columns:
49
- - {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
50
- - {name: $.nested.timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
50
+ - {name: timestamp, type: long, to_unit: ms}
51
+ - {name: $.nested.timestamp}
52
+ out:
51
53
  type: stdout
52
54
  ```
53
55
 
54
56
  Output will be as:
55
57
 
56
58
  ```
57
- {"timestamp":"2015-07-13 00:00:00.0","nested":{"timestamp":"2015-07-13 00:00:00.0}}
58
- {"timestamp":"2015-07-13 00:00:00.1","nested":{"timestamp":"2015-07-13 00:00:00.1}}
59
+ {"timestamp":1436713200000,"nested":{"timestamp":"2015-07-13 00:00:00.0"}}
60
+ {"timestamp":1436713200100,"nested":{"timestamp":"2015-07-13 00:00:00.1"}}
59
61
  ```
60
62
 
61
63
  See [./example](./example) for more examples.
62
64
 
63
- ## Timestamp Parser/Formatter Performance Issue
65
+ ## JRuby Timestamp Parser/Formatter Performance Issue
64
66
 
65
67
  Embulk's timestamp parser/formatter originally uses a JRuby implementation, which is slow.
66
- To improve performance, this plugin also supports Java's [SimpleDateFormat](https://docs.oracle.com/javase/jp/6/api/java/text/SimpleDateFormat.html) format as:
68
+ To improve performance, this plugin also supports Java's Joda-Time [DateTimeFormat](http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) format as:
67
69
 
68
70
  ```yaml
69
71
  in:
@@ -76,12 +78,11 @@ in:
76
78
  - {name: nested, type: json}
77
79
  filters:
78
80
  - type: timestamp_format
79
- default_from_timezone: "Asia/Taipei"
80
81
  default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.SSS z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
81
82
  default_to_timezone: "Asia/Taipei"
82
83
  default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
83
84
  columns:
84
- - {name: timestamp}
85
+ - {name: timestamp, type: long, to_unit: ms}
85
86
  - {name: $.nested.timestamp}
86
87
  out:
87
88
  type: stdout
@@ -93,13 +94,14 @@ If format strings contain `%`, jruby parser/formatter is used. Otherwise, java
93
94
 
94
95
  Benchmark test sets are available at [./bench](./bench). In my environment (MacBook Pro), for 1000000 timestamps:
95
96
 
96
- * jruby parser/formatter: 65.06s
97
- * java parser/formatter: 1.3s
97
+ * java parser / java formatter: 1.3s
98
+ * java parser / jruby formatter: 1.4s
99
+ * jruby parser / java formatter: 64.52s
100
+ * jruby parser / jruby formatter: 65.06s
98
101
 
99
102
  **NOTICE:**
100
103
 
101
- * JRuby parser has micro second resolution, but Java parser (SimpleDateFormat) has only milli second resolution
102
- * `S` requires three digits always. For example, `yyyy-MM-dd HH:mm::ss.S` for `2015-12-17 01:02:03.1` gives 001 milli seconds wrongly, but it is the specification of SimpleDateFormat.
104
+ * JRuby parser has microsecond resolution, but Java parser (Joda-Time) has only millisecond resolution
103
105
 
104
106
  ## ToDo
105
107
 
@@ -9,6 +9,5 @@ filters:
9
9
  - type: timestamp_format
10
10
  columns:
11
11
  - {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "yyyy-MM-dd"}
12
-
13
12
  out:
14
13
  type: "null"
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: bench/dummy
4
+ parser:
5
+ type: csv
6
+ columns:
7
+ - {name: timestamp, type: string}
8
+ filters:
9
+ - type: timestamp_format
10
+ columns:
11
+ - {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "%Y-%m-%d"}
12
+ out:
13
+ type: "null"
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: bench/dummy
4
+ parser:
5
+ type: csv
6
+ columns:
7
+ - {name: timestamp, type: string}
8
+ filters:
9
+ - type: timestamp_format
10
+ columns:
11
+ - {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N"], to_format: "yyyy-MM-dd"}
12
+ out:
13
+ type: "null"
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.6"
16
+ version = "0.1.7"
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
data/example/example.yml CHANGED
@@ -8,7 +8,7 @@ filters:
8
8
  default_to_timezone: "Asia/Tokyo"
9
9
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
10
10
  columns:
11
- - {name: "$.record.timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
11
+ - {name: "$.record.timestamp", type: long, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"], to_unit: ms}
12
12
  - {name: "$.record.nested.nested[0].timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
13
13
  out:
14
14
  type: "null"
data/example/string.csv CHANGED
@@ -1,3 +1,5 @@
1
+ 2015-07-13,2015-07-13,2015-07-13,2015-07-13
2
+ 2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC
1
3
  2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00
2
4
  2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC
3
5
  2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC
data/example/string.yml CHANGED
@@ -11,7 +11,7 @@ in:
11
11
  filters:
12
12
  - type: timestamp_format
13
13
  default_from_timezone: "Asia/Taipei"
14
- default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
14
+ default_from_timestamp_format: ["%Y-%m-%d", "%Y-%m-%d %z", "%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
15
15
  default_to_timezone: "Asia/Taipei"
16
16
  default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
17
17
  columns:
@@ -11,7 +11,7 @@ in:
11
11
  filters:
12
12
  - type: timestamp_format
13
13
  default_from_timezone: "Asia/Taipei"
14
- default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"] # SSS must be three digit ...
14
+ default_from_timestamp_format: ["yyyy-MM-dd", "yyyy-MM-dd z", "yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
15
15
  default_to_timezone: "Asia/Taipei"
16
16
  default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
17
17
  columns:
@@ -15,9 +15,9 @@ import org.joda.time.DateTimeZone;
15
15
  import org.jruby.embed.ScriptingContainer;
16
16
  import org.jruby.util.RubyDateFormat;
17
17
 
18
- import java.text.SimpleDateFormat;
19
- import java.util.Date;
20
18
  import java.util.Locale;
19
+ import org.joda.time.format.DateTimeFormat;
20
+ import org.joda.time.format.DateTimeFormatter;
21
21
 
22
22
  public class TimestampFormatter
23
23
  {
@@ -44,7 +44,7 @@ public class TimestampFormatter
44
44
  }
45
45
 
46
46
  private final RubyDateFormat jrubyFormatter;
47
- private final SimpleDateFormat javaFormatter;
47
+ private final DateTimeFormatter javaFormatter;
48
48
  private final DateTimeZone toTimeZone;
49
49
 
50
50
  public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
@@ -67,8 +67,7 @@ public class TimestampFormatter
67
67
  }
68
68
  else {
69
69
  this.jrubyFormatter = null;
70
- this.javaFormatter = new SimpleDateFormat(format, Locale.ENGLISH);
71
- javaFormatter.setTimeZone(toTimeZone.toTimeZone());
70
+ this.javaFormatter = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(toTimeZone);
72
71
  }
73
72
  }
74
73
 
@@ -108,6 +107,6 @@ public class TimestampFormatter
108
107
  private String javaFormat(Timestamp value)
109
108
  {
110
109
  long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
111
- return javaFormatter.format(milliSecond);
110
+ return javaFormatter.print(milliSecond);
112
111
  }
113
112
  }
@@ -14,14 +14,16 @@ import org.embulk.spi.time.Timestamp;
14
14
  import static org.embulk.spi.time.TimestampFormat.parseDateTimeZone;
15
15
 
16
16
  import org.embulk.spi.time.TimestampParseException;
17
+ import org.joda.time.DateTime;
17
18
  import org.joda.time.DateTimeZone;
19
+ import org.joda.time.format.DateTimeFormatter;
18
20
  import org.jruby.embed.ScriptingContainer;
19
21
 
20
- import java.text.ParseException;
21
- import java.text.SimpleDateFormat;
22
22
  import java.util.ArrayList;
23
23
  import java.util.List;
24
- import java.util.TimeZone;
24
+ import java.util.Locale;
25
+
26
+ import org.joda.time.format.DateTimeFormat;
25
27
 
26
28
  public class TimestampParser {
27
29
  public interface Task {
@@ -44,8 +46,8 @@ public class TimestampParser {
44
46
  Optional<List<String>> getFromFormat();
45
47
  }
46
48
 
47
- private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<JRubyTimeParserHelper>();
48
- private final List<SimpleDateFormat> javaParserList = new ArrayList<SimpleDateFormat>();
49
+ private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<>();
50
+ private final List<DateTimeFormatter> javaParserList = new ArrayList<>();
49
51
  private final DateTimeZone defaultFromTimeZone;
50
52
 
51
53
  TimestampParser(PluginTask task) {
@@ -60,14 +62,14 @@ public class TimestampParser {
60
62
 
61
63
  public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone) {
62
64
  JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
65
+
63
66
  // TODO get default current time from ExecTask.getExecTimestamp
64
67
  for (String format : formatList) {
65
68
  if (format.contains("%")) {
66
69
  JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
67
70
  this.jrubyParserList.add(helper);
68
71
  } else {
69
- SimpleDateFormat parser = new SimpleDateFormat(format);
70
- parser.setTimeZone(defaultFromTimeZone.toTimeZone());
72
+ DateTimeFormatter parser = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(defaultFromTimeZone);
71
73
  this.javaParserList.add(parser);
72
74
  }
73
75
  }
@@ -78,7 +80,7 @@ public class TimestampParser {
78
80
  return defaultFromTimeZone;
79
81
  }
80
82
 
81
- public Timestamp parse(String text) throws TimestampParseException, ParseException {
83
+ public Timestamp parse(String text) throws TimestampParseException, IllegalArgumentException {
82
84
  if (!jrubyParserList.isEmpty()) {
83
85
  return jrubyParse(text);
84
86
  } else if (!javaParserList.isEmpty()) {
@@ -124,21 +126,22 @@ public class TimestampParser {
124
126
  return Timestamp.ofEpochSecond(sec, usec * 1000);
125
127
  }
126
128
 
127
- private Timestamp javaParse(String text) throws ParseException {
128
- long msec = -1;
129
- ParseException exception = null;
129
+ private Timestamp javaParse(String text) throws IllegalArgumentException {
130
+ DateTime dateTime = null;
131
+ IllegalArgumentException exception = null;
130
132
 
131
- for (SimpleDateFormat parser : javaParserList) {
133
+ for (DateTimeFormatter parser : javaParserList) {
132
134
  try {
133
- msec = parser.parse(text).getTime(); // NOTE: milli second resolution
135
+ dateTime = parser.parseDateTime(text);
134
136
  break;
135
- } catch (ParseException ex) {
137
+ } catch (IllegalArgumentException ex) {
136
138
  exception = ex;
137
139
  }
138
140
  }
139
- if (msec == -1) {
141
+ if (dateTime == null) {
140
142
  throw exception;
141
143
  }
144
+ long msec = dateTime.getMillis(); // NOTE: milli second resolution
142
145
 
143
146
  long nanoAdjustment = msec * 1000000;
144
147
  return Timestamp.ofEpochSecond(0, nanoAdjustment);
@@ -7,8 +7,6 @@ import org.embulk.spi.DataException;
7
7
  import org.embulk.spi.time.Timestamp;
8
8
  import org.embulk.spi.time.TimestampParseException;
9
9
 
10
- import java.text.ParseException;
11
-
12
10
  public class StringCast
13
11
  {
14
12
  private StringCast() {}
@@ -27,7 +25,7 @@ public class StringCast
27
25
  catch (TimestampParseException ex) {
28
26
  throw new DataException(buildErrorMessage(value), ex);
29
27
  }
30
- catch (ParseException ex) {
28
+ catch (IllegalArgumentException ex) {
31
29
  throw new DataException(buildErrorMessage(value), ex);
32
30
  }
33
31
  }
@@ -40,7 +38,7 @@ public class StringCast
40
38
  catch (TimestampParseException ex) {
41
39
  throw new DataException(buildErrorMessage(value), ex);
42
40
  }
43
- catch (ParseException ex) {
41
+ catch (IllegalArgumentException ex) {
44
42
  throw new DataException(buildErrorMessage(value), ex);
45
43
  }
46
44
  }
@@ -54,7 +52,7 @@ public class StringCast
54
52
  catch (TimestampParseException ex) {
55
53
  throw new DataException(buildErrorMessage(value), ex);
56
54
  }
57
- catch (ParseException ex) {
55
+ catch (IllegalArgumentException ex) {
58
56
  throw new DataException(buildErrorMessage(value), ex);
59
57
  }
60
58
  }
@@ -68,7 +66,7 @@ public class StringCast
68
66
  catch (TimestampParseException ex) {
69
67
  throw new DataException(buildErrorMessage(value), ex);
70
68
  }
71
- catch (ParseException ex) {
69
+ catch (IllegalArgumentException ex) {
72
70
  throw new DataException(buildErrorMessage(value), ex);
73
71
  }
74
72
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-timestamp_format
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naotoshi Seo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-01 00:00:00.000000000 Z
11
+ date: 2016-05-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,8 @@ files:
52
52
  - README.md
53
53
  - bench/config_java.yml
54
54
  - bench/config_jruby.yml
55
+ - bench/config_jruby_formatter.yml
56
+ - bench/config_jruby_parser.yml
55
57
  - bench/gen_dummy.rb
56
58
  - build.gradle
57
59
  - config/checkstyle/checkstyle.xml
@@ -93,7 +95,7 @@ files:
93
95
  - src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java
94
96
  - src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java
95
97
  - src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java
96
- - classpath/embulk-filter-timestamp_format-0.1.6.jar
98
+ - classpath/embulk-filter-timestamp_format-0.1.7.jar
97
99
  homepage: https://github.com/sonots/embulk-filter-timestamp_format
98
100
  licenses:
99
101
  - MIT