embulk-filter-timestamp_format 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +18 -16
- data/bench/config_java.yml +0 -1
- data/bench/config_jruby_formatter.yml +13 -0
- data/bench/config_jruby_parser.yml +13 -0
- data/build.gradle +1 -1
- data/example/example.yml +1 -1
- data/example/string.csv +2 -0
- data/example/string.yml +1 -1
- data/example/string_java.yml +1 -1
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java +5 -6
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +18 -15
- data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +4 -6
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0086188ce09565733308b14cebcfd784df0a6a4e
|
4
|
+
data.tar.gz: 8d974fbc9276c45f07541f62f79f725b375f2d89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcbe6f58659fa857ddf7237aa9534bcb392ff21d60e1e31abe267414b570a93b44bd347839361a207add780306ceb682e6e3f14e93181b3b1741fd497078bf86
|
7
|
+
data.tar.gz: 2b13d3aab703ba5dfd53f41587d20322595fee6b3af5603f8744c4955b1b32ce2525d022a46bda85d5567de8a687ef1718e3aa064094d278a20406fdf94db6c0
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
# 0.1.7 (2016-05-09)
|
2
|
+
|
3
|
+
Enhancements:
|
4
|
+
|
5
|
+
* Use Joda-Time DateTimeFormat instead of SimpleDateFormat for Java timestamp parser/formatter
|
6
|
+
* to be thread-safe
|
7
|
+
* to fix `ss.SSS` wrongly resolving 1.1 as 1.001 seconds
|
8
|
+
|
1
9
|
# 0.1.6 (2016-05-01)
|
2
10
|
|
3
11
|
Enhancements:
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ A filter plugin for Embulk to change timestamp format
|
|
8
8
|
|
9
9
|
- **columns**: columns to retain (array of hash)
|
10
10
|
- **name**: name of column (required)
|
11
|
-
- **type**: type to cast
|
11
|
+
- **type**: type to cast, choose one of `string`, `timestamp`, `long` (unixtimestamp), `double` (unixtimestamp) (string, default is `string`)
|
12
12
|
- **from_format**: specify the format of the input string (array of strings, default is default_from_timestamp_format)
|
13
13
|
- **from_timezone**: specify the timezone of the input string (string, default is default_from_timezone)
|
14
14
|
- **to_format**: specify the format of the output string (string, default is default_to_timestamp_format)
|
@@ -19,8 +19,8 @@ A filter plugin for Embulk to change timestamp format
|
|
19
19
|
- **default_from_timezone**: default timezone for the input string (string, default is `UTC`)
|
20
20
|
- **default_to_timestamp_format**: default timestamp format for the output string (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
21
21
|
- **default_to_timezone**: default timezone for the output string (string, default is `UTC`)
|
22
|
-
- **
|
23
|
-
- **
|
22
|
+
- **default_from_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the input unixtimestamp (string, default is `second`)
|
23
|
+
- **default_to_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the output unixtimestamp (string, default is `second`)
|
24
24
|
- **stop_on_invalid_record**: stop bulk load transaction if an invalid record is found (boolean, default is `false`)
|
25
25
|
|
26
26
|
## Example
|
@@ -37,33 +37,35 @@ in:
|
|
37
37
|
type: file
|
38
38
|
path_prefix: example/example.jsonl
|
39
39
|
parser:
|
40
|
-
type: jsonl
|
40
|
+
type: jsonl # not json parser
|
41
41
|
columns:
|
42
42
|
- {name: timestamp, type: string}
|
43
43
|
- {name: nested, type: json}
|
44
44
|
filters:
|
45
45
|
- type: timestamp_format
|
46
|
+
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
|
46
47
|
default_to_timezone: "Asia/Tokyo"
|
47
48
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
48
49
|
columns:
|
49
|
-
- {name: timestamp,
|
50
|
-
- {name: $.nested.timestamp
|
50
|
+
- {name: timestamp, type: long, to_unit: ms}
|
51
|
+
- {name: $.nested.timestamp}
|
52
|
+
out:
|
51
53
|
type: stdout
|
52
54
|
```
|
53
55
|
|
54
56
|
Output will be as:
|
55
57
|
|
56
58
|
```
|
57
|
-
{"timestamp":
|
58
|
-
{"timestamp":
|
59
|
+
{"timestamp":1436713200000,"nested":{"timestamp":"2015-07-13 00:00:00.0"}}
|
60
|
+
{"timestamp":1436713200100,"nested":{"timestamp":"2015-07-13 00:00:00.1"}}
|
59
61
|
```
|
60
62
|
|
61
63
|
See [./example](./example) for more examples.
|
62
64
|
|
63
|
-
## Timestamp Parser/Formatter Performance Issue
|
65
|
+
## JRuby Timestamp Parser/Formatter Performance Issue
|
64
66
|
|
65
67
|
Embulk's timestamp parser/formatter originally uses a JRuby implementation, which is slow.
|
66
|
-
To improve performance, this plugin also supports Java's [
|
68
|
+
To improve performance, this plugin also supports Java's Joda-Time [DateTimeFormat](http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) format as:
|
67
69
|
|
68
70
|
```yaml
|
69
71
|
in:
|
@@ -76,12 +78,11 @@ in:
|
|
76
78
|
- {name: nested, type: json}
|
77
79
|
filters:
|
78
80
|
- type: timestamp_format
|
79
|
-
default_from_timezone: "Asia/Taipei"
|
80
81
|
default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.SSS z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
81
82
|
default_to_timezone: "Asia/Taipei"
|
82
83
|
default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
|
83
84
|
columns:
|
84
|
-
- {name: timestamp}
|
85
|
+
- {name: timestamp, type: long, to_unit: ms}
|
85
86
|
- {name: $.nested.timestamp}
|
86
87
|
out:
|
87
88
|
type: stdout
|
@@ -93,13 +94,14 @@ If format strings contain `%`, jruby parser/formatter is used. Otherwise, java
|
|
93
94
|
|
94
95
|
Benchmark test sets are available at [./bench](./bench). In my environment (MacBook Pro), for 1000000 timestamps:
|
95
96
|
|
96
|
-
*
|
97
|
-
* java parser/formatter: 1.
|
97
|
+
* java parser / java formatter: 1.3s
|
98
|
+
* java parser / jruby formatter: 1.4s
|
99
|
+
* jruby parser / java formatter: 64.52s
|
100
|
+
* jruby parser / jruby formatter: 65.06s
|
98
101
|
|
99
102
|
**NOTICE:**
|
100
103
|
|
101
|
-
* JRuby parser has micro second resolution, but Java parser (
|
102
|
-
* `S` requires three digits always. For example, `yyyy-MM-dd HH:mm::ss.S` for `2015-12-17 01:02:03.1` gives 001 milli seconds wrongly, but it is the specification of SimpleDateFormat.
|
104
|
+
* JRuby parser has microsecond resolution, but Java parser (Joda-Time) has only millisecond resolution
|
103
105
|
|
104
106
|
## ToDo
|
105
107
|
|
data/bench/config_java.yml
CHANGED
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: bench/dummy
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
columns:
|
7
|
+
- {name: timestamp, type: string}
|
8
|
+
filters:
|
9
|
+
- type: timestamp_format
|
10
|
+
columns:
|
11
|
+
- {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "%Y-%m-%d"}
|
12
|
+
out:
|
13
|
+
type: "null"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: bench/dummy
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
columns:
|
7
|
+
- {name: timestamp, type: string}
|
8
|
+
filters:
|
9
|
+
- type: timestamp_format
|
10
|
+
columns:
|
11
|
+
- {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N"], to_format: "yyyy-MM-dd"}
|
12
|
+
out:
|
13
|
+
type: "null"
|
data/build.gradle
CHANGED
data/example/example.yml
CHANGED
@@ -8,7 +8,7 @@ filters:
|
|
8
8
|
default_to_timezone: "Asia/Tokyo"
|
9
9
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
10
10
|
columns:
|
11
|
-
- {name: "$.record.timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
11
|
+
- {name: "$.record.timestamp", type: long, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"], to_unit: ms}
|
12
12
|
- {name: "$.record.nested.nested[0].timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
13
13
|
out:
|
14
14
|
type: "null"
|
data/example/string.csv
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
2015-07-13,2015-07-13,2015-07-13,2015-07-13
|
2
|
+
2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC
|
1
3
|
2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00
|
2
4
|
2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC
|
3
5
|
2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC
|
data/example/string.yml
CHANGED
@@ -11,7 +11,7 @@ in:
|
|
11
11
|
filters:
|
12
12
|
- type: timestamp_format
|
13
13
|
default_from_timezone: "Asia/Taipei"
|
14
|
-
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
|
14
|
+
default_from_timestamp_format: ["%Y-%m-%d", "%Y-%m-%d %z", "%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
|
15
15
|
default_to_timezone: "Asia/Taipei"
|
16
16
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
17
17
|
columns:
|
data/example/string_java.yml
CHANGED
@@ -11,7 +11,7 @@ in:
|
|
11
11
|
filters:
|
12
12
|
- type: timestamp_format
|
13
13
|
default_from_timezone: "Asia/Taipei"
|
14
|
-
default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
14
|
+
default_from_timestamp_format: ["yyyy-MM-dd", "yyyy-MM-dd z", "yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
15
15
|
default_to_timezone: "Asia/Taipei"
|
16
16
|
default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
|
17
17
|
columns:
|
@@ -15,9 +15,9 @@ import org.joda.time.DateTimeZone;
|
|
15
15
|
import org.jruby.embed.ScriptingContainer;
|
16
16
|
import org.jruby.util.RubyDateFormat;
|
17
17
|
|
18
|
-
import java.text.SimpleDateFormat;
|
19
|
-
import java.util.Date;
|
20
18
|
import java.util.Locale;
|
19
|
+
import org.joda.time.format.DateTimeFormat;
|
20
|
+
import org.joda.time.format.DateTimeFormatter;
|
21
21
|
|
22
22
|
public class TimestampFormatter
|
23
23
|
{
|
@@ -44,7 +44,7 @@ public class TimestampFormatter
|
|
44
44
|
}
|
45
45
|
|
46
46
|
private final RubyDateFormat jrubyFormatter;
|
47
|
-
private final
|
47
|
+
private final DateTimeFormatter javaFormatter;
|
48
48
|
private final DateTimeZone toTimeZone;
|
49
49
|
|
50
50
|
public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
|
@@ -67,8 +67,7 @@ public class TimestampFormatter
|
|
67
67
|
}
|
68
68
|
else {
|
69
69
|
this.jrubyFormatter = null;
|
70
|
-
this.javaFormatter =
|
71
|
-
javaFormatter.setTimeZone(toTimeZone.toTimeZone());
|
70
|
+
this.javaFormatter = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(toTimeZone);
|
72
71
|
}
|
73
72
|
}
|
74
73
|
|
@@ -108,6 +107,6 @@ public class TimestampFormatter
|
|
108
107
|
private String javaFormat(Timestamp value)
|
109
108
|
{
|
110
109
|
long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
|
111
|
-
return javaFormatter.
|
110
|
+
return javaFormatter.print(milliSecond);
|
112
111
|
}
|
113
112
|
}
|
@@ -14,14 +14,16 @@ import org.embulk.spi.time.Timestamp;
|
|
14
14
|
import static org.embulk.spi.time.TimestampFormat.parseDateTimeZone;
|
15
15
|
|
16
16
|
import org.embulk.spi.time.TimestampParseException;
|
17
|
+
import org.joda.time.DateTime;
|
17
18
|
import org.joda.time.DateTimeZone;
|
19
|
+
import org.joda.time.format.DateTimeFormatter;
|
18
20
|
import org.jruby.embed.ScriptingContainer;
|
19
21
|
|
20
|
-
import java.text.ParseException;
|
21
|
-
import java.text.SimpleDateFormat;
|
22
22
|
import java.util.ArrayList;
|
23
23
|
import java.util.List;
|
24
|
-
import java.util.
|
24
|
+
import java.util.Locale;
|
25
|
+
|
26
|
+
import org.joda.time.format.DateTimeFormat;
|
25
27
|
|
26
28
|
public class TimestampParser {
|
27
29
|
public interface Task {
|
@@ -44,8 +46,8 @@ public class TimestampParser {
|
|
44
46
|
Optional<List<String>> getFromFormat();
|
45
47
|
}
|
46
48
|
|
47
|
-
private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList
|
48
|
-
private final List<
|
49
|
+
private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<>();
|
50
|
+
private final List<DateTimeFormatter> javaParserList = new ArrayList<>();
|
49
51
|
private final DateTimeZone defaultFromTimeZone;
|
50
52
|
|
51
53
|
TimestampParser(PluginTask task) {
|
@@ -60,14 +62,14 @@ public class TimestampParser {
|
|
60
62
|
|
61
63
|
public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone) {
|
62
64
|
JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
|
65
|
+
|
63
66
|
// TODO get default current time from ExecTask.getExecTimestamp
|
64
67
|
for (String format : formatList) {
|
65
68
|
if (format.contains("%")) {
|
66
69
|
JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
|
67
70
|
this.jrubyParserList.add(helper);
|
68
71
|
} else {
|
69
|
-
|
70
|
-
parser.setTimeZone(defaultFromTimeZone.toTimeZone());
|
72
|
+
DateTimeFormatter parser = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(defaultFromTimeZone);
|
71
73
|
this.javaParserList.add(parser);
|
72
74
|
}
|
73
75
|
}
|
@@ -78,7 +80,7 @@ public class TimestampParser {
|
|
78
80
|
return defaultFromTimeZone;
|
79
81
|
}
|
80
82
|
|
81
|
-
public Timestamp parse(String text) throws TimestampParseException,
|
83
|
+
public Timestamp parse(String text) throws TimestampParseException, IllegalArgumentException {
|
82
84
|
if (!jrubyParserList.isEmpty()) {
|
83
85
|
return jrubyParse(text);
|
84
86
|
} else if (!javaParserList.isEmpty()) {
|
@@ -124,21 +126,22 @@ public class TimestampParser {
|
|
124
126
|
return Timestamp.ofEpochSecond(sec, usec * 1000);
|
125
127
|
}
|
126
128
|
|
127
|
-
private Timestamp javaParse(String text) throws
|
128
|
-
|
129
|
-
|
129
|
+
private Timestamp javaParse(String text) throws IllegalArgumentException {
|
130
|
+
DateTime dateTime = null;
|
131
|
+
IllegalArgumentException exception = null;
|
130
132
|
|
131
|
-
for (
|
133
|
+
for (DateTimeFormatter parser : javaParserList) {
|
132
134
|
try {
|
133
|
-
|
135
|
+
dateTime = parser.parseDateTime(text);
|
134
136
|
break;
|
135
|
-
} catch (
|
137
|
+
} catch (IllegalArgumentException ex) {
|
136
138
|
exception = ex;
|
137
139
|
}
|
138
140
|
}
|
139
|
-
if (
|
141
|
+
if (dateTime == null) {
|
140
142
|
throw exception;
|
141
143
|
}
|
144
|
+
long msec = dateTime.getMillis(); // NOTE: milli second resolution
|
142
145
|
|
143
146
|
long nanoAdjustment = msec * 1000000;
|
144
147
|
return Timestamp.ofEpochSecond(0, nanoAdjustment);
|
@@ -7,8 +7,6 @@ import org.embulk.spi.DataException;
|
|
7
7
|
import org.embulk.spi.time.Timestamp;
|
8
8
|
import org.embulk.spi.time.TimestampParseException;
|
9
9
|
|
10
|
-
import java.text.ParseException;
|
11
|
-
|
12
10
|
public class StringCast
|
13
11
|
{
|
14
12
|
private StringCast() {}
|
@@ -27,7 +25,7 @@ public class StringCast
|
|
27
25
|
catch (TimestampParseException ex) {
|
28
26
|
throw new DataException(buildErrorMessage(value), ex);
|
29
27
|
}
|
30
|
-
catch (
|
28
|
+
catch (IllegalArgumentException ex) {
|
31
29
|
throw new DataException(buildErrorMessage(value), ex);
|
32
30
|
}
|
33
31
|
}
|
@@ -40,7 +38,7 @@ public class StringCast
|
|
40
38
|
catch (TimestampParseException ex) {
|
41
39
|
throw new DataException(buildErrorMessage(value), ex);
|
42
40
|
}
|
43
|
-
catch (
|
41
|
+
catch (IllegalArgumentException ex) {
|
44
42
|
throw new DataException(buildErrorMessage(value), ex);
|
45
43
|
}
|
46
44
|
}
|
@@ -54,7 +52,7 @@ public class StringCast
|
|
54
52
|
catch (TimestampParseException ex) {
|
55
53
|
throw new DataException(buildErrorMessage(value), ex);
|
56
54
|
}
|
57
|
-
catch (
|
55
|
+
catch (IllegalArgumentException ex) {
|
58
56
|
throw new DataException(buildErrorMessage(value), ex);
|
59
57
|
}
|
60
58
|
}
|
@@ -68,7 +66,7 @@ public class StringCast
|
|
68
66
|
catch (TimestampParseException ex) {
|
69
67
|
throw new DataException(buildErrorMessage(value), ex);
|
70
68
|
}
|
71
|
-
catch (
|
69
|
+
catch (IllegalArgumentException ex) {
|
72
70
|
throw new DataException(buildErrorMessage(value), ex);
|
73
71
|
}
|
74
72
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-timestamp_format
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,8 @@ files:
|
|
52
52
|
- README.md
|
53
53
|
- bench/config_java.yml
|
54
54
|
- bench/config_jruby.yml
|
55
|
+
- bench/config_jruby_formatter.yml
|
56
|
+
- bench/config_jruby_parser.yml
|
55
57
|
- bench/gen_dummy.rb
|
56
58
|
- build.gradle
|
57
59
|
- config/checkstyle/checkstyle.xml
|
@@ -93,7 +95,7 @@ files:
|
|
93
95
|
- src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java
|
94
96
|
- src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java
|
95
97
|
- src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java
|
96
|
-
- classpath/embulk-filter-timestamp_format-0.1.
|
98
|
+
- classpath/embulk-filter-timestamp_format-0.1.7.jar
|
97
99
|
homepage: https://github.com/sonots/embulk-filter-timestamp_format
|
98
100
|
licenses:
|
99
101
|
- MIT
|