embulk-filter-timestamp_format 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +18 -16
- data/bench/config_java.yml +0 -1
- data/bench/config_jruby_formatter.yml +13 -0
- data/bench/config_jruby_parser.yml +13 -0
- data/build.gradle +1 -1
- data/example/example.yml +1 -1
- data/example/string.csv +2 -0
- data/example/string.yml +1 -1
- data/example/string_java.yml +1 -1
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java +5 -6
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +18 -15
- data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +4 -6
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0086188ce09565733308b14cebcfd784df0a6a4e
|
4
|
+
data.tar.gz: 8d974fbc9276c45f07541f62f79f725b375f2d89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcbe6f58659fa857ddf7237aa9534bcb392ff21d60e1e31abe267414b570a93b44bd347839361a207add780306ceb682e6e3f14e93181b3b1741fd497078bf86
|
7
|
+
data.tar.gz: 2b13d3aab703ba5dfd53f41587d20322595fee6b3af5603f8744c4955b1b32ce2525d022a46bda85d5567de8a687ef1718e3aa064094d278a20406fdf94db6c0
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
# 0.1.7 (2016-05-09)
|
2
|
+
|
3
|
+
Enhancements:
|
4
|
+
|
5
|
+
* Use Joda-Time DateTimeFormat instead of SimpleDateFormat for Java timestamp parser/formatter
|
6
|
+
* to be thread-safe
|
7
|
+
* to fix `ss.SSS` wrongly resolving 1.1 as 1.001 seconds
|
8
|
+
|
1
9
|
# 0.1.6 (2016-05-01)
|
2
10
|
|
3
11
|
Enhancements:
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ A filter plugin for Embulk to change timestamp format
|
|
8
8
|
|
9
9
|
- **columns**: columns to retain (array of hash)
|
10
10
|
- **name**: name of column (required)
|
11
|
-
- **type**: type to cast
|
11
|
+
- **type**: type to cast, choose one of `string`, `timestamp`, `long` (unixtimestamp), `double` (unixtimestamp) (string, default is `string`)
|
12
12
|
- **from_format**: specify the format of the input string (array of strings, default is default_from_timestamp_format)
|
13
13
|
- **from_timezone**: specify the timezone of the input string (string, default is default_from_timezone)
|
14
14
|
- **to_format**: specify the format of the output string (string, default is default_to_timestamp_format)
|
@@ -19,8 +19,8 @@ A filter plugin for Embulk to change timestamp format
|
|
19
19
|
- **default_from_timezone**: default timezone for the input string (string, default is `UTC`)
|
20
20
|
- **default_to_timestamp_format**: default timestamp format for the output string (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
21
21
|
- **default_to_timezone**: default timezone for the output string (string, default is `UTC`)
|
22
|
-
- **
|
23
|
-
- **
|
22
|
+
- **default_from_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the input unixtimestamp (string, default is `second`)
|
23
|
+
- **default_to_timestamp_unit**: default time unit such as `second`, `ms`, `us`, `ns` for the output unixtimestamp (string, default is `second`)
|
24
24
|
- **stop_on_invalid_record**: stop bulk load transaction if an invalid record is found (boolean, default is `false`)
|
25
25
|
|
26
26
|
## Example
|
@@ -37,33 +37,35 @@ in:
|
|
37
37
|
type: file
|
38
38
|
path_prefix: example/example.jsonl
|
39
39
|
parser:
|
40
|
-
type: jsonl
|
40
|
+
type: jsonl # not json parser
|
41
41
|
columns:
|
42
42
|
- {name: timestamp, type: string}
|
43
43
|
- {name: nested, type: json}
|
44
44
|
filters:
|
45
45
|
- type: timestamp_format
|
46
|
+
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
|
46
47
|
default_to_timezone: "Asia/Tokyo"
|
47
48
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
48
49
|
columns:
|
49
|
-
- {name: timestamp,
|
50
|
-
- {name: $.nested.timestamp
|
50
|
+
- {name: timestamp, type: long, to_unit: ms}
|
51
|
+
- {name: $.nested.timestamp}
|
52
|
+
out:
|
51
53
|
type: stdout
|
52
54
|
```
|
53
55
|
|
54
56
|
Output will be as:
|
55
57
|
|
56
58
|
```
|
57
|
-
{"timestamp":
|
58
|
-
{"timestamp":
|
59
|
+
{"timestamp":1436713200000,"nested":{"timestamp":"2015-07-13 00:00:00.0"}}
|
60
|
+
{"timestamp":1436713200100,"nested":{"timestamp":"2015-07-13 00:00:00.1"}}
|
59
61
|
```
|
60
62
|
|
61
63
|
See [./example](./example) for more examples.
|
62
64
|
|
63
|
-
## Timestamp Parser/Formatter Performance Issue
|
65
|
+
## JRuby Timestamp Parser/Formatter Performance Issue
|
64
66
|
|
65
67
|
Embulk's timestamp parser/formatter originally uses a JRuby implementation, but it is slow.
|
66
|
-
To improve performance, this plugin also supports Java's [
|
68
|
+
To improve performance, this plugin also supports Java's Joda-Time [DateTimeFormat](http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) format as:
|
67
69
|
|
68
70
|
```yaml
|
69
71
|
in:
|
@@ -76,12 +78,11 @@ in:
|
|
76
78
|
- {name: nested, type: json}
|
77
79
|
filters:
|
78
80
|
- type: timestamp_format
|
79
|
-
default_from_timezone: "Asia/Taipei"
|
80
81
|
default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.SSS z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
81
82
|
default_to_timezone: "Asia/Taipei"
|
82
83
|
default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
|
83
84
|
columns:
|
84
|
-
- {name: timestamp}
|
85
|
+
- {name: timestamp, type: long, to_unit: ms}
|
85
86
|
- {name: $.nested.timestamp}
|
86
87
|
out:
|
87
88
|
type: stdout
|
@@ -93,13 +94,14 @@ If format strings contain `%`, jruby parser/formatter is used. Otherwise, java
|
|
93
94
|
|
94
95
|
Benchmark test sets are available at [./bench](./bench). In my environment (MacBook Pro), for 1000000 timestamps:
|
95
96
|
|
96
|
-
*
|
97
|
-
* java parser/formatter: 1.
|
97
|
+
* java parser / java formatter: 1.3s
|
98
|
+
* java parser / jruby formatter: 1.4s
|
99
|
+
* jruby parser / java formatter: 64.52s
|
100
|
+
* jruby parser / jruby formatter: 65.06s
|
98
101
|
|
99
102
|
**NOTICE:**
|
100
103
|
|
101
|
-
* JRuby parser has micro second resolution, but Java parser (
|
102
|
-
* `S` requires three digits always. For example, `yyyy-MM-dd HH:mm::ss.S` for `2015-12-17 01:02:03.1` gives 001 milli seconds wrongly, but it is the specification of SimpleDateFormat.
|
104
|
+
* JRuby parser has microsecond resolution, but Java parser (Joda-Time) has only millisecond resolution
|
103
105
|
|
104
106
|
## ToDo
|
105
107
|
|
data/bench/config_java.yml
CHANGED
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: bench/dummy
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
columns:
|
7
|
+
- {name: timestamp, type: string}
|
8
|
+
filters:
|
9
|
+
- type: timestamp_format
|
10
|
+
columns:
|
11
|
+
- {name: timestamp, from_format: ["yyyy-MM-dd hh:mm:ss.SSS"], to_format: "%Y-%m-%d"}
|
12
|
+
out:
|
13
|
+
type: "null"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: bench/dummy
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
columns:
|
7
|
+
- {name: timestamp, type: string}
|
8
|
+
filters:
|
9
|
+
- type: timestamp_format
|
10
|
+
columns:
|
11
|
+
- {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N"], to_format: "yyyy-MM-dd"}
|
12
|
+
out:
|
13
|
+
type: "null"
|
data/build.gradle
CHANGED
data/example/example.yml
CHANGED
@@ -8,7 +8,7 @@ filters:
|
|
8
8
|
default_to_timezone: "Asia/Tokyo"
|
9
9
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
10
10
|
columns:
|
11
|
-
- {name: "$.record.timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
11
|
+
- {name: "$.record.timestamp", type: long, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"], to_unit: ms}
|
12
12
|
- {name: "$.record.nested.nested[0].timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
13
13
|
out:
|
14
14
|
type: "null"
|
data/example/string.csv
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
2015-07-13,2015-07-13,2015-07-13,2015-07-13
|
2
|
+
2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC,2015-07-13 UTC
|
1
3
|
2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00,2015-07-13 00:00:00
|
2
4
|
2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC,2015-07-12 16:00:00 UTC
|
3
5
|
2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC,2015-07-12 16:00:00.1 UTC
|
data/example/string.yml
CHANGED
@@ -11,7 +11,7 @@ in:
|
|
11
11
|
filters:
|
12
12
|
- type: timestamp_format
|
13
13
|
default_from_timezone: "Asia/Taipei"
|
14
|
-
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
|
14
|
+
default_from_timestamp_format: ["%Y-%m-%d", "%Y-%m-%d %z", "%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"]
|
15
15
|
default_to_timezone: "Asia/Taipei"
|
16
16
|
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
17
17
|
columns:
|
data/example/string_java.yml
CHANGED
@@ -11,7 +11,7 @@ in:
|
|
11
11
|
filters:
|
12
12
|
- type: timestamp_format
|
13
13
|
default_from_timezone: "Asia/Taipei"
|
14
|
-
default_from_timestamp_format: ["yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
14
|
+
default_from_timestamp_format: ["yyyy-MM-dd", "yyyy-MM-dd z", "yyyy-MM-dd HH:mm:ss.S z", "yyyy-MM-dd HH:mm:ss z", "yyyy-MM-dd HH:mm:ss"]
|
15
15
|
default_to_timezone: "Asia/Taipei"
|
16
16
|
default_to_timestamp_format: "yyyy-MM-dd HH:mm:ss.SSS Z"
|
17
17
|
columns:
|
@@ -15,9 +15,9 @@ import org.joda.time.DateTimeZone;
|
|
15
15
|
import org.jruby.embed.ScriptingContainer;
|
16
16
|
import org.jruby.util.RubyDateFormat;
|
17
17
|
|
18
|
-
import java.text.SimpleDateFormat;
|
19
|
-
import java.util.Date;
|
20
18
|
import java.util.Locale;
|
19
|
+
import org.joda.time.format.DateTimeFormat;
|
20
|
+
import org.joda.time.format.DateTimeFormatter;
|
21
21
|
|
22
22
|
public class TimestampFormatter
|
23
23
|
{
|
@@ -44,7 +44,7 @@ public class TimestampFormatter
|
|
44
44
|
}
|
45
45
|
|
46
46
|
private final RubyDateFormat jrubyFormatter;
|
47
|
-
private final
|
47
|
+
private final DateTimeFormatter javaFormatter;
|
48
48
|
private final DateTimeZone toTimeZone;
|
49
49
|
|
50
50
|
public TimestampFormatter(PluginTask task, Optional<? extends TimestampColumnOption> columnOption)
|
@@ -67,8 +67,7 @@ public class TimestampFormatter
|
|
67
67
|
}
|
68
68
|
else {
|
69
69
|
this.jrubyFormatter = null;
|
70
|
-
this.javaFormatter =
|
71
|
-
javaFormatter.setTimeZone(toTimeZone.toTimeZone());
|
70
|
+
this.javaFormatter = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(toTimeZone);
|
72
71
|
}
|
73
72
|
}
|
74
73
|
|
@@ -108,6 +107,6 @@ public class TimestampFormatter
|
|
108
107
|
private String javaFormat(Timestamp value)
|
109
108
|
{
|
110
109
|
long milliSecond = value.getEpochSecond() * 1000 + value.getNano() / 1000000;
|
111
|
-
return javaFormatter.
|
110
|
+
return javaFormatter.print(milliSecond);
|
112
111
|
}
|
113
112
|
}
|
@@ -14,14 +14,16 @@ import org.embulk.spi.time.Timestamp;
|
|
14
14
|
import static org.embulk.spi.time.TimestampFormat.parseDateTimeZone;
|
15
15
|
|
16
16
|
import org.embulk.spi.time.TimestampParseException;
|
17
|
+
import org.joda.time.DateTime;
|
17
18
|
import org.joda.time.DateTimeZone;
|
19
|
+
import org.joda.time.format.DateTimeFormatter;
|
18
20
|
import org.jruby.embed.ScriptingContainer;
|
19
21
|
|
20
|
-
import java.text.ParseException;
|
21
|
-
import java.text.SimpleDateFormat;
|
22
22
|
import java.util.ArrayList;
|
23
23
|
import java.util.List;
|
24
|
-
import java.util.
|
24
|
+
import java.util.Locale;
|
25
|
+
|
26
|
+
import org.joda.time.format.DateTimeFormat;
|
25
27
|
|
26
28
|
public class TimestampParser {
|
27
29
|
public interface Task {
|
@@ -44,8 +46,8 @@ public class TimestampParser {
|
|
44
46
|
Optional<List<String>> getFromFormat();
|
45
47
|
}
|
46
48
|
|
47
|
-
private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList
|
48
|
-
private final List<
|
49
|
+
private final List<JRubyTimeParserHelper> jrubyParserList = new ArrayList<>();
|
50
|
+
private final List<DateTimeFormatter> javaParserList = new ArrayList<>();
|
49
51
|
private final DateTimeZone defaultFromTimeZone;
|
50
52
|
|
51
53
|
TimestampParser(PluginTask task) {
|
@@ -60,14 +62,14 @@ public class TimestampParser {
|
|
60
62
|
|
61
63
|
public TimestampParser(ScriptingContainer jruby, List<String> formatList, DateTimeZone defaultFromTimeZone) {
|
62
64
|
JRubyTimeParserHelperFactory helperFactory = (JRubyTimeParserHelperFactory) jruby.runScriptlet("Embulk::Java::TimeParserHelper::Factory.new");
|
65
|
+
|
63
66
|
// TODO get default current time from ExecTask.getExecTimestamp
|
64
67
|
for (String format : formatList) {
|
65
68
|
if (format.contains("%")) {
|
66
69
|
JRubyTimeParserHelper helper = (JRubyTimeParserHelper) helperFactory.newInstance(format, 1970, 1, 1, 0, 0, 0, 0); // TODO default time zone
|
67
70
|
this.jrubyParserList.add(helper);
|
68
71
|
} else {
|
69
|
-
|
70
|
-
parser.setTimeZone(defaultFromTimeZone.toTimeZone());
|
72
|
+
DateTimeFormatter parser = DateTimeFormat.forPattern(format).withLocale(Locale.ENGLISH).withZone(defaultFromTimeZone);
|
71
73
|
this.javaParserList.add(parser);
|
72
74
|
}
|
73
75
|
}
|
@@ -78,7 +80,7 @@ public class TimestampParser {
|
|
78
80
|
return defaultFromTimeZone;
|
79
81
|
}
|
80
82
|
|
81
|
-
public Timestamp parse(String text) throws TimestampParseException,
|
83
|
+
public Timestamp parse(String text) throws TimestampParseException, IllegalArgumentException {
|
82
84
|
if (!jrubyParserList.isEmpty()) {
|
83
85
|
return jrubyParse(text);
|
84
86
|
} else if (!javaParserList.isEmpty()) {
|
@@ -124,21 +126,22 @@ public class TimestampParser {
|
|
124
126
|
return Timestamp.ofEpochSecond(sec, usec * 1000);
|
125
127
|
}
|
126
128
|
|
127
|
-
private Timestamp javaParse(String text) throws
|
128
|
-
|
129
|
-
|
129
|
+
private Timestamp javaParse(String text) throws IllegalArgumentException {
|
130
|
+
DateTime dateTime = null;
|
131
|
+
IllegalArgumentException exception = null;
|
130
132
|
|
131
|
-
for (
|
133
|
+
for (DateTimeFormatter parser : javaParserList) {
|
132
134
|
try {
|
133
|
-
|
135
|
+
dateTime = parser.parseDateTime(text);
|
134
136
|
break;
|
135
|
-
} catch (
|
137
|
+
} catch (IllegalArgumentException ex) {
|
136
138
|
exception = ex;
|
137
139
|
}
|
138
140
|
}
|
139
|
-
if (
|
141
|
+
if (dateTime == null) {
|
140
142
|
throw exception;
|
141
143
|
}
|
144
|
+
long msec = dateTime.getMillis(); // NOTE: milli second resolution
|
142
145
|
|
143
146
|
long nanoAdjustment = msec * 1000000;
|
144
147
|
return Timestamp.ofEpochSecond(0, nanoAdjustment);
|
@@ -7,8 +7,6 @@ import org.embulk.spi.DataException;
|
|
7
7
|
import org.embulk.spi.time.Timestamp;
|
8
8
|
import org.embulk.spi.time.TimestampParseException;
|
9
9
|
|
10
|
-
import java.text.ParseException;
|
11
|
-
|
12
10
|
public class StringCast
|
13
11
|
{
|
14
12
|
private StringCast() {}
|
@@ -27,7 +25,7 @@ public class StringCast
|
|
27
25
|
catch (TimestampParseException ex) {
|
28
26
|
throw new DataException(buildErrorMessage(value), ex);
|
29
27
|
}
|
30
|
-
catch (
|
28
|
+
catch (IllegalArgumentException ex) {
|
31
29
|
throw new DataException(buildErrorMessage(value), ex);
|
32
30
|
}
|
33
31
|
}
|
@@ -40,7 +38,7 @@ public class StringCast
|
|
40
38
|
catch (TimestampParseException ex) {
|
41
39
|
throw new DataException(buildErrorMessage(value), ex);
|
42
40
|
}
|
43
|
-
catch (
|
41
|
+
catch (IllegalArgumentException ex) {
|
44
42
|
throw new DataException(buildErrorMessage(value), ex);
|
45
43
|
}
|
46
44
|
}
|
@@ -54,7 +52,7 @@ public class StringCast
|
|
54
52
|
catch (TimestampParseException ex) {
|
55
53
|
throw new DataException(buildErrorMessage(value), ex);
|
56
54
|
}
|
57
|
-
catch (
|
55
|
+
catch (IllegalArgumentException ex) {
|
58
56
|
throw new DataException(buildErrorMessage(value), ex);
|
59
57
|
}
|
60
58
|
}
|
@@ -68,7 +66,7 @@ public class StringCast
|
|
68
66
|
catch (TimestampParseException ex) {
|
69
67
|
throw new DataException(buildErrorMessage(value), ex);
|
70
68
|
}
|
71
|
-
catch (
|
69
|
+
catch (IllegalArgumentException ex) {
|
72
70
|
throw new DataException(buildErrorMessage(value), ex);
|
73
71
|
}
|
74
72
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-timestamp_format
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,8 @@ files:
|
|
52
52
|
- README.md
|
53
53
|
- bench/config_java.yml
|
54
54
|
- bench/config_jruby.yml
|
55
|
+
- bench/config_jruby_formatter.yml
|
56
|
+
- bench/config_jruby_parser.yml
|
55
57
|
- bench/gen_dummy.rb
|
56
58
|
- build.gradle
|
57
59
|
- config/checkstyle/checkstyle.xml
|
@@ -93,7 +95,7 @@ files:
|
|
93
95
|
- src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java
|
94
96
|
- src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java
|
95
97
|
- src/test/java/org/embulk/filter/timestamp_format/TestTimestampUnit.java
|
96
|
-
- classpath/embulk-filter-timestamp_format-0.1.
|
98
|
+
- classpath/embulk-filter-timestamp_format-0.1.7.jar
|
97
99
|
homepage: https://github.com/sonots/embulk-filter-timestamp_format
|
98
100
|
licenses:
|
99
101
|
- MIT
|