embulk-filter-timestamp_format 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +21 -20
- data/build.gradle +1 -2
- data/example/example.yml +13 -9
- data/example/json_example.jsonl +2 -0
- data/example/json_example.yml +14 -0
- data/example/string_example.yml +22 -0
- data/example/timestamp_example.yml +22 -0
- data/src/main/java/org/embulk/filter/timestamp_format/ColumnCaster.java +132 -0
- data/src/main/java/org/embulk/filter/timestamp_format/ColumnVisitorImpl.java +75 -172
- data/src/main/java/org/embulk/filter/timestamp_format/JsonCaster.java +54 -0
- data/src/main/java/org/embulk/filter/timestamp_format/JsonVisitor.java +119 -0
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampFormatFilterPlugin.java +76 -10
- data/src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java +1 -1
- data/src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java +59 -0
- data/src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java +32 -0
- metadata +12 -18
- data/example/example.jsonl +0 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7d7569b8adc1db79b292e271214f852fb080151b
|
4
|
+
data.tar.gz: df0c01a5893dc4a4bbb1f1228e3d72b031e59f93
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d81f4f2df4775444b5608432a2451158453f768cf6687188afae1169ea5eb15c699141d670987e73139c14c5ae8bbeb2122fbfcb6f73c89f5e98a425db8f2519
|
7
|
+
data.tar.gz: 5ecdc2f30763b7768fd1e9176c2c6b01fdafbd214f9191885e79de7f333303a9a7b86054a7c10acbf87ed458db5cab5d3b2c4871115198b829f65b8c36d855cc
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -2,20 +2,21 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/sonots/embulk-filter-timestamp_format.png?branch=master)](http://travis-ci.org/sonots/embulk-filter-timestamp_format)
|
4
4
|
|
5
|
-
A filter plugin for Embulk to change
|
5
|
+
A filter plugin for Embulk to change timestamp format
|
6
6
|
|
7
7
|
## Configuration
|
8
8
|
|
9
9
|
- **columns**: columns to retain (array of hash)
|
10
|
-
- **name**: name of column
|
11
|
-
- **
|
12
|
-
- **
|
13
|
-
- **
|
14
|
-
- **
|
15
|
-
- **
|
16
|
-
- **
|
17
|
-
- **
|
18
|
-
- **
|
10
|
+
- **name**: name of column (required)
|
11
|
+
- **type**: type to cast (string, timestamp, long (unixtimestamp), double (unixtimestamp), default is string)
|
12
|
+
- **from_format**: specify the format of the input string (array of strings, default is default_from_timestamp_format)
|
13
|
+
- **from_timezone**: specify the timezone of the input string (string, default is default_from_timezone)
|
14
|
+
- **to_format**: specify the format of the output string (string, default is default_to_timestamp_format)
|
15
|
+
- **to_timezone**: specify the timezone of the output string (string, default is default_to_timezone)
|
16
|
+
- **default_from_timestamp_format**: default timestamp format for the input string (array of strings, default is `["%Y-%m-%d %H:%M:%S.%N %z"]`)
|
17
|
+
- **default_from_timezone**: default timezone for the input string (string, default is `UTC`)
|
18
|
+
- **default_to_timestamp_format**: default timestamp format for the output string (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
19
|
+
- **default_to_timezone**: default timezone for the output string (string, default is `UTC`)
|
19
20
|
* **stop_on_invalid_record**: stop bulk load transaction if an invalid record is found (boolean, default is `false`)
|
20
21
|
|
21
22
|
## Example
|
@@ -23,8 +24,8 @@ A filter plugin for Embulk to change timesatmp format
|
|
23
24
|
Say example.jsonl is as follows (this is the typical format produced when exporting a BigQuery table):
|
24
25
|
|
25
26
|
```
|
26
|
-
{"timestamp":"2015-07-12 15:00:00 UTC","
|
27
|
-
{"timestamp":"2015-07-12 15:00:00.1 UTC","
|
27
|
+
{"timestamp":"2015-07-12 15:00:00 UTC","nested":{"timestamp":"2015-07-12 15:00:00 UTC"}}
|
28
|
+
{"timestamp":"2015-07-12 15:00:00.1 UTC","nested":{"timestamp":"2015-07-12 15:00:00.1 UTC"}}
|
28
29
|
```
|
29
30
|
|
30
31
|
```yaml
|
@@ -35,27 +36,28 @@ in:
|
|
35
36
|
type: jsonl
|
36
37
|
columns:
|
37
38
|
- {name: timestamp, type: string}
|
38
|
-
- {name:
|
39
|
+
- {name: nested, type: json}
|
39
40
|
filters:
|
40
41
|
- type: timestamp_format
|
41
42
|
default_to_timezone: "Asia/Tokyo"
|
42
|
-
|
43
|
+
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
43
44
|
columns:
|
44
45
|
- {name: timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
45
|
-
- {name:
|
46
|
+
- {name: $.nested.timestamp, from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
46
47
|
type: stdout
|
47
48
|
```
|
48
49
|
|
49
50
|
Output will be as:
|
50
51
|
|
51
52
|
```
|
52
|
-
{"timestamp":"2015-07-13 00:00:00.0","
|
53
|
-
{"timestamp":"2015-07-13 00:00:00.1","
|
53
|
+
{"timestamp":"2015-07-13 00:00:00.0","nested":{"timestamp":"2015-07-13 00:00:00.0"}}
|
54
|
+
{"timestamp":"2015-07-13 00:00:00.1","nested":{"timestamp":"2015-07-13 00:00:00.1"}}
|
54
55
|
```
|
55
56
|
|
57
|
+
See [./example](./example) for more examples.
|
58
|
+
|
56
59
|
## ToDo
|
57
60
|
|
58
|
-
* Currently, input must be a String column and output will be a String column. But, support Timestamp column (input / output)
|
59
61
|
* Write test
|
60
62
|
|
61
63
|
## Development
|
@@ -63,9 +65,8 @@ Output will be as:
|
|
63
65
|
Run example:
|
64
66
|
|
65
67
|
```
|
66
|
-
$ embulk gem install embulk-parser-jsonl
|
67
68
|
$ ./gradlew classpath
|
68
|
-
$ embulk
|
69
|
+
$ embulk preview -I lib example/example.yml
|
69
70
|
```
|
70
71
|
|
71
72
|
Run test:
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.5"
|
17
17
|
sourceCompatibility = 1.7
|
18
18
|
targetCompatibility = 1.7
|
19
19
|
|
@@ -72,7 +72,6 @@ Gem::Specification.new do |spec|
|
|
72
72
|
|
73
73
|
spec.add_development_dependency 'bundler', ['~> 1.0']
|
74
74
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
75
|
-
spec.add_development_dependency 'embulk-parser-jsonl'
|
76
75
|
end
|
77
76
|
/$)
|
78
77
|
}
|
data/example/example.yml
CHANGED
@@ -1,18 +1,22 @@
|
|
1
1
|
in:
|
2
2
|
type: file
|
3
|
-
path_prefix: example/
|
3
|
+
path_prefix: example/string_example.csv
|
4
4
|
parser:
|
5
|
-
type:
|
5
|
+
type: csv
|
6
6
|
columns:
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
- {name: string1, type: string}
|
8
|
+
- {name: string2, type: string}
|
9
|
+
- {name: string3, type: string}
|
10
|
+
- {name: string4, type: string}
|
10
11
|
filters:
|
11
12
|
- type: timestamp_format
|
12
13
|
default_to_timezone: "Asia/Tokyo"
|
13
|
-
|
14
|
+
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
15
|
+
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
|
14
16
|
columns:
|
15
|
-
- {name:
|
16
|
-
- {name:
|
17
|
+
- {name: string1}
|
18
|
+
- {name: string2, type: timestamp}
|
19
|
+
- {name: string3, type: long}
|
20
|
+
- {name: string4, type: double}
|
17
21
|
out:
|
18
|
-
type:
|
22
|
+
type: "null"
|
@@ -0,0 +1,2 @@
|
|
1
|
+
{"timestamp":"2015-07-12 15:00:00 UTC","nested":{"nested":[{"timestamp":"2015-07-12 15:00:00 UTC"}]},"ignore_nested":{"timestamp":"2015-07-12 15:00:00 UTC"}}
|
2
|
+
{"timestamp":"2015-07-12 15:00:00.1 UTC","nested":{"nested":[{"timestamp":"2015-07-12 15:00:00.1 UTC"}]},"ignore_nested":{"timestamp":"2015-07-12 15:00:00.1 UTC"}}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/json_example.jsonl
|
4
|
+
parser:
|
5
|
+
type: json
|
6
|
+
filters:
|
7
|
+
- type: timestamp_format
|
8
|
+
default_to_timezone: "Asia/Tokyo"
|
9
|
+
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
10
|
+
columns:
|
11
|
+
- {name: "$.record.timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
12
|
+
- {name: "$.record.nested.nested[0].timestamp", from_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]}
|
13
|
+
out:
|
14
|
+
type: "null"
|
@@ -0,0 +1,22 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/string_example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
columns:
|
7
|
+
- {name: string1, type: string}
|
8
|
+
- {name: string2, type: string}
|
9
|
+
- {name: string3, type: string}
|
10
|
+
- {name: string4, type: string}
|
11
|
+
filters:
|
12
|
+
- type: timestamp_format
|
13
|
+
default_to_timezone: "Asia/Tokyo"
|
14
|
+
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
15
|
+
default_from_timestamp_format: ["%Y-%m-%d %H:%M:%S.%N %z", "%Y-%m-%d %H:%M:%S %z"]
|
16
|
+
columns:
|
17
|
+
- {name: string1}
|
18
|
+
- {name: string2, type: timestamp}
|
19
|
+
- {name: string3, type: long}
|
20
|
+
- {name: string4, type: double}
|
21
|
+
out:
|
22
|
+
type: "null"
|
@@ -0,0 +1,22 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/timestamp_example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
default_timestamp_format: "%Y-%m-%d %H:%M:%S.%N %z"
|
7
|
+
columns:
|
8
|
+
- {name: timestamp1, type: timestamp}
|
9
|
+
- {name: timestamp2, type: timestamp}
|
10
|
+
- {name: timestamp3, type: timestamp}
|
11
|
+
- {name: timestamp4, type: timestamp}
|
12
|
+
filters:
|
13
|
+
- type: timestamp_format
|
14
|
+
default_to_timezone: "Asia/Tokyo"
|
15
|
+
default_to_timestamp_format: "%Y-%m-%d %H:%M:%S.%N"
|
16
|
+
columns:
|
17
|
+
- {name: timestamp1}
|
18
|
+
- {name: timestamp2, type: timestamp}
|
19
|
+
- {name: timestamp3, type: long}
|
20
|
+
- {name: timestamp4, type: double}
|
21
|
+
out:
|
22
|
+
type: "null"
|
@@ -0,0 +1,132 @@
|
|
1
|
+
package org.embulk.filter.timestamp_format;
|
2
|
+
|
3
|
+
import org.embulk.filter.timestamp_format.cast.StringCast;
|
4
|
+
import org.embulk.filter.timestamp_format.cast.TimestampCast;
|
5
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
|
6
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
|
7
|
+
import org.embulk.spi.Column;
|
8
|
+
import org.embulk.spi.Exec;
|
9
|
+
import org.embulk.spi.PageBuilder;
|
10
|
+
import org.embulk.spi.PageReader;
|
11
|
+
import org.embulk.spi.Schema;
|
12
|
+
import org.embulk.spi.time.Timestamp;
|
13
|
+
import org.embulk.spi.type.DoubleType;
|
14
|
+
import org.embulk.spi.type.LongType;
|
15
|
+
import org.embulk.spi.type.StringType;
|
16
|
+
import org.embulk.spi.type.TimestampType;
|
17
|
+
import org.embulk.spi.type.Type;
|
18
|
+
import org.joda.time.DateTimeZone;
|
19
|
+
import org.msgpack.value.Value;
|
20
|
+
import org.slf4j.Logger;
|
21
|
+
|
22
|
+
import java.util.HashMap;
|
23
|
+
import java.util.List;
|
24
|
+
|
25
|
+
public class ColumnCaster
|
26
|
+
{
|
27
|
+
private static final Logger logger = Exec.getLogger(TimestampFormatFilterPlugin.class);
|
28
|
+
private final PluginTask task;
|
29
|
+
private final Schema inputSchema;
|
30
|
+
private final Schema outputSchema;
|
31
|
+
private final PageReader pageReader;
|
32
|
+
private final PageBuilder pageBuilder;
|
33
|
+
private final HashMap<String, TimestampParser> timestampParserMap = new HashMap<>();
|
34
|
+
private final HashMap<String, TimestampFormatter> timestampFormatterMap = new HashMap<>();
|
35
|
+
private final JsonVisitor jsonVisitor;
|
36
|
+
|
37
|
+
ColumnCaster(PluginTask task, Schema inputSchema, Schema outputSchema, PageReader pageReader, PageBuilder pageBuilder)
|
38
|
+
{
|
39
|
+
this.task = task;
|
40
|
+
this.inputSchema = inputSchema;
|
41
|
+
this.outputSchema = outputSchema;
|
42
|
+
this.pageReader = pageReader;
|
43
|
+
this.pageBuilder = pageBuilder;
|
44
|
+
|
45
|
+
buildTimestampParserMap();
|
46
|
+
buildTimestampFormatterMap();
|
47
|
+
|
48
|
+
JsonCaster jsonCaster = new JsonCaster(task, timestampParserMap, timestampFormatterMap);
|
49
|
+
this.jsonVisitor = new JsonVisitor(task, jsonCaster);
|
50
|
+
}
|
51
|
+
|
52
|
+
private void buildTimestampParserMap()
|
53
|
+
{
|
54
|
+
// columnName or jsonPath => TimestampParser
|
55
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
56
|
+
TimestampParser parser = getTimestampParser(columnConfig, task);
|
57
|
+
this.timestampParserMap.put(columnConfig.getName(), parser);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
private void buildTimestampFormatterMap()
|
62
|
+
{
|
63
|
+
// columnName or jsonPath => TimestampFormatter
|
64
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
65
|
+
TimestampFormatter parser = getTimestampFormatter(columnConfig, task);
|
66
|
+
this.timestampFormatterMap.put(columnConfig.getName(), parser);
|
67
|
+
}
|
68
|
+
}
|
69
|
+
|
70
|
+
private TimestampParser getTimestampParser(ColumnConfig columnConfig, PluginTask task)
|
71
|
+
{
|
72
|
+
DateTimeZone timezone = columnConfig.getFromTimeZone().or(task.getDefaultFromTimeZone());
|
73
|
+
List<String> formatList = columnConfig.getFromFormat().or(task.getDefaultFromTimestampFormat());
|
74
|
+
return new TimestampParser(task.getJRuby(), formatList, timezone);
|
75
|
+
}
|
76
|
+
|
77
|
+
private TimestampFormatter getTimestampFormatter(ColumnConfig columnConfig, PluginTask task)
|
78
|
+
{
|
79
|
+
String format = columnConfig.getToFormat().or(task.getDefaultToTimestampFormat());
|
80
|
+
DateTimeZone timezone = columnConfig.getToTimeZone().or(task.getDefaultToTimeZone());
|
81
|
+
return new TimestampFormatter(task.getJRuby(), format, timezone);
|
82
|
+
}
|
83
|
+
|
84
|
+
public void setFromString(Column outputColumn, String value)
|
85
|
+
{
|
86
|
+
Type outputType = outputColumn.getType();
|
87
|
+
TimestampParser timestampParser = timestampParserMap.get(outputColumn.getName());
|
88
|
+
if (outputType instanceof StringType) {
|
89
|
+
TimestampFormatter timestampFormatter = timestampFormatterMap.get(outputColumn.getName());
|
90
|
+
pageBuilder.setString(outputColumn, StringCast.asString(value, timestampParser, timestampFormatter));
|
91
|
+
}
|
92
|
+
else if (outputType instanceof TimestampType) {
|
93
|
+
pageBuilder.setTimestamp(outputColumn, StringCast.asTimestamp(value, timestampParser));
|
94
|
+
}
|
95
|
+
else if (outputType instanceof LongType) {
|
96
|
+
pageBuilder.setLong(outputColumn, StringCast.asLong(value, timestampParser));
|
97
|
+
}
|
98
|
+
else if (outputType instanceof DoubleType) {
|
99
|
+
pageBuilder.setDouble(outputColumn, StringCast.asDouble(value, timestampParser));
|
100
|
+
}
|
101
|
+
else {
|
102
|
+
assert false;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
public void setFromTimestamp(Column outputColumn, Timestamp value)
|
107
|
+
{
|
108
|
+
Type outputType = outputColumn.getType();
|
109
|
+
if (outputType instanceof StringType) {
|
110
|
+
TimestampFormatter timestampFormatter = timestampFormatterMap.get(outputColumn.getName());
|
111
|
+
pageBuilder.setString(outputColumn, TimestampCast.asString(value, timestampFormatter));
|
112
|
+
}
|
113
|
+
else if (outputType instanceof TimestampType) {
|
114
|
+
pageBuilder.setTimestamp(outputColumn, value);
|
115
|
+
}
|
116
|
+
else if (outputType instanceof LongType) {
|
117
|
+
pageBuilder.setLong(outputColumn, TimestampCast.asLong(value));
|
118
|
+
}
|
119
|
+
else if (outputType instanceof DoubleType) {
|
120
|
+
pageBuilder.setDouble(outputColumn, TimestampCast.asDouble(value));
|
121
|
+
}
|
122
|
+
else {
|
123
|
+
assert false;
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
public void setFromJson(Column outputColumn, Value value)
|
128
|
+
{
|
129
|
+
String jsonPath = new StringBuilder("$.").append(outputColumn.getName()).toString();
|
130
|
+
pageBuilder.setJson(outputColumn, jsonVisitor.visit(jsonPath, value));
|
131
|
+
}
|
132
|
+
}
|
@@ -1,251 +1,154 @@
|
|
1
1
|
package org.embulk.filter.timestamp_format;
|
2
2
|
|
3
|
-
import
|
3
|
+
import org.embulk.spi.DataException;
|
4
4
|
import org.embulk.spi.PageReader;
|
5
|
-
import org.
|
6
|
-
import org.msgpack.value.MapValue;
|
7
|
-
import org.msgpack.value.Value;
|
8
|
-
import org.msgpack.value.ValueFactory;
|
5
|
+
import org.embulk.spi.Schema;
|
9
6
|
|
10
|
-
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
|
11
7
|
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
|
12
8
|
|
13
9
|
import org.embulk.spi.Column;
|
14
10
|
import org.embulk.spi.ColumnVisitor;
|
15
11
|
import org.embulk.spi.Exec;
|
16
12
|
import org.embulk.spi.PageBuilder;
|
17
|
-
import org.embulk.spi.time.Timestamp;
|
18
|
-
import org.embulk.spi.time.TimestampParseException;
|
19
|
-
import org.joda.time.DateTimeZone;
|
20
13
|
import org.slf4j.Logger;
|
21
14
|
|
22
15
|
import java.util.HashMap;
|
23
|
-
import java.util.HashSet;
|
24
|
-
import java.util.List;
|
25
|
-
import java.util.Map;
|
26
|
-
import java.util.Objects;
|
27
16
|
|
28
17
|
public class ColumnVisitorImpl
|
29
18
|
implements ColumnVisitor
|
30
19
|
{
|
31
20
|
private static final Logger logger = Exec.getLogger(TimestampFormatFilterPlugin.class);
|
32
21
|
private final PluginTask task;
|
22
|
+
private final Schema inputSchema;
|
23
|
+
private final Schema outputSchema;
|
33
24
|
private final PageReader pageReader;
|
34
25
|
private final PageBuilder pageBuilder;
|
35
|
-
private final HashMap<String,
|
36
|
-
private final
|
37
|
-
private final HashSet<String> shouldVisitRecursivelySet = new HashSet<String>();
|
26
|
+
private final HashMap<String, Column> outputColumnMap = new HashMap<>();
|
27
|
+
private final ColumnCaster columnCaster;
|
38
28
|
|
39
|
-
ColumnVisitorImpl(PluginTask task,
|
29
|
+
ColumnVisitorImpl(PluginTask task, Schema inputSchema, Schema outputSchema,
|
30
|
+
PageReader pageReader, PageBuilder pageBuilder)
|
40
31
|
{
|
41
|
-
this.task
|
42
|
-
this.
|
43
|
-
this.
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
32
|
+
this.task = task;
|
33
|
+
this.inputSchema = inputSchema;
|
34
|
+
this.outputSchema = outputSchema;
|
35
|
+
this.pageReader = pageReader;
|
36
|
+
this.pageBuilder = pageBuilder;
|
37
|
+
|
38
|
+
buildOutputColumnMap();
|
39
|
+
this.columnCaster = new ColumnCaster(task, inputSchema, outputSchema, pageReader, pageBuilder);
|
48
40
|
}
|
49
41
|
|
50
|
-
private void
|
42
|
+
private void buildOutputColumnMap()
|
51
43
|
{
|
52
|
-
// columnName
|
53
|
-
for (
|
54
|
-
|
55
|
-
this.timestampParserMap.put(columnConfig.getName(), parser); // NOTE: value would be null
|
44
|
+
// columnName => outputColumn
|
45
|
+
for (Column column : outputSchema.getColumns()) {
|
46
|
+
this.outputColumnMap.put(column.getName(), column);
|
56
47
|
}
|
57
48
|
}
|
58
49
|
|
59
|
-
private
|
60
|
-
{
|
61
|
-
DateTimeZone timezone = columnConfig.getFromTimeZone().or(task.getDefaultFromTimeZone());
|
62
|
-
List<String> formatList = columnConfig.getFromFormat().or(task.getDefaultFromTimestampFormat());
|
63
|
-
return new TimestampParser(task.getJRuby(), formatList, timezone);
|
64
|
-
}
|
65
|
-
|
66
|
-
private void buildTimestampFormatterMap()
|
50
|
+
private interface PageBuildable
|
67
51
|
{
|
68
|
-
|
69
|
-
for (ColumnConfig columnConfig : task.getColumns()) {
|
70
|
-
TimestampFormatter parser = getTimestampFormatter(columnConfig, task);
|
71
|
-
this.timestampFormatterMap.put(columnConfig.getName(), parser); // NOTE: value would be null
|
72
|
-
}
|
52
|
+
public void run() throws DataException;
|
73
53
|
}
|
74
54
|
|
75
|
-
private
|
55
|
+
private void withStopOnInvalidRecord(final PageBuildable op,
|
56
|
+
final Column inputColumn, final Column outputColumn) throws DataException
|
76
57
|
{
|
77
|
-
|
78
|
-
|
79
|
-
return new TimestampFormatter(task.getJRuby(), format, timezone);
|
80
|
-
}
|
81
|
-
|
82
|
-
|
83
|
-
private void buildShouldVisitRecursivelySet()
|
84
|
-
{
|
85
|
-
// json partial path => Boolean to avoid unnecessary type: json visit
|
86
|
-
for (ColumnConfig columnConfig : task.getColumns()) {
|
87
|
-
String name = columnConfig.getName();
|
88
|
-
if (!name.startsWith("$.")) {
|
89
|
-
continue;
|
90
|
-
}
|
91
|
-
String[] parts = name.split("\\.");
|
92
|
-
StringBuilder partialPath = new StringBuilder("$");
|
93
|
-
for (int i = 1; i < parts.length; i++) {
|
94
|
-
if (parts[i].contains("[")) {
|
95
|
-
String[] arrayParts = parts[i].split("\\[");
|
96
|
-
partialPath.append(".").append(arrayParts[0]);
|
97
|
-
this.shouldVisitRecursivelySet.add(partialPath.toString());
|
98
|
-
for (int j = 1; j < arrayParts.length; j++) {
|
99
|
-
partialPath.append("[").append(arrayParts[j]);
|
100
|
-
this.shouldVisitRecursivelySet.add(partialPath.toString());
|
101
|
-
}
|
102
|
-
}
|
103
|
-
else {
|
104
|
-
partialPath.append(".").append(parts[i]);
|
105
|
-
this.shouldVisitRecursivelySet.add(partialPath.toString());
|
106
|
-
}
|
107
|
-
}
|
108
|
-
}
|
109
|
-
}
|
110
|
-
|
111
|
-
private boolean shouldVisitRecursively(String name)
|
112
|
-
{
|
113
|
-
return shouldVisitRecursivelySet.contains(name);
|
114
|
-
}
|
115
|
-
|
116
|
-
private Value formatTimestampStringRecursively(PluginTask task, String path, Value value)
|
117
|
-
throws TimestampParseException
|
118
|
-
{
|
119
|
-
if (!shouldVisitRecursively(path)) {
|
120
|
-
return value;
|
121
|
-
}
|
122
|
-
if (value.isArrayValue()) {
|
123
|
-
ArrayValue arrayValue = value.asArrayValue();
|
124
|
-
int size = arrayValue.size();
|
125
|
-
Value[] newValue = new Value[size];
|
126
|
-
for (int i = 0; i < size; i++) {
|
127
|
-
String k = new StringBuilder(path).append("[").append(Integer.toString(i)).append("]").toString();
|
128
|
-
Value v = arrayValue.get(i);
|
129
|
-
newValue[i] = formatTimestampStringRecursively(task, k, v);
|
130
|
-
}
|
131
|
-
return ValueFactory.newArray(newValue, true);
|
132
|
-
}
|
133
|
-
else if (value.isMapValue()) {
|
134
|
-
MapValue mapValue = value.asMapValue();
|
135
|
-
int size = mapValue.size() * 2;
|
136
|
-
Value[] newValue = new Value[size];
|
137
|
-
int i = 0;
|
138
|
-
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
139
|
-
Value k = entry.getKey();
|
140
|
-
Value v = entry.getValue();
|
141
|
-
String newPath = new StringBuilder(path).append(".").append(k.asStringValue().asString()).toString();
|
142
|
-
Value r = formatTimestampStringRecursively(task, newPath, v);
|
143
|
-
newValue[i++] = k;
|
144
|
-
newValue[i++] = r;
|
145
|
-
}
|
146
|
-
return ValueFactory.newMap(newValue, true);
|
147
|
-
}
|
148
|
-
else if (value.isStringValue()) {
|
149
|
-
String stringValue = value.asStringValue().asString();
|
150
|
-
String newValue = formatTimestampString(task, path, stringValue);
|
151
|
-
return (Objects.equals(newValue, stringValue)) ? value : ValueFactory.newString(newValue);
|
58
|
+
if (pageReader.isNull(inputColumn)) {
|
59
|
+
pageBuilder.setNull(outputColumn);
|
152
60
|
}
|
153
61
|
else {
|
154
|
-
return value;
|
155
|
-
}
|
156
|
-
}
|
157
|
-
|
158
|
-
private String formatTimestampString(PluginTask task, String name, String value)
|
159
|
-
throws TimestampParseException
|
160
|
-
{
|
161
|
-
TimestampParser parser = timestampParserMap.get(name);
|
162
|
-
TimestampFormatter formatter = timestampFormatterMap.get(name);
|
163
|
-
if (formatter == null || parser == null) {
|
164
|
-
return value;
|
165
|
-
}
|
166
|
-
try {
|
167
|
-
Timestamp timestamp = parser.parse(value);
|
168
|
-
return formatter.format(timestamp);
|
169
|
-
}
|
170
|
-
catch (TimestampParseException ex) {
|
171
62
|
if (task.getStopOnInvalidRecord()) {
|
172
|
-
|
63
|
+
op.run();
|
173
64
|
}
|
174
65
|
else {
|
175
|
-
|
176
|
-
|
66
|
+
try {
|
67
|
+
op.run();
|
68
|
+
}
|
69
|
+
catch (final DataException ex) {
|
70
|
+
logger.warn(ex.getMessage());
|
71
|
+
pageBuilder.setNull(outputColumn);
|
72
|
+
}
|
177
73
|
}
|
178
74
|
}
|
179
75
|
}
|
180
76
|
|
181
|
-
|
182
77
|
@Override
|
183
|
-
public void booleanColumn(Column
|
78
|
+
public void booleanColumn(final Column inputColumn)
|
184
79
|
{
|
185
|
-
if (pageReader.isNull(
|
186
|
-
pageBuilder.setNull(
|
80
|
+
if (pageReader.isNull(inputColumn)) {
|
81
|
+
pageBuilder.setNull(inputColumn);
|
187
82
|
}
|
188
83
|
else {
|
189
|
-
pageBuilder.setBoolean(
|
84
|
+
pageBuilder.setBoolean(inputColumn, pageReader.getBoolean(inputColumn));
|
190
85
|
}
|
191
86
|
}
|
192
87
|
|
193
88
|
@Override
|
194
|
-
public void longColumn(Column
|
89
|
+
public void longColumn(final Column inputColumn)
|
195
90
|
{
|
196
|
-
if (pageReader.isNull(
|
197
|
-
pageBuilder.setNull(
|
91
|
+
if (pageReader.isNull(inputColumn)) {
|
92
|
+
pageBuilder.setNull(inputColumn);
|
198
93
|
}
|
199
94
|
else {
|
200
|
-
pageBuilder.setLong(
|
95
|
+
pageBuilder.setLong(inputColumn, pageReader.getLong(inputColumn));
|
201
96
|
}
|
202
97
|
}
|
203
98
|
|
204
99
|
@Override
|
205
|
-
public void doubleColumn(Column
|
100
|
+
public void doubleColumn(final Column inputColumn)
|
206
101
|
{
|
207
|
-
if (pageReader.isNull(
|
208
|
-
pageBuilder.setNull(
|
102
|
+
if (pageReader.isNull(inputColumn)) {
|
103
|
+
pageBuilder.setNull(inputColumn);
|
209
104
|
}
|
210
105
|
else {
|
211
|
-
pageBuilder.setDouble(
|
106
|
+
pageBuilder.setDouble(inputColumn, pageReader.getDouble(inputColumn));
|
212
107
|
}
|
213
108
|
}
|
214
109
|
|
215
110
|
@Override
|
216
|
-
public void stringColumn(Column
|
111
|
+
public void stringColumn(final Column inputColumn)
|
217
112
|
{
|
218
|
-
if (pageReader.isNull(
|
219
|
-
pageBuilder.setNull(
|
113
|
+
if (pageReader.isNull(inputColumn)) {
|
114
|
+
pageBuilder.setNull(inputColumn);
|
220
115
|
return;
|
221
116
|
}
|
222
|
-
|
223
|
-
|
224
|
-
|
117
|
+
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
118
|
+
PageBuildable op = new PageBuildable() {
|
119
|
+
public void run() throws DataException
|
120
|
+
{
|
121
|
+
columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn));
|
122
|
+
}
|
123
|
+
};
|
124
|
+
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
225
125
|
}
|
226
126
|
|
227
127
|
@Override
|
228
|
-
public void
|
128
|
+
public void timestampColumn(final Column inputColumn)
|
229
129
|
{
|
230
|
-
if (pageReader.isNull(
|
231
|
-
pageBuilder.setNull(
|
232
|
-
|
233
|
-
else {
|
234
|
-
String path = new StringBuilder("$.").append(column.getName()).toString();
|
235
|
-
Value value = pageReader.getJson(column);
|
236
|
-
Value formatted = formatTimestampStringRecursively(task, path, value);
|
237
|
-
pageBuilder.setJson(column, formatted);
|
130
|
+
if (pageReader.isNull(inputColumn)) {
|
131
|
+
pageBuilder.setNull(inputColumn);
|
132
|
+
return;
|
238
133
|
}
|
134
|
+
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
135
|
+
PageBuildable op = new PageBuildable() {
|
136
|
+
public void run() throws DataException
|
137
|
+
{
|
138
|
+
columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
|
139
|
+
}
|
140
|
+
};
|
141
|
+
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
239
142
|
}
|
240
143
|
|
241
144
|
@Override
|
242
|
-
public void
|
145
|
+
public void jsonColumn(final Column inputColumn)
|
243
146
|
{
|
244
|
-
if (pageReader.isNull(
|
245
|
-
pageBuilder.setNull(
|
246
|
-
|
247
|
-
else {
|
248
|
-
pageBuilder.setTimestamp(column, pageReader.getTimestamp(column));
|
147
|
+
if (pageReader.isNull(inputColumn)) {
|
148
|
+
pageBuilder.setNull(inputColumn);
|
149
|
+
return;
|
249
150
|
}
|
151
|
+
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
152
|
+
columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
|
250
153
|
}
|
251
154
|
}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
package org.embulk.filter.timestamp_format;
|
2
|
+
|
3
|
+
import org.embulk.filter.timestamp_format.cast.StringCast;
|
4
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
|
5
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.embulk.spi.type.DoubleType;
|
8
|
+
import org.embulk.spi.type.LongType;
|
9
|
+
import org.embulk.spi.type.StringType;
|
10
|
+
import org.embulk.spi.type.Type;
|
11
|
+
import org.msgpack.value.StringValue;
|
12
|
+
import org.msgpack.value.Value;
|
13
|
+
import org.msgpack.value.ValueFactory;
|
14
|
+
|
15
|
+
import org.slf4j.Logger;
|
16
|
+
|
17
|
+
import java.util.HashMap;
|
18
|
+
|
19
|
+
class JsonCaster
|
20
|
+
{
|
21
|
+
private static final Logger logger = Exec.getLogger(TimestampFormatFilterPlugin.class);
|
22
|
+
private final PluginTask task;
|
23
|
+
private final HashMap<String, TimestampParser> timestampParserMap;
|
24
|
+
private final HashMap<String, TimestampFormatter> timestampFormatterMap;
|
25
|
+
|
26
|
+
JsonCaster(PluginTask task,
|
27
|
+
HashMap<String, TimestampParser> timestampParserMap,
|
28
|
+
HashMap<String, TimestampFormatter> timestampFormatterMap)
|
29
|
+
{
|
30
|
+
this.task = task;
|
31
|
+
this.timestampParserMap = timestampParserMap;
|
32
|
+
this.timestampFormatterMap = timestampFormatterMap;
|
33
|
+
}
|
34
|
+
|
35
|
+
public Value fromString(ColumnConfig columnConfig, StringValue value)
|
36
|
+
{
|
37
|
+
Type outputType = columnConfig.getType();
|
38
|
+
TimestampParser parser = timestampParserMap.get(columnConfig.getName());
|
39
|
+
if (outputType instanceof StringType) {
|
40
|
+
TimestampFormatter formatter = timestampFormatterMap.get(columnConfig.getName());
|
41
|
+
return ValueFactory.newString(StringCast.asString(value.asString(), parser, formatter));
|
42
|
+
}
|
43
|
+
else if (outputType instanceof LongType) {
|
44
|
+
return ValueFactory.newInteger(StringCast.asLong(value.asString(), parser));
|
45
|
+
}
|
46
|
+
else if (outputType instanceof DoubleType) {
|
47
|
+
return ValueFactory.newFloat(StringCast.asDouble(value.asString(), parser));
|
48
|
+
}
|
49
|
+
else {
|
50
|
+
assert false;
|
51
|
+
return null;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
}
|
@@ -0,0 +1,119 @@
|
|
1
|
+
package org.embulk.filter.timestamp_format;
|
2
|
+
|
3
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.ColumnConfig;
|
4
|
+
import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask;
|
5
|
+
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.msgpack.value.ArrayValue;
|
8
|
+
import org.msgpack.value.MapValue;
|
9
|
+
import org.msgpack.value.Value;
|
10
|
+
import org.msgpack.value.ValueFactory;
|
11
|
+
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
|
14
|
+
import java.util.HashMap;
|
15
|
+
import java.util.HashSet;
|
16
|
+
import java.util.Map;
|
17
|
+
|
18
|
+
public class JsonVisitor
|
19
|
+
{
|
20
|
+
private static final Logger logger = Exec.getLogger(TimestampFormatFilterPlugin.class);
|
21
|
+
private final PluginTask task;
|
22
|
+
private final JsonCaster jsonCaster;
|
23
|
+
private final HashMap<String, ColumnConfig> jsonPathColumnConfigMap = new HashMap<>();
|
24
|
+
private final HashSet<String> shouldVisitSet = new HashSet<>();
|
25
|
+
|
26
|
+
JsonVisitor(PluginTask task, JsonCaster jsonCaster)
|
27
|
+
{
|
28
|
+
this.task = task;
|
29
|
+
this.jsonCaster = jsonCaster;
|
30
|
+
|
31
|
+
buildJsonPathColumnConfigMap();
|
32
|
+
buildShouldVisitSet();
|
33
|
+
}
|
34
|
+
|
35
|
+
private void buildJsonPathColumnConfigMap()
|
36
|
+
{
|
37
|
+
// json path => ColumnConfig
|
38
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
39
|
+
String name = columnConfig.getName();
|
40
|
+
if (!name.startsWith("$.")) {
|
41
|
+
continue;
|
42
|
+
}
|
43
|
+
this.jsonPathColumnConfigMap.put(name, columnConfig);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
private void buildShouldVisitSet()
|
48
|
+
{
|
49
|
+
// json partial path => Boolean to avoid unnecessary type: json visit
|
50
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
51
|
+
String name = columnConfig.getName();
|
52
|
+
if (!name.startsWith("$.")) {
|
53
|
+
continue;
|
54
|
+
}
|
55
|
+
String[] parts = name.split("\\.");
|
56
|
+
StringBuilder partialPath = new StringBuilder("$");
|
57
|
+
for (int i = 1; i < parts.length; i++) {
|
58
|
+
if (parts[i].contains("[")) {
|
59
|
+
String[] arrayParts = parts[i].split("\\[");
|
60
|
+
partialPath.append(".").append(arrayParts[0]);
|
61
|
+
this.shouldVisitSet.add(partialPath.toString());
|
62
|
+
for (int j = 1; j < arrayParts.length; j++) {
|
63
|
+
partialPath.append("[").append(arrayParts[j]);
|
64
|
+
this.shouldVisitSet.add(partialPath.toString());
|
65
|
+
}
|
66
|
+
}
|
67
|
+
else {
|
68
|
+
partialPath.append(".").append(parts[i]);
|
69
|
+
this.shouldVisitSet.add(partialPath.toString());
|
70
|
+
}
|
71
|
+
}
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
private boolean shouldVisit(String jsonPath)
|
76
|
+
{
|
77
|
+
return shouldVisitSet.contains(jsonPath);
|
78
|
+
}
|
79
|
+
|
80
|
+
public Value visit(String jsonPath, Value value)
|
81
|
+
{
|
82
|
+
if (!shouldVisit(jsonPath)) {
|
83
|
+
return value;
|
84
|
+
}
|
85
|
+
if (value.isArrayValue()) {
|
86
|
+
ArrayValue arrayValue = value.asArrayValue();
|
87
|
+
int size = arrayValue.size();
|
88
|
+
Value[] newValue = new Value[size];
|
89
|
+
for (int i = 0; i < size; i++) {
|
90
|
+
String k = new StringBuilder(jsonPath).append("[").append(Integer.toString(i)).append("]").toString();
|
91
|
+
Value v = arrayValue.get(i);
|
92
|
+
newValue[i] = visit(k, v);
|
93
|
+
}
|
94
|
+
return ValueFactory.newArray(newValue, true);
|
95
|
+
}
|
96
|
+
else if (value.isMapValue()) {
|
97
|
+
MapValue mapValue = value.asMapValue();
|
98
|
+
int size = mapValue.size() * 2;
|
99
|
+
Value[] newValue = new Value[size];
|
100
|
+
int i = 0;
|
101
|
+
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
102
|
+
Value k = entry.getKey();
|
103
|
+
Value v = entry.getValue();
|
104
|
+
String newPath = new StringBuilder(jsonPath).append(".").append(k.asStringValue().asString()).toString();
|
105
|
+
Value r = visit(newPath, v);
|
106
|
+
newValue[i++] = k;
|
107
|
+
newValue[i++] = r;
|
108
|
+
}
|
109
|
+
return ValueFactory.newMap(newValue, true);
|
110
|
+
}
|
111
|
+
else if (value.isStringValue()) {
|
112
|
+
ColumnConfig columnConfig = jsonPathColumnConfigMap.get(jsonPath);
|
113
|
+
return jsonCaster.fromString(columnConfig, value.asStringValue());
|
114
|
+
}
|
115
|
+
else {
|
116
|
+
return value;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
@@ -1,12 +1,15 @@
|
|
1
1
|
package org.embulk.filter.timestamp_format;
|
2
2
|
|
3
|
+
import com.google.common.collect.ImmutableList;
|
3
4
|
import org.embulk.config.Config;
|
4
5
|
import org.embulk.config.ConfigDefault;
|
6
|
+
import org.embulk.config.ConfigException;
|
5
7
|
import org.embulk.config.ConfigInject;
|
6
8
|
import org.embulk.config.ConfigSource;
|
7
9
|
import org.embulk.config.Task;
|
8
10
|
import org.embulk.config.TaskSource;
|
9
11
|
|
12
|
+
import org.embulk.spi.Column;
|
10
13
|
import org.embulk.spi.Exec;
|
11
14
|
import org.embulk.spi.FilterPlugin;
|
12
15
|
import org.embulk.spi.Page;
|
@@ -15,6 +18,11 @@ import org.embulk.spi.PageOutput;
|
|
15
18
|
import org.embulk.spi.PageReader;
|
16
19
|
import org.embulk.spi.Schema;
|
17
20
|
|
21
|
+
import org.embulk.spi.type.DoubleType;
|
22
|
+
import org.embulk.spi.type.LongType;
|
23
|
+
import org.embulk.spi.type.StringType;
|
24
|
+
import org.embulk.spi.type.TimestampType;
|
25
|
+
import org.embulk.spi.type.Type;
|
18
26
|
import org.jruby.embed.ScriptingContainer;
|
19
27
|
import org.slf4j.Logger;
|
20
28
|
|
@@ -24,20 +32,22 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
|
|
24
32
|
{
|
25
33
|
private static final Logger logger = Exec.getLogger(TimestampFormatFilterPlugin.class);
|
26
34
|
|
27
|
-
public TimestampFormatFilterPlugin()
|
28
|
-
{
|
29
|
-
}
|
35
|
+
public TimestampFormatFilterPlugin() {}
|
30
36
|
|
31
37
|
// NOTE: This is not spi.ColumnConfig
|
32
|
-
|
38
|
+
interface ColumnConfig extends Task,
|
33
39
|
TimestampParser.TimestampColumnOption, TimestampFormatter.TimestampColumnOption
|
34
40
|
{
|
35
41
|
@Config("name")
|
36
42
|
String getName();
|
43
|
+
|
44
|
+
@Config("type")
|
45
|
+
@ConfigDefault("\"string\"")
|
46
|
+
Type getType();
|
37
47
|
}
|
38
48
|
|
39
|
-
|
40
|
-
|
49
|
+
interface PluginTask extends Task,
|
50
|
+
TimestampParser.Task, TimestampFormatter.Task
|
41
51
|
{
|
42
52
|
@Config("columns")
|
43
53
|
@ConfigDefault("[]")
|
@@ -57,12 +67,20 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
|
|
57
67
|
{
|
58
68
|
PluginTask task = config.loadConfig(PluginTask.class);
|
59
69
|
|
70
|
+
configure(task, inputSchema);
|
71
|
+
Schema outputSchema = buildOuputSchema(task, inputSchema);
|
72
|
+
control.run(task.dump(), outputSchema);
|
73
|
+
}
|
74
|
+
|
75
|
+
private void configure(PluginTask task, Schema inputSchema)
|
76
|
+
{
|
60
77
|
List<ColumnConfig> columns = task.getColumns();
|
78
|
+
|
61
79
|
// throw if column does not exist
|
62
80
|
for (ColumnConfig columnConfig : columns) {
|
63
81
|
String name = columnConfig.getName();
|
64
82
|
if (name.startsWith("$.")) {
|
65
|
-
String firstName = name.split("\\.", 3)[1];
|
83
|
+
String firstName = name.split("\\.", 3)[1]; // check only top level column name
|
66
84
|
inputSchema.lookupColumn(firstName);
|
67
85
|
}
|
68
86
|
else {
|
@@ -70,7 +88,55 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
|
|
70
88
|
}
|
71
89
|
}
|
72
90
|
|
73
|
-
|
91
|
+
// throw if column type is not string or timestamp
|
92
|
+
for (ColumnConfig columnConfig : columns) {
|
93
|
+
Type type = columnConfig.getType();
|
94
|
+
boolean acceptable = false;
|
95
|
+
if (type instanceof StringType) {
|
96
|
+
continue;
|
97
|
+
}
|
98
|
+
else if (type instanceof TimestampType) {
|
99
|
+
continue;
|
100
|
+
}
|
101
|
+
else if (type instanceof LongType) {
|
102
|
+
continue;
|
103
|
+
}
|
104
|
+
else if (type instanceof DoubleType) {
|
105
|
+
continue;
|
106
|
+
}
|
107
|
+
else {
|
108
|
+
throw new ConfigException("column type must be string, timestamp, long, or double");
|
109
|
+
}
|
110
|
+
}
|
111
|
+
}
|
112
|
+
|
113
|
+
private Schema buildOuputSchema(final PluginTask task, final Schema inputSchema)
|
114
|
+
{
|
115
|
+
List<ColumnConfig> columnConfigs = task.getColumns();
|
116
|
+
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
117
|
+
int i = 0;
|
118
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
119
|
+
String name = inputColumn.getName();
|
120
|
+
Type type = inputColumn.getType();
|
121
|
+
ColumnConfig columnConfig = getColumnConfig(name, columnConfigs);
|
122
|
+
if (columnConfig != null) {
|
123
|
+
type = columnConfig.getType();
|
124
|
+
}
|
125
|
+
Column outputColumn = new Column(i++, name, type);
|
126
|
+
builder.add(outputColumn);
|
127
|
+
}
|
128
|
+
return new Schema(builder.build());
|
129
|
+
}
|
130
|
+
|
131
|
+
private ColumnConfig getColumnConfig(String name, List<ColumnConfig> columnConfigs)
|
132
|
+
{
|
133
|
+
// hash should be faster, though
|
134
|
+
for (ColumnConfig columnConfig : columnConfigs) {
|
135
|
+
if (columnConfig.getName().equals(name)) {
|
136
|
+
return columnConfig;
|
137
|
+
}
|
138
|
+
}
|
139
|
+
return null;
|
74
140
|
}
|
75
141
|
|
76
142
|
@Override
|
@@ -82,7 +148,7 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
|
|
82
148
|
return new PageOutput() {
|
83
149
|
private PageReader pageReader = new PageReader(inputSchema);
|
84
150
|
private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
85
|
-
private ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, pageReader, pageBuilder);
|
151
|
+
private ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, inputSchema, outputSchema, pageReader, pageBuilder);
|
86
152
|
|
87
153
|
@Override
|
88
154
|
public void finish()
|
@@ -102,7 +168,7 @@ public class TimestampFormatFilterPlugin implements FilterPlugin
|
|
102
168
|
pageReader.setPage(page);
|
103
169
|
|
104
170
|
while (pageReader.nextRecord()) {
|
105
|
-
|
171
|
+
inputSchema.visitColumns(visitor);
|
106
172
|
pageBuilder.addRecord();
|
107
173
|
}
|
108
174
|
}
|
@@ -10,10 +10,10 @@ import org.embulk.filter.timestamp_format.TimestampFormatFilterPlugin.PluginTask
|
|
10
10
|
import org.embulk.spi.time.JRubyTimeParserHelper;
|
11
11
|
import org.embulk.spi.time.JRubyTimeParserHelperFactory;
|
12
12
|
import org.embulk.spi.time.Timestamp;
|
13
|
-
import org.embulk.spi.time.TimestampParseException;
|
14
13
|
|
15
14
|
import static org.embulk.spi.time.TimestampFormat.parseDateTimeZone;
|
16
15
|
|
16
|
+
import org.embulk.spi.time.TimestampParseException;
|
17
17
|
import org.joda.time.DateTimeZone;
|
18
18
|
import org.jruby.embed.ScriptingContainer;
|
19
19
|
|
@@ -0,0 +1,59 @@
|
|
1
|
+
package org.embulk.filter.timestamp_format.cast;
|
2
|
+
|
3
|
+
import org.embulk.filter.timestamp_format.TimestampFormatter;
|
4
|
+
import org.embulk.filter.timestamp_format.TimestampParser;
|
5
|
+
import org.embulk.spi.DataException;
|
6
|
+
import org.embulk.spi.time.Timestamp;
|
7
|
+
import org.embulk.spi.time.TimestampParseException;
|
8
|
+
|
9
|
+
public class StringCast
|
10
|
+
{
|
11
|
+
private StringCast() {}
|
12
|
+
|
13
|
+
private static String buildErrorMessage(String value)
|
14
|
+
{
|
15
|
+
return String.format("failed to parse string: \"%s\"", value);
|
16
|
+
}
|
17
|
+
|
18
|
+
public static String asString(String value, TimestampParser parser, TimestampFormatter formatter) throws DataException
|
19
|
+
{
|
20
|
+
try {
|
21
|
+
Timestamp timestamp = parser.parse(value);
|
22
|
+
return formatter.format(timestamp);
|
23
|
+
}
|
24
|
+
catch (TimestampParseException ex) {
|
25
|
+
throw new DataException(buildErrorMessage(value), ex);
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
|
30
|
+
{
|
31
|
+
try {
|
32
|
+
return parser.parse(value);
|
33
|
+
}
|
34
|
+
catch (TimestampParseException ex) {
|
35
|
+
throw new DataException(buildErrorMessage(value), ex);
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
39
|
+
public static long asLong(String value, TimestampParser parser) throws DataException
|
40
|
+
{
|
41
|
+
try {
|
42
|
+
Timestamp timestamp = parser.parse(value);
|
43
|
+
return timestamp.getEpochSecond();
|
44
|
+
}
|
45
|
+
catch (TimestampParseException ex) {
|
46
|
+
throw new DataException(buildErrorMessage(value), ex);
|
47
|
+
}
|
48
|
+
}
|
49
|
+
public static double asDouble(String value, TimestampParser parser) throws DataException
|
50
|
+
{
|
51
|
+
try {
|
52
|
+
Timestamp timestamp = parser.parse(value);
|
53
|
+
return TimestampCast.asDouble(timestamp);
|
54
|
+
}
|
55
|
+
catch (TimestampParseException ex) {
|
56
|
+
throw new DataException(buildErrorMessage(value), ex);
|
57
|
+
}
|
58
|
+
}
|
59
|
+
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
package org.embulk.filter.timestamp_format.cast;
|
2
|
+
|
3
|
+
import org.embulk.filter.timestamp_format.TimestampFormatter;
|
4
|
+
import org.embulk.spi.DataException;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
|
7
|
+
public class TimestampCast
|
8
|
+
{
|
9
|
+
private TimestampCast() {}
|
10
|
+
|
11
|
+
public static String asString(Timestamp value, TimestampFormatter formatter) throws DataException
|
12
|
+
{
|
13
|
+
return formatter.format(value);
|
14
|
+
}
|
15
|
+
|
16
|
+
public static Timestamp asTimestamp(Timestamp value) throws DataException
|
17
|
+
{
|
18
|
+
return value;
|
19
|
+
}
|
20
|
+
|
21
|
+
public static long asLong(Timestamp value) throws DataException
|
22
|
+
{
|
23
|
+
return value.getEpochSecond();
|
24
|
+
}
|
25
|
+
|
26
|
+
public static double asDouble(Timestamp value) throws DataException
|
27
|
+
{
|
28
|
+
long epoch = value.getEpochSecond();
|
29
|
+
int nano = value.getNano();
|
30
|
+
return (double) epoch + ((double) nano / 1000000000.0);
|
31
|
+
}
|
32
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-timestamp_format
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
version: '10.0'
|
39
39
|
prerelease: false
|
40
40
|
type: :development
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: embulk-parser-jsonl
|
43
|
-
version_requirements: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
-
requirements:
|
50
|
-
- - '>='
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
version: '0'
|
53
|
-
prerelease: false
|
54
|
-
type: :development
|
55
41
|
description: A filter plugin for Embulk to change timestamp format.
|
56
42
|
email:
|
57
43
|
- sonots@gmail.com
|
@@ -66,19 +52,27 @@ files:
|
|
66
52
|
- README.md
|
67
53
|
- build.gradle
|
68
54
|
- config/checkstyle/checkstyle.xml
|
69
|
-
- example/example.jsonl
|
70
55
|
- example/example.yml
|
56
|
+
- example/json_example.jsonl
|
57
|
+
- example/json_example.yml
|
58
|
+
- example/string_example.yml
|
59
|
+
- example/timestamp_example.yml
|
71
60
|
- gradle/wrapper/gradle-wrapper.jar
|
72
61
|
- gradle/wrapper/gradle-wrapper.properties
|
73
62
|
- gradlew
|
74
63
|
- gradlew.bat
|
75
64
|
- lib/embulk/filter/timestamp_format.rb
|
65
|
+
- src/main/java/org/embulk/filter/timestamp_format/ColumnCaster.java
|
76
66
|
- src/main/java/org/embulk/filter/timestamp_format/ColumnVisitorImpl.java
|
67
|
+
- src/main/java/org/embulk/filter/timestamp_format/JsonCaster.java
|
68
|
+
- src/main/java/org/embulk/filter/timestamp_format/JsonVisitor.java
|
77
69
|
- src/main/java/org/embulk/filter/timestamp_format/TimestampFormatFilterPlugin.java
|
78
70
|
- src/main/java/org/embulk/filter/timestamp_format/TimestampFormatter.java
|
79
71
|
- src/main/java/org/embulk/filter/timestamp_format/TimestampParser.java
|
72
|
+
- src/main/java/org/embulk/filter/timestamp_format/cast/StringCast.java
|
73
|
+
- src/main/java/org/embulk/filter/timestamp_format/cast/TimestampCast.java
|
80
74
|
- src/test/java/org/embulk/filter/TestTimestampFormatFilterPlugin.java
|
81
|
-
- classpath/embulk-filter-timestamp_format-0.1.4.jar
|
75
|
+
- classpath/embulk-filter-timestamp_format-0.1.5.jar
|
82
76
|
homepage: https://github.com/sonots/embulk-filter-timestamp_format
|
83
77
|
licenses:
|
84
78
|
- MIT
|
data/example/example.jsonl
DELETED
@@ -1,2 +0,0 @@
|
|
1
|
-
{"timestamp":"2015-07-12 15:00:00 UTC","record":{"record":[{"timestamp":"2015-07-12 15:00:00 UTC"}]},"ignore_record":{"timestamp":"2015-07-12 15:00:00 UTC"}}
|
2
|
-
{"timestamp":"2015-07-12 15:00:00.1 UTC","record":{"record":[{"timestamp":"2015-07-12 15:00:00.1 UTC"}]},"ignore_record":{"timestamp":"2015-07-12 15:00:00.1 UTC"}}
|