embulk-filter-icu4j 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3ad23292fec87d922796c9d337b198874e682ca
|
4
|
+
data.tar.gz: 7814cc24b2e3d7dc50c80b8cece4edfd5178d530
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60f43f75a52dab4228e2133c96ba93d85c72824e5cc4a8c81875acf045d5d76a9a0a348850d4e2b597d18b6d4077db0f279a23e96448b7cdf09dbe13425803a5
|
7
|
+
data.tar.gz: c6645ca93ddf302582cb82cf120f746a82a197afbf02081b57680e31155493c056159d171c6fa5e250578a2e80ea43d911371a29548794643884e3652625dfc8
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -14,7 +14,7 @@ configurations {
|
|
14
14
|
provided
|
15
15
|
}
|
16
16
|
|
17
|
-
version = "0.3.0"
|
17
|
+
version = "0.3.1"
|
18
18
|
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
@@ -77,8 +77,8 @@ Gem::Specification.new do |spec|
|
|
77
77
|
spec.name = "${project.name}"
|
78
78
|
spec.version = "${project.version}"
|
79
79
|
spec.authors = ["toyama0919"]
|
80
|
-
spec.summary = %[
|
81
|
-
spec.description = %[
|
80
|
+
spec.summary = %[Unicode normalize string value.]
|
81
|
+
spec.description = %[Unicode normalize string value. see http://site.icu-project.org/]
|
82
82
|
spec.email = ["toyama0919@gmail.com"]
|
83
83
|
spec.licenses = ["MIT"]
|
84
84
|
spec.homepage = "https://github.com/toyama0919/embulk-filter-icu4j"
|
@@ -9,19 +9,13 @@ import org.embulk.config.ConfigSource;
|
|
9
9
|
import org.embulk.config.Task;
|
10
10
|
import org.embulk.config.TaskSource;
|
11
11
|
import org.embulk.spi.Column;
|
12
|
-
import org.embulk.spi.Exec;
|
13
12
|
import org.embulk.spi.FilterPlugin;
|
14
|
-
import org.embulk.spi.Page;
|
15
|
-
import org.embulk.spi.PageBuilder;
|
16
13
|
import org.embulk.spi.PageOutput;
|
17
|
-
import org.embulk.spi.PageReader;
|
18
14
|
import org.embulk.spi.Schema;
|
19
15
|
import org.embulk.spi.type.Types;
|
20
16
|
|
21
17
|
import com.google.common.base.MoreObjects;
|
22
18
|
import com.google.common.collect.ImmutableList;
|
23
|
-
import com.google.common.collect.Lists;
|
24
|
-
import com.ibm.icu.text.Transliterator;
|
25
19
|
|
26
20
|
public class Icu4jFilterPlugin implements FilterPlugin
|
27
21
|
{
|
@@ -44,6 +38,15 @@ public class Icu4jFilterPlugin implements FilterPlugin
|
|
44
38
|
{
|
45
39
|
PluginTask task = config.loadConfig(PluginTask.class);
|
46
40
|
|
41
|
+
control.run(task.dump(), buildOutputSchema(task, inputSchema));
|
42
|
+
}
|
43
|
+
|
44
|
+
/**
|
45
|
+
* @param inputSchema
|
46
|
+
* @param task
|
47
|
+
* @return
|
48
|
+
*/
|
49
|
+
private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
|
47
50
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
48
51
|
int i = 0;
|
49
52
|
if (task.getKeepInput()) {
|
@@ -66,103 +69,12 @@ public class Icu4jFilterPlugin implements FilterPlugin
|
|
66
69
|
}
|
67
70
|
}
|
68
71
|
Schema outputSchema = new Schema(builder.build());
|
69
|
-
|
72
|
+
return outputSchema;
|
70
73
|
}
|
71
74
|
|
72
75
|
@Override
|
73
76
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
74
77
|
{
|
75
|
-
|
76
|
-
final List<Column> keyNameColumns = Lists.newArrayList();
|
77
|
-
for (String keyName : task.getKeyNames()) {
|
78
|
-
keyNameColumns.add(inputSchema.lookupColumn(keyName));
|
79
|
-
}
|
80
|
-
final List<List<Transliterator>> transliterators = Lists.newArrayList();
|
81
|
-
for (Map<String, String> setting : task.getSettings()) {
|
82
|
-
List<Transliterator> tokenizers = Lists.newArrayList();
|
83
|
-
for (String convertType : setting.get("transliterators").split(",")) {
|
84
|
-
Transliterator transliterator = Transliterator.getInstance(convertType);
|
85
|
-
tokenizers.add(transliterator);
|
86
|
-
}
|
87
|
-
transliterators.add(tokenizers);
|
88
|
-
}
|
89
|
-
|
90
|
-
return new PageOutput() {
|
91
|
-
private PageReader reader = new PageReader(inputSchema);
|
92
|
-
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
93
|
-
|
94
|
-
@Override
|
95
|
-
public void finish() {
|
96
|
-
builder.finish();
|
97
|
-
}
|
98
|
-
|
99
|
-
@Override
|
100
|
-
public void close() {
|
101
|
-
builder.close();
|
102
|
-
}
|
103
|
-
|
104
|
-
@Override
|
105
|
-
public void add(Page page) {
|
106
|
-
reader.setPage(page);
|
107
|
-
while (reader.nextRecord()) {
|
108
|
-
if (task.getKeepInput()) {
|
109
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
110
|
-
if (reader.isNull(inputColumn)) {
|
111
|
-
builder.setNull(inputColumn);
|
112
|
-
continue;
|
113
|
-
}
|
114
|
-
if (Types.STRING.equals(inputColumn.getType())) {
|
115
|
-
builder.setString(inputColumn, reader.getString(inputColumn));
|
116
|
-
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
117
|
-
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
118
|
-
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
119
|
-
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
120
|
-
} else if (Types.LONG.equals(inputColumn.getType())) {
|
121
|
-
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
122
|
-
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
123
|
-
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
124
|
-
} else if (Types.JSON.equals(inputColumn.getType())) {
|
125
|
-
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
126
|
-
}
|
127
|
-
}
|
128
|
-
}
|
129
|
-
|
130
|
-
List<Map<String, String>> settings = task.getSettings();
|
131
|
-
for (Column column : keyNameColumns) {
|
132
|
-
for (int i = 0; i < settings.size(); i++) {
|
133
|
-
Map<String, String> setting = settings.get(i);
|
134
|
-
String suffix = setting.get("suffix");
|
135
|
-
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
136
|
-
String convert = convert(column, suffix, setting.get("case"), transliterators.get(i));
|
137
|
-
if (convert == null) {
|
138
|
-
builder.setNull(outputColumn);
|
139
|
-
} else {
|
140
|
-
builder.setString(outputColumn, convert);
|
141
|
-
}
|
142
|
-
}
|
143
|
-
}
|
144
|
-
builder.addRecord();
|
145
|
-
}
|
146
|
-
}
|
147
|
-
|
148
|
-
/**
|
149
|
-
* @param column
|
150
|
-
* @param suffix
|
151
|
-
* @param type
|
152
|
-
* @return
|
153
|
-
*/
|
154
|
-
private String convert(Column column, String suffix, String type, List<Transliterator> transliterators) {
|
155
|
-
String string = reader.getString(column);
|
156
|
-
for (Transliterator transliterator : transliterators) {
|
157
|
-
string = transliterator.transliterate(string);
|
158
|
-
}
|
159
|
-
if ("upper".equals(type)) {
|
160
|
-
string = string.toUpperCase();
|
161
|
-
} else if ("lower".equals(type)) {
|
162
|
-
string = string.toLowerCase();
|
163
|
-
}
|
164
|
-
return string;
|
165
|
-
}
|
166
|
-
};
|
78
|
+
return new Icu4jPageOutput(taskSource, inputSchema, outputSchema, output);
|
167
79
|
}
|
168
80
|
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
package org.embulk.filter.icu4j;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.Map;
|
5
|
+
|
6
|
+
import org.embulk.config.TaskSource;
|
7
|
+
import org.embulk.filter.icu4j.Icu4jFilterPlugin.PluginTask;
|
8
|
+
import org.embulk.spi.Column;
|
9
|
+
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.Page;
|
11
|
+
import org.embulk.spi.PageBuilder;
|
12
|
+
import org.embulk.spi.PageOutput;
|
13
|
+
import org.embulk.spi.PageReader;
|
14
|
+
import org.embulk.spi.Schema;
|
15
|
+
import org.embulk.spi.type.Types;
|
16
|
+
import org.slf4j.Logger;
|
17
|
+
|
18
|
+
import com.google.common.base.MoreObjects;
|
19
|
+
import com.google.common.collect.Lists;
|
20
|
+
import com.ibm.icu.text.Transliterator;
|
21
|
+
|
22
|
+
public class Icu4jPageOutput implements PageOutput
|
23
|
+
{
|
24
|
+
private final PluginTask task;
|
25
|
+
private final List<Column> keyNameColumns;
|
26
|
+
private final List<List<Transliterator>> transliteratorsList;
|
27
|
+
private final PageReader reader;
|
28
|
+
private final PageBuilder builder;
|
29
|
+
private final Schema inputSchema;
|
30
|
+
private final Schema outputSchema;
|
31
|
+
private static final Logger logger = Exec.getLogger(Icu4jFilterPlugin.class);
|
32
|
+
|
33
|
+
public Icu4jPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
34
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
35
|
+
this.keyNameColumns = Lists.newArrayList();
|
36
|
+
this.transliteratorsList = Lists.newArrayList();
|
37
|
+
this.inputSchema = inputSchema;
|
38
|
+
this.outputSchema = outputSchema;
|
39
|
+
|
40
|
+
for (String keyName : task.getKeyNames()) {
|
41
|
+
keyNameColumns.add(inputSchema.lookupColumn(keyName));
|
42
|
+
}
|
43
|
+
for (Map<String, String> setting : task.getSettings()) {
|
44
|
+
List<Transliterator> tokenizers = Lists.newArrayList();
|
45
|
+
for (String convertType : setting.get("transliterators").split(",")) {
|
46
|
+
Transliterator transliterator = Transliterator.getInstance(convertType);
|
47
|
+
tokenizers.add(transliterator);
|
48
|
+
}
|
49
|
+
transliteratorsList.add(tokenizers);
|
50
|
+
}
|
51
|
+
reader = new PageReader(inputSchema);
|
52
|
+
builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public void finish() {
|
57
|
+
builder.finish();
|
58
|
+
}
|
59
|
+
|
60
|
+
@Override
|
61
|
+
public void close() {
|
62
|
+
builder.close();
|
63
|
+
}
|
64
|
+
|
65
|
+
@Override
|
66
|
+
public void add(Page page) {
|
67
|
+
reader.setPage(page);
|
68
|
+
while (reader.nextRecord()) {
|
69
|
+
if (task.getKeepInput()) {
|
70
|
+
for (Column inputColumn: inputSchema.getColumns()) {
|
71
|
+
if (reader.isNull(inputColumn)) {
|
72
|
+
builder.setNull(inputColumn);
|
73
|
+
continue;
|
74
|
+
}
|
75
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
76
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
77
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
78
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
79
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
80
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
81
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
82
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
83
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
84
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
85
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
86
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
87
|
+
}
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
List<Map<String, String>> settings = task.getSettings();
|
92
|
+
for (Column column : keyNameColumns) {
|
93
|
+
for (int i = 0; i < settings.size(); i++) {
|
94
|
+
Map<String, String> setting = settings.get(i);
|
95
|
+
String suffix = setting.get("suffix");
|
96
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
97
|
+
final String source = reader.getString(column);
|
98
|
+
final List<Transliterator> transliterators = transliteratorsList.get(i);
|
99
|
+
String converted = convert(source, suffix, setting.get("case"), transliterators);
|
100
|
+
logger.debug("before => [{}], after => [{}]", source, converted);
|
101
|
+
if (converted == null) {
|
102
|
+
builder.setNull(outputColumn);
|
103
|
+
} else {
|
104
|
+
builder.setString(outputColumn, converted);
|
105
|
+
}
|
106
|
+
}
|
107
|
+
}
|
108
|
+
builder.addRecord();
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
/**
|
113
|
+
* @param column
|
114
|
+
* @param suffix
|
115
|
+
* @param type
|
116
|
+
* @return
|
117
|
+
*/
|
118
|
+
private String convert(String string, String suffix, String type, List<Transliterator> transliterators) {
|
119
|
+
for (Transliterator transliterator : transliterators) {
|
120
|
+
string = transliterator.transliterate(string);
|
121
|
+
}
|
122
|
+
if ("upper".equals(type)) {
|
123
|
+
string = string.toUpperCase();
|
124
|
+
} else if ("lower".equals(type)) {
|
125
|
+
string = string.toLowerCase();
|
126
|
+
}
|
127
|
+
return string;
|
128
|
+
}
|
129
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-icu4j
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description:
|
41
|
+
description: Unicode normalize string value. see http://site.icu-project.org/
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
44
44
|
executables: []
|
@@ -58,8 +58,9 @@ files:
|
|
58
58
|
- gradlew.bat
|
59
59
|
- lib/embulk/filter/icu4j.rb
|
60
60
|
- src/main/java/org/embulk/filter/icu4j/Icu4jFilterPlugin.java
|
61
|
+
- src/main/java/org/embulk/filter/icu4j/Icu4jPageOutput.java
|
61
62
|
- src/test/java/org/embulk/filter/icu4j/TestIcu4jFilterPlugin.java
|
62
|
-
- classpath/embulk-filter-icu4j-0.3.0.jar
|
63
|
+
- classpath/embulk-filter-icu4j-0.3.1.jar
|
63
64
|
- classpath/icu4j-56.1.jar
|
64
65
|
homepage: https://github.com/toyama0919/embulk-filter-icu4j
|
65
66
|
licenses:
|
@@ -84,6 +85,6 @@ rubyforge_project:
|
|
84
85
|
rubygems_version: 2.1.9
|
85
86
|
signing_key:
|
86
87
|
specification_version: 4
|
87
|
-
summary:
|
88
|
+
summary: Unicode normalize string value.
|
88
89
|
test_files: []
|
89
90
|
has_rdoc:
|