embulk-filter-icu4j 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3ad23292fec87d922796c9d337b198874e682ca
|
4
|
+
data.tar.gz: 7814cc24b2e3d7dc50c80b8cece4edfd5178d530
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60f43f75a52dab4228e2133c96ba93d85c72824e5cc4a8c81875acf045d5d76a9a0a348850d4e2b597d18b6d4077db0f279a23e96448b7cdf09dbe13425803a5
|
7
|
+
data.tar.gz: c6645ca93ddf302582cb82cf120f746a82a197afbf02081b57680e31155493c056159d171c6fa5e250578a2e80ea43d911371a29548794643884e3652625dfc8
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -14,7 +14,7 @@ configurations {
|
|
14
14
|
provided
|
15
15
|
}
|
16
16
|
|
17
|
-
version = "0.3.0"
|
17
|
+
version = "0.3.1"
|
18
18
|
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
@@ -77,8 +77,8 @@ Gem::Specification.new do |spec|
|
|
77
77
|
spec.name = "${project.name}"
|
78
78
|
spec.version = "${project.version}"
|
79
79
|
spec.authors = ["toyama0919"]
|
80
|
-
spec.summary = %[
|
81
|
-
spec.description = %[
|
80
|
+
spec.summary = %[Unicode normalize string value.]
|
81
|
+
spec.description = %[Unicode normalize string value. see http://site.icu-project.org/]
|
82
82
|
spec.email = ["toyama0919@gmail.com"]
|
83
83
|
spec.licenses = ["MIT"]
|
84
84
|
spec.homepage = "https://github.com/toyama0919/embulk-filter-icu4j"
|
@@ -9,19 +9,13 @@ import org.embulk.config.ConfigSource;
|
|
9
9
|
import org.embulk.config.Task;
|
10
10
|
import org.embulk.config.TaskSource;
|
11
11
|
import org.embulk.spi.Column;
|
12
|
-
import org.embulk.spi.Exec;
|
13
12
|
import org.embulk.spi.FilterPlugin;
|
14
|
-
import org.embulk.spi.Page;
|
15
|
-
import org.embulk.spi.PageBuilder;
|
16
13
|
import org.embulk.spi.PageOutput;
|
17
|
-
import org.embulk.spi.PageReader;
|
18
14
|
import org.embulk.spi.Schema;
|
19
15
|
import org.embulk.spi.type.Types;
|
20
16
|
|
21
17
|
import com.google.common.base.MoreObjects;
|
22
18
|
import com.google.common.collect.ImmutableList;
|
23
|
-
import com.google.common.collect.Lists;
|
24
|
-
import com.ibm.icu.text.Transliterator;
|
25
19
|
|
26
20
|
public class Icu4jFilterPlugin implements FilterPlugin
|
27
21
|
{
|
@@ -44,6 +38,15 @@ public class Icu4jFilterPlugin implements FilterPlugin
|
|
44
38
|
{
|
45
39
|
PluginTask task = config.loadConfig(PluginTask.class);
|
46
40
|
|
41
|
+
control.run(task.dump(), buildOutputSchema(task, inputSchema));
|
42
|
+
}
|
43
|
+
|
44
|
+
/**
|
45
|
+
* @param inputSchema
|
46
|
+
* @param task
|
47
|
+
* @return
|
48
|
+
*/
|
49
|
+
private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
|
47
50
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
48
51
|
int i = 0;
|
49
52
|
if (task.getKeepInput()) {
|
@@ -66,103 +69,12 @@ public class Icu4jFilterPlugin implements FilterPlugin
|
|
66
69
|
}
|
67
70
|
}
|
68
71
|
Schema outputSchema = new Schema(builder.build());
|
69
|
-
|
72
|
+
return outputSchema;
|
70
73
|
}
|
71
74
|
|
72
75
|
@Override
|
73
76
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
74
77
|
{
|
75
|
-
|
76
|
-
final List<Column> keyNameColumns = Lists.newArrayList();
|
77
|
-
for (String keyName : task.getKeyNames()) {
|
78
|
-
keyNameColumns.add(inputSchema.lookupColumn(keyName));
|
79
|
-
}
|
80
|
-
final List<List<Transliterator>> transliterators = Lists.newArrayList();
|
81
|
-
for (Map<String, String> setting : task.getSettings()) {
|
82
|
-
List<Transliterator> tokenizers = Lists.newArrayList();
|
83
|
-
for (String convertType : setting.get("transliterators").split(",")) {
|
84
|
-
Transliterator transliterator = Transliterator.getInstance(convertType);
|
85
|
-
tokenizers.add(transliterator);
|
86
|
-
}
|
87
|
-
transliterators.add(tokenizers);
|
88
|
-
}
|
89
|
-
|
90
|
-
return new PageOutput() {
|
91
|
-
private PageReader reader = new PageReader(inputSchema);
|
92
|
-
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
93
|
-
|
94
|
-
@Override
|
95
|
-
public void finish() {
|
96
|
-
builder.finish();
|
97
|
-
}
|
98
|
-
|
99
|
-
@Override
|
100
|
-
public void close() {
|
101
|
-
builder.close();
|
102
|
-
}
|
103
|
-
|
104
|
-
@Override
|
105
|
-
public void add(Page page) {
|
106
|
-
reader.setPage(page);
|
107
|
-
while (reader.nextRecord()) {
|
108
|
-
if (task.getKeepInput()) {
|
109
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
110
|
-
if (reader.isNull(inputColumn)) {
|
111
|
-
builder.setNull(inputColumn);
|
112
|
-
continue;
|
113
|
-
}
|
114
|
-
if (Types.STRING.equals(inputColumn.getType())) {
|
115
|
-
builder.setString(inputColumn, reader.getString(inputColumn));
|
116
|
-
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
117
|
-
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
118
|
-
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
119
|
-
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
120
|
-
} else if (Types.LONG.equals(inputColumn.getType())) {
|
121
|
-
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
122
|
-
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
123
|
-
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
124
|
-
} else if (Types.JSON.equals(inputColumn.getType())) {
|
125
|
-
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
126
|
-
}
|
127
|
-
}
|
128
|
-
}
|
129
|
-
|
130
|
-
List<Map<String, String>> settings = task.getSettings();
|
131
|
-
for (Column column : keyNameColumns) {
|
132
|
-
for (int i = 0; i < settings.size(); i++) {
|
133
|
-
Map<String, String> setting = settings.get(i);
|
134
|
-
String suffix = setting.get("suffix");
|
135
|
-
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
136
|
-
String convert = convert(column, suffix, setting.get("case"), transliterators.get(i));
|
137
|
-
if (convert == null) {
|
138
|
-
builder.setNull(outputColumn);
|
139
|
-
} else {
|
140
|
-
builder.setString(outputColumn, convert);
|
141
|
-
}
|
142
|
-
}
|
143
|
-
}
|
144
|
-
builder.addRecord();
|
145
|
-
}
|
146
|
-
}
|
147
|
-
|
148
|
-
/**
|
149
|
-
* @param column
|
150
|
-
* @param suffix
|
151
|
-
* @param type
|
152
|
-
* @return
|
153
|
-
*/
|
154
|
-
private String convert(Column column, String suffix, String type, List<Transliterator> transliterators) {
|
155
|
-
String string = reader.getString(column);
|
156
|
-
for (Transliterator transliterator : transliterators) {
|
157
|
-
string = transliterator.transliterate(string);
|
158
|
-
}
|
159
|
-
if ("upper".equals(type)) {
|
160
|
-
string = string.toUpperCase();
|
161
|
-
} else if ("lower".equals(type)) {
|
162
|
-
string = string.toLowerCase();
|
163
|
-
}
|
164
|
-
return string;
|
165
|
-
}
|
166
|
-
};
|
78
|
+
return new Icu4jPageOutput(taskSource, inputSchema, outputSchema, output);
|
167
79
|
}
|
168
80
|
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
package org.embulk.filter.icu4j;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.Map;
|
5
|
+
|
6
|
+
import org.embulk.config.TaskSource;
|
7
|
+
import org.embulk.filter.icu4j.Icu4jFilterPlugin.PluginTask;
|
8
|
+
import org.embulk.spi.Column;
|
9
|
+
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.Page;
|
11
|
+
import org.embulk.spi.PageBuilder;
|
12
|
+
import org.embulk.spi.PageOutput;
|
13
|
+
import org.embulk.spi.PageReader;
|
14
|
+
import org.embulk.spi.Schema;
|
15
|
+
import org.embulk.spi.type.Types;
|
16
|
+
import org.slf4j.Logger;
|
17
|
+
|
18
|
+
import com.google.common.base.MoreObjects;
|
19
|
+
import com.google.common.collect.Lists;
|
20
|
+
import com.ibm.icu.text.Transliterator;
|
21
|
+
|
22
|
+
public class Icu4jPageOutput implements PageOutput
|
23
|
+
{
|
24
|
+
private final PluginTask task;
|
25
|
+
private final List<Column> keyNameColumns;
|
26
|
+
private final List<List<Transliterator>> transliteratorsList;
|
27
|
+
private final PageReader reader;
|
28
|
+
private final PageBuilder builder;
|
29
|
+
private final Schema inputSchema;
|
30
|
+
private final Schema outputSchema;
|
31
|
+
private static final Logger logger = Exec.getLogger(Icu4jFilterPlugin.class);
|
32
|
+
|
33
|
+
public Icu4jPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
34
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
35
|
+
this.keyNameColumns = Lists.newArrayList();
|
36
|
+
this.transliteratorsList = Lists.newArrayList();
|
37
|
+
this.inputSchema = inputSchema;
|
38
|
+
this.outputSchema = outputSchema;
|
39
|
+
|
40
|
+
for (String keyName : task.getKeyNames()) {
|
41
|
+
keyNameColumns.add(inputSchema.lookupColumn(keyName));
|
42
|
+
}
|
43
|
+
for (Map<String, String> setting : task.getSettings()) {
|
44
|
+
List<Transliterator> tokenizers = Lists.newArrayList();
|
45
|
+
for (String convertType : setting.get("transliterators").split(",")) {
|
46
|
+
Transliterator transliterator = Transliterator.getInstance(convertType);
|
47
|
+
tokenizers.add(transliterator);
|
48
|
+
}
|
49
|
+
transliteratorsList.add(tokenizers);
|
50
|
+
}
|
51
|
+
reader = new PageReader(inputSchema);
|
52
|
+
builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public void finish() {
|
57
|
+
builder.finish();
|
58
|
+
}
|
59
|
+
|
60
|
+
@Override
|
61
|
+
public void close() {
|
62
|
+
builder.close();
|
63
|
+
}
|
64
|
+
|
65
|
+
@Override
|
66
|
+
public void add(Page page) {
|
67
|
+
reader.setPage(page);
|
68
|
+
while (reader.nextRecord()) {
|
69
|
+
if (task.getKeepInput()) {
|
70
|
+
for (Column inputColumn: inputSchema.getColumns()) {
|
71
|
+
if (reader.isNull(inputColumn)) {
|
72
|
+
builder.setNull(inputColumn);
|
73
|
+
continue;
|
74
|
+
}
|
75
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
76
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
77
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
78
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
79
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
80
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
81
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
82
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
83
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
84
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
85
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
86
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
87
|
+
}
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
List<Map<String, String>> settings = task.getSettings();
|
92
|
+
for (Column column : keyNameColumns) {
|
93
|
+
for (int i = 0; i < settings.size(); i++) {
|
94
|
+
Map<String, String> setting = settings.get(i);
|
95
|
+
String suffix = setting.get("suffix");
|
96
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
97
|
+
final String source = reader.getString(column);
|
98
|
+
final List<Transliterator> transliterators = transliteratorsList.get(i);
|
99
|
+
String converted = convert(source, suffix, setting.get("case"), transliterators);
|
100
|
+
logger.debug("before => [{}], after => [{}]", source, converted);
|
101
|
+
if (converted == null) {
|
102
|
+
builder.setNull(outputColumn);
|
103
|
+
} else {
|
104
|
+
builder.setString(outputColumn, converted);
|
105
|
+
}
|
106
|
+
}
|
107
|
+
}
|
108
|
+
builder.addRecord();
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
/**
|
113
|
+
* @param column
|
114
|
+
* @param suffix
|
115
|
+
* @param type
|
116
|
+
* @return
|
117
|
+
*/
|
118
|
+
private String convert(String string, String suffix, String type, List<Transliterator> transliterators) {
|
119
|
+
for (Transliterator transliterator : transliterators) {
|
120
|
+
string = transliterator.transliterate(string);
|
121
|
+
}
|
122
|
+
if ("upper".equals(type)) {
|
123
|
+
string = string.toUpperCase();
|
124
|
+
} else if ("lower".equals(type)) {
|
125
|
+
string = string.toLowerCase();
|
126
|
+
}
|
127
|
+
return string;
|
128
|
+
}
|
129
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-icu4j
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description:
|
41
|
+
description: Unicode normalize string value. see http://site.icu-project.org/
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
44
44
|
executables: []
|
@@ -58,8 +58,9 @@ files:
|
|
58
58
|
- gradlew.bat
|
59
59
|
- lib/embulk/filter/icu4j.rb
|
60
60
|
- src/main/java/org/embulk/filter/icu4j/Icu4jFilterPlugin.java
|
61
|
+
- src/main/java/org/embulk/filter/icu4j/Icu4jPageOutput.java
|
61
62
|
- src/test/java/org/embulk/filter/icu4j/TestIcu4jFilterPlugin.java
|
62
|
-
- classpath/embulk-filter-icu4j-0.3.0.jar
|
63
|
+
- classpath/embulk-filter-icu4j-0.3.1.jar
|
63
64
|
- classpath/icu4j-56.1.jar
|
64
65
|
homepage: https://github.com/toyama0919/embulk-filter-icu4j
|
65
66
|
licenses:
|
@@ -84,6 +85,6 @@ rubyforge_project:
|
|
84
85
|
rubygems_version: 2.1.9
|
85
86
|
signing_key:
|
86
87
|
specification_version: 4
|
87
|
-
summary:
|
88
|
+
summary: Unicode normalize string value.
|
88
89
|
test_files: []
|
89
90
|
has_rdoc:
|