embulk-filter-icu4j 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 09261334b3c9f968efa092671477b1a0a5078406
4
- data.tar.gz: e385274276530797fd6a23625d057d94a4994cb1
3
+ metadata.gz: a3ad23292fec87d922796c9d337b198874e682ca
4
+ data.tar.gz: 7814cc24b2e3d7dc50c80b8cece4edfd5178d530
5
5
  SHA512:
6
- metadata.gz: 1958ce9f826198b577bc4ed394887c805f2b5c51057e308115b913f4d8cf6d846f02df60d1522e012d80903e8ee9b84213798a9d03b65017888311ca18399bd2
7
- data.tar.gz: 948f5bdc9a54aba692e5852e067acabaf3a5050681a07a1b6c9eb75de057dcec15eeaaa9cc18f5aa56a77a7ed62beaf99653c378bcad66de9cea7dbf7a6373bf
6
+ metadata.gz: 60f43f75a52dab4228e2133c96ba93d85c72824e5cc4a8c81875acf045d5d76a9a0a348850d4e2b597d18b6d4077db0f279a23e96448b7cdf09dbe13425803a5
7
+ data.tar.gz: c6645ca93ddf302582cb82cf120f746a82a197afbf02081b57680e31155493c056159d171c6fa5e250578a2e80ea43d911371a29548794643884e3652625dfc8
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # Icu4j filter plugin for Embulk
2
2
 
3
+ Normalizes Unicode string values.
4
+
3
5
  Icu4j filter plugin for Embulk.
4
6
  See http://site.icu-project.org/
5
7
 
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.3.0"
17
+ version = "0.3.1"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -77,8 +77,8 @@ Gem::Specification.new do |spec|
77
77
  spec.name = "${project.name}"
78
78
  spec.version = "${project.version}"
79
79
  spec.authors = ["toyama0919"]
80
- spec.summary = %[Icu4j filter plugin for Embulk]
81
- spec.description = %[Icu4j filter plugin for Embulk. see http://site.icu-project.org/]
80
+ spec.summary = %[Unicode normalize string value.]
81
+ spec.description = %[Unicode normalize string value. see http://site.icu-project.org/]
82
82
  spec.email = ["toyama0919@gmail.com"]
83
83
  spec.licenses = ["MIT"]
84
84
  spec.homepage = "https://github.com/toyama0919/embulk-filter-icu4j"
@@ -9,19 +9,13 @@ import org.embulk.config.ConfigSource;
9
9
  import org.embulk.config.Task;
10
10
  import org.embulk.config.TaskSource;
11
11
  import org.embulk.spi.Column;
12
- import org.embulk.spi.Exec;
13
12
  import org.embulk.spi.FilterPlugin;
14
- import org.embulk.spi.Page;
15
- import org.embulk.spi.PageBuilder;
16
13
  import org.embulk.spi.PageOutput;
17
- import org.embulk.spi.PageReader;
18
14
  import org.embulk.spi.Schema;
19
15
  import org.embulk.spi.type.Types;
20
16
 
21
17
  import com.google.common.base.MoreObjects;
22
18
  import com.google.common.collect.ImmutableList;
23
- import com.google.common.collect.Lists;
24
- import com.ibm.icu.text.Transliterator;
25
19
 
26
20
  public class Icu4jFilterPlugin implements FilterPlugin
27
21
  {
@@ -44,6 +38,15 @@ public class Icu4jFilterPlugin implements FilterPlugin
44
38
  {
45
39
  PluginTask task = config.loadConfig(PluginTask.class);
46
40
 
41
+ control.run(task.dump(), buildOutputSchema(task, inputSchema));
42
+ }
43
+
44
+ /**
45
+ * @param inputSchema
46
+ * @param task
47
+ * @return the output schema built from the collected columns
48
+ */
49
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
47
50
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
48
51
  int i = 0;
49
52
  if (task.getKeepInput()) {
@@ -66,103 +69,12 @@ public class Icu4jFilterPlugin implements FilterPlugin
66
69
  }
67
70
  }
68
71
  Schema outputSchema = new Schema(builder.build());
69
- control.run(task.dump(), outputSchema);
72
+ return outputSchema;
70
73
  }
71
74
 
72
75
  @Override
73
76
  public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
74
77
  {
75
- final PluginTask task = taskSource.loadTask(PluginTask.class);
76
- final List<Column> keyNameColumns = Lists.newArrayList();
77
- for (String keyName : task.getKeyNames()) {
78
- keyNameColumns.add(inputSchema.lookupColumn(keyName));
79
- }
80
- final List<List<Transliterator>> transliterators = Lists.newArrayList();
81
- for (Map<String, String> setting : task.getSettings()) {
82
- List<Transliterator> tokenizers = Lists.newArrayList();
83
- for (String convertType : setting.get("transliterators").split(",")) {
84
- Transliterator transliterator = Transliterator.getInstance(convertType);
85
- tokenizers.add(transliterator);
86
- }
87
- transliterators.add(tokenizers);
88
- }
89
-
90
- return new PageOutput() {
91
- private PageReader reader = new PageReader(inputSchema);
92
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
93
-
94
- @Override
95
- public void finish() {
96
- builder.finish();
97
- }
98
-
99
- @Override
100
- public void close() {
101
- builder.close();
102
- }
103
-
104
- @Override
105
- public void add(Page page) {
106
- reader.setPage(page);
107
- while (reader.nextRecord()) {
108
- if (task.getKeepInput()) {
109
- for (Column inputColumn: inputSchema.getColumns()) {
110
- if (reader.isNull(inputColumn)) {
111
- builder.setNull(inputColumn);
112
- continue;
113
- }
114
- if (Types.STRING.equals(inputColumn.getType())) {
115
- builder.setString(inputColumn, reader.getString(inputColumn));
116
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
117
- builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
118
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
119
- builder.setDouble(inputColumn, reader.getDouble(inputColumn));
120
- } else if (Types.LONG.equals(inputColumn.getType())) {
121
- builder.setLong(inputColumn, reader.getLong(inputColumn));
122
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
123
- builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
124
- } else if (Types.JSON.equals(inputColumn.getType())) {
125
- builder.setJson(inputColumn, reader.getJson(inputColumn));
126
- }
127
- }
128
- }
129
-
130
- List<Map<String, String>> settings = task.getSettings();
131
- for (Column column : keyNameColumns) {
132
- for (int i = 0; i < settings.size(); i++) {
133
- Map<String, String> setting = settings.get(i);
134
- String suffix = setting.get("suffix");
135
- Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
136
- String convert = convert(column, suffix, setting.get("case"), transliterators.get(i));
137
- if (convert == null) {
138
- builder.setNull(outputColumn);
139
- } else {
140
- builder.setString(outputColumn, convert);
141
- }
142
- }
143
- }
144
- builder.addRecord();
145
- }
146
- }
147
-
148
- /**
149
- * @param column
150
- * @param suffix
151
- * @param type
152
- * @return
153
- */
154
- private String convert(Column column, String suffix, String type, List<Transliterator> transliterators) {
155
- String string = reader.getString(column);
156
- for (Transliterator transliterator : transliterators) {
157
- string = transliterator.transliterate(string);
158
- }
159
- if ("upper".equals(type)) {
160
- string = string.toUpperCase();
161
- } else if ("lower".equals(type)) {
162
- string = string.toLowerCase();
163
- }
164
- return string;
165
- }
166
- };
78
+ return new Icu4jPageOutput(taskSource, inputSchema, outputSchema, output);
167
79
  }
168
80
  }
@@ -0,0 +1,129 @@
1
+ package org.embulk.filter.icu4j;
2
+
3
+ import java.util.List;
4
+ import java.util.Map;
5
+
6
+ import org.embulk.config.TaskSource;
7
+ import org.embulk.filter.icu4j.Icu4jFilterPlugin.PluginTask;
8
+ import org.embulk.spi.Column;
9
+ import org.embulk.spi.Exec;
10
+ import org.embulk.spi.Page;
11
+ import org.embulk.spi.PageBuilder;
12
+ import org.embulk.spi.PageOutput;
13
+ import org.embulk.spi.PageReader;
14
+ import org.embulk.spi.Schema;
15
+ import org.embulk.spi.type.Types;
16
+ import org.slf4j.Logger;
17
+
18
+ import com.google.common.base.MoreObjects;
19
+ import com.google.common.collect.Lists;
20
+ import com.ibm.icu.text.Transliterator;
21
+
22
+ public class Icu4jPageOutput implements PageOutput
23
+ {
24
+ private final PluginTask task;
25
+ private final List<Column> keyNameColumns;
26
+ private final List<List<Transliterator>> transliteratorsList;
27
+ private final PageReader reader;
28
+ private final PageBuilder builder;
29
+ private final Schema inputSchema;
30
+ private final Schema outputSchema;
31
+ private static final Logger logger = Exec.getLogger(Icu4jFilterPlugin.class);
32
+
33
+ public Icu4jPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
34
+ this.task = taskSource.loadTask(PluginTask.class);
35
+ this.keyNameColumns = Lists.newArrayList();
36
+ this.transliteratorsList = Lists.newArrayList();
37
+ this.inputSchema = inputSchema;
38
+ this.outputSchema = outputSchema;
39
+
40
+ for (String keyName : task.getKeyNames()) {
41
+ keyNameColumns.add(inputSchema.lookupColumn(keyName));
42
+ }
43
+ for (Map<String, String> setting : task.getSettings()) {
44
+ List<Transliterator> tokenizers = Lists.newArrayList();
45
+ for (String convertType : setting.get("transliterators").split(",")) {
46
+ Transliterator transliterator = Transliterator.getInstance(convertType);
47
+ tokenizers.add(transliterator);
48
+ }
49
+ transliteratorsList.add(tokenizers);
50
+ }
51
+ reader = new PageReader(inputSchema);
52
+ builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
53
+ }
54
+
55
+ @Override
56
+ public void finish() {
57
+ builder.finish();
58
+ }
59
+
60
+ @Override
61
+ public void close() {
62
+ builder.close();
63
+ }
64
+
65
+ @Override
66
+ public void add(Page page) {
67
+ reader.setPage(page);
68
+ while (reader.nextRecord()) {
69
+ if (task.getKeepInput()) {
70
+ for (Column inputColumn: inputSchema.getColumns()) {
71
+ if (reader.isNull(inputColumn)) {
72
+ builder.setNull(inputColumn);
73
+ continue;
74
+ }
75
+ if (Types.STRING.equals(inputColumn.getType())) {
76
+ builder.setString(inputColumn, reader.getString(inputColumn));
77
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
78
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
79
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
80
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
81
+ } else if (Types.LONG.equals(inputColumn.getType())) {
82
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
83
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
84
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
85
+ } else if (Types.JSON.equals(inputColumn.getType())) {
86
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
87
+ }
88
+ }
89
+ }
90
+
91
+ List<Map<String, String>> settings = task.getSettings();
92
+ for (Column column : keyNameColumns) {
93
+ for (int i = 0; i < settings.size(); i++) {
94
+ Map<String, String> setting = settings.get(i);
95
+ String suffix = setting.get("suffix");
96
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
97
+ final String source = reader.getString(column);
98
+ final List<Transliterator> transliterators = transliteratorsList.get(i);
99
+ String converted = convert(source, suffix, setting.get("case"), transliterators);
100
+ logger.debug("before => [{}], after => [{}]", source, converted);
101
+ if (converted == null) {
102
+ builder.setNull(outputColumn);
103
+ } else {
104
+ builder.setString(outputColumn, converted);
105
+ }
106
+ }
107
+ }
108
+ builder.addRecord();
109
+ }
110
+ }
111
+
112
+ /**
113
+ * @param string the source value to convert
114
+ * @param suffix the output column suffix (not used by this method)
115
+ * @param type case conversion to apply: "upper", "lower", or any other value for none
116
+ * @return the value after applying each transliterator in order and the case conversion
117
+ */
118
+ private String convert(String string, String suffix, String type, List<Transliterator> transliterators) {
119
+ for (Transliterator transliterator : transliterators) {
120
+ string = transliterator.transliterate(string);
121
+ }
122
+ if ("upper".equals(type)) {
123
+ string = string.toUpperCase();
124
+ } else if ("lower".equals(type)) {
125
+ string = string.toLowerCase();
126
+ }
127
+ return string;
128
+ }
129
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-icu4j
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-10 00:00:00.000000000 Z
11
+ date: 2016-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -38,7 +38,7 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: Icu4j filter plugin for Embulk. see http://site.icu-project.org/
41
+ description: Unicode normalize string value. see http://site.icu-project.org/
42
42
  email:
43
43
  - toyama0919@gmail.com
44
44
  executables: []
@@ -58,8 +58,9 @@ files:
58
58
  - gradlew.bat
59
59
  - lib/embulk/filter/icu4j.rb
60
60
  - src/main/java/org/embulk/filter/icu4j/Icu4jFilterPlugin.java
61
+ - src/main/java/org/embulk/filter/icu4j/Icu4jPageOutput.java
61
62
  - src/test/java/org/embulk/filter/icu4j/TestIcu4jFilterPlugin.java
62
- - classpath/embulk-filter-icu4j-0.3.0.jar
63
+ - classpath/embulk-filter-icu4j-0.3.1.jar
63
64
  - classpath/icu4j-56.1.jar
64
65
  homepage: https://github.com/toyama0919/embulk-filter-icu4j
65
66
  licenses:
@@ -84,6 +85,6 @@ rubyforge_project:
84
85
  rubygems_version: 2.1.9
85
86
  signing_key:
86
87
  specification_version: 4
87
- summary: Icu4j filter plugin for Embulk
88
+ summary: Unicode normalize string value.
88
89
  test_files: []
89
90
  has_rdoc: