embulk-filter-icu4j 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 09261334b3c9f968efa092671477b1a0a5078406
4
- data.tar.gz: e385274276530797fd6a23625d057d94a4994cb1
3
+ metadata.gz: a3ad23292fec87d922796c9d337b198874e682ca
4
+ data.tar.gz: 7814cc24b2e3d7dc50c80b8cece4edfd5178d530
5
5
  SHA512:
6
- metadata.gz: 1958ce9f826198b577bc4ed394887c805f2b5c51057e308115b913f4d8cf6d846f02df60d1522e012d80903e8ee9b84213798a9d03b65017888311ca18399bd2
7
- data.tar.gz: 948f5bdc9a54aba692e5852e067acabaf3a5050681a07a1b6c9eb75de057dcec15eeaaa9cc18f5aa56a77a7ed62beaf99653c378bcad66de9cea7dbf7a6373bf
6
+ metadata.gz: 60f43f75a52dab4228e2133c96ba93d85c72824e5cc4a8c81875acf045d5d76a9a0a348850d4e2b597d18b6d4077db0f279a23e96448b7cdf09dbe13425803a5
7
+ data.tar.gz: c6645ca93ddf302582cb82cf120f746a82a197afbf02081b57680e31155493c056159d171c6fa5e250578a2e80ea43d911371a29548794643884e3652625dfc8
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # Icu4j filter plugin for Embulk
2
2
 
3
+ Unicode normalize string value.
4
+
3
5
  Icu4j filter plugin for Embulk.
4
6
  see. http://site.icu-project.org/
5
7
 
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.3.0"
17
+ version = "0.3.1"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -77,8 +77,8 @@ Gem::Specification.new do |spec|
77
77
  spec.name = "${project.name}"
78
78
  spec.version = "${project.version}"
79
79
  spec.authors = ["toyama0919"]
80
- spec.summary = %[Icu4j filter plugin for Embulk]
81
- spec.description = %[Icu4j filter plugin for Embulk. see http://site.icu-project.org/]
80
+ spec.summary = %[Unicode normalize string value.]
81
+ spec.description = %[Unicode normalize string value. see http://site.icu-project.org/]
82
82
  spec.email = ["toyama0919@gmail.com"]
83
83
  spec.licenses = ["MIT"]
84
84
  spec.homepage = "https://github.com/toyama0919/embulk-filter-icu4j"
@@ -9,19 +9,13 @@ import org.embulk.config.ConfigSource;
9
9
  import org.embulk.config.Task;
10
10
  import org.embulk.config.TaskSource;
11
11
  import org.embulk.spi.Column;
12
- import org.embulk.spi.Exec;
13
12
  import org.embulk.spi.FilterPlugin;
14
- import org.embulk.spi.Page;
15
- import org.embulk.spi.PageBuilder;
16
13
  import org.embulk.spi.PageOutput;
17
- import org.embulk.spi.PageReader;
18
14
  import org.embulk.spi.Schema;
19
15
  import org.embulk.spi.type.Types;
20
16
 
21
17
  import com.google.common.base.MoreObjects;
22
18
  import com.google.common.collect.ImmutableList;
23
- import com.google.common.collect.Lists;
24
- import com.ibm.icu.text.Transliterator;
25
19
 
26
20
  public class Icu4jFilterPlugin implements FilterPlugin
27
21
  {
@@ -44,6 +38,15 @@ public class Icu4jFilterPlugin implements FilterPlugin
44
38
  {
45
39
  PluginTask task = config.loadConfig(PluginTask.class);
46
40
 
41
+ control.run(task.dump(), buildOutputSchema(task, inputSchema));
42
+ }
43
+
44
+ /**
45
+ * @param inputSchema
46
+ * @param task
47
+ * @return
48
+ */
49
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
47
50
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
48
51
  int i = 0;
49
52
  if (task.getKeepInput()) {
@@ -66,103 +69,12 @@ public class Icu4jFilterPlugin implements FilterPlugin
66
69
  }
67
70
  }
68
71
  Schema outputSchema = new Schema(builder.build());
69
- control.run(task.dump(), outputSchema);
72
+ return outputSchema;
70
73
  }
71
74
 
72
75
  @Override
73
76
  public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
74
77
  {
75
- final PluginTask task = taskSource.loadTask(PluginTask.class);
76
- final List<Column> keyNameColumns = Lists.newArrayList();
77
- for (String keyName : task.getKeyNames()) {
78
- keyNameColumns.add(inputSchema.lookupColumn(keyName));
79
- }
80
- final List<List<Transliterator>> transliterators = Lists.newArrayList();
81
- for (Map<String, String> setting : task.getSettings()) {
82
- List<Transliterator> tokenizers = Lists.newArrayList();
83
- for (String convertType : setting.get("transliterators").split(",")) {
84
- Transliterator transliterator = Transliterator.getInstance(convertType);
85
- tokenizers.add(transliterator);
86
- }
87
- transliterators.add(tokenizers);
88
- }
89
-
90
- return new PageOutput() {
91
- private PageReader reader = new PageReader(inputSchema);
92
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
93
-
94
- @Override
95
- public void finish() {
96
- builder.finish();
97
- }
98
-
99
- @Override
100
- public void close() {
101
- builder.close();
102
- }
103
-
104
- @Override
105
- public void add(Page page) {
106
- reader.setPage(page);
107
- while (reader.nextRecord()) {
108
- if (task.getKeepInput()) {
109
- for (Column inputColumn: inputSchema.getColumns()) {
110
- if (reader.isNull(inputColumn)) {
111
- builder.setNull(inputColumn);
112
- continue;
113
- }
114
- if (Types.STRING.equals(inputColumn.getType())) {
115
- builder.setString(inputColumn, reader.getString(inputColumn));
116
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
117
- builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
118
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
119
- builder.setDouble(inputColumn, reader.getDouble(inputColumn));
120
- } else if (Types.LONG.equals(inputColumn.getType())) {
121
- builder.setLong(inputColumn, reader.getLong(inputColumn));
122
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
123
- builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
124
- } else if (Types.JSON.equals(inputColumn.getType())) {
125
- builder.setJson(inputColumn, reader.getJson(inputColumn));
126
- }
127
- }
128
- }
129
-
130
- List<Map<String, String>> settings = task.getSettings();
131
- for (Column column : keyNameColumns) {
132
- for (int i = 0; i < settings.size(); i++) {
133
- Map<String, String> setting = settings.get(i);
134
- String suffix = setting.get("suffix");
135
- Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
136
- String convert = convert(column, suffix, setting.get("case"), transliterators.get(i));
137
- if (convert == null) {
138
- builder.setNull(outputColumn);
139
- } else {
140
- builder.setString(outputColumn, convert);
141
- }
142
- }
143
- }
144
- builder.addRecord();
145
- }
146
- }
147
-
148
- /**
149
- * @param column
150
- * @param suffix
151
- * @param type
152
- * @return
153
- */
154
- private String convert(Column column, String suffix, String type, List<Transliterator> transliterators) {
155
- String string = reader.getString(column);
156
- for (Transliterator transliterator : transliterators) {
157
- string = transliterator.transliterate(string);
158
- }
159
- if ("upper".equals(type)) {
160
- string = string.toUpperCase();
161
- } else if ("lower".equals(type)) {
162
- string = string.toLowerCase();
163
- }
164
- return string;
165
- }
166
- };
78
+ return new Icu4jPageOutput(taskSource, inputSchema, outputSchema, output);
167
79
  }
168
80
  }
@@ -0,0 +1,129 @@
1
+ package org.embulk.filter.icu4j;
2
+
3
+ import java.util.List;
4
+ import java.util.Map;
5
+
6
+ import org.embulk.config.TaskSource;
7
+ import org.embulk.filter.icu4j.Icu4jFilterPlugin.PluginTask;
8
+ import org.embulk.spi.Column;
9
+ import org.embulk.spi.Exec;
10
+ import org.embulk.spi.Page;
11
+ import org.embulk.spi.PageBuilder;
12
+ import org.embulk.spi.PageOutput;
13
+ import org.embulk.spi.PageReader;
14
+ import org.embulk.spi.Schema;
15
+ import org.embulk.spi.type.Types;
16
+ import org.slf4j.Logger;
17
+
18
+ import com.google.common.base.MoreObjects;
19
+ import com.google.common.collect.Lists;
20
+ import com.ibm.icu.text.Transliterator;
21
+
22
+ public class Icu4jPageOutput implements PageOutput
23
+ {
24
+ private final PluginTask task;
25
+ private final List<Column> keyNameColumns;
26
+ private final List<List<Transliterator>> transliteratorsList;
27
+ private final PageReader reader;
28
+ private final PageBuilder builder;
29
+ private final Schema inputSchema;
30
+ private final Schema outputSchema;
31
+ private static final Logger logger = Exec.getLogger(Icu4jFilterPlugin.class);
32
+
33
+ public Icu4jPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
34
+ this.task = taskSource.loadTask(PluginTask.class);
35
+ this.keyNameColumns = Lists.newArrayList();
36
+ this.transliteratorsList = Lists.newArrayList();
37
+ this.inputSchema = inputSchema;
38
+ this.outputSchema = outputSchema;
39
+
40
+ for (String keyName : task.getKeyNames()) {
41
+ keyNameColumns.add(inputSchema.lookupColumn(keyName));
42
+ }
43
+ for (Map<String, String> setting : task.getSettings()) {
44
+ List<Transliterator> tokenizers = Lists.newArrayList();
45
+ for (String convertType : setting.get("transliterators").split(",")) {
46
+ Transliterator transliterator = Transliterator.getInstance(convertType);
47
+ tokenizers.add(transliterator);
48
+ }
49
+ transliteratorsList.add(tokenizers);
50
+ }
51
+ reader = new PageReader(inputSchema);
52
+ builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
53
+ }
54
+
55
+ @Override
56
+ public void finish() {
57
+ builder.finish();
58
+ }
59
+
60
+ @Override
61
+ public void close() {
62
+ builder.close();
63
+ }
64
+
65
+ @Override
66
+ public void add(Page page) {
67
+ reader.setPage(page);
68
+ while (reader.nextRecord()) {
69
+ if (task.getKeepInput()) {
70
+ for (Column inputColumn: inputSchema.getColumns()) {
71
+ if (reader.isNull(inputColumn)) {
72
+ builder.setNull(inputColumn);
73
+ continue;
74
+ }
75
+ if (Types.STRING.equals(inputColumn.getType())) {
76
+ builder.setString(inputColumn, reader.getString(inputColumn));
77
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
78
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
79
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
80
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
81
+ } else if (Types.LONG.equals(inputColumn.getType())) {
82
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
83
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
84
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
85
+ } else if (Types.JSON.equals(inputColumn.getType())) {
86
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
87
+ }
88
+ }
89
+ }
90
+
91
+ List<Map<String, String>> settings = task.getSettings();
92
+ for (Column column : keyNameColumns) {
93
+ for (int i = 0; i < settings.size(); i++) {
94
+ Map<String, String> setting = settings.get(i);
95
+ String suffix = setting.get("suffix");
96
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
97
+ final String source = reader.getString(column);
98
+ final List<Transliterator> transliterators = transliteratorsList.get(i);
99
+ String converted = convert(source, suffix, setting.get("case"), transliterators);
100
+ logger.debug("before => [{}], after => [{}]", source, converted);
101
+ if (converted == null) {
102
+ builder.setNull(outputColumn);
103
+ } else {
104
+ builder.setString(outputColumn, converted);
105
+ }
106
+ }
107
+ }
108
+ builder.addRecord();
109
+ }
110
+ }
111
+
112
+ /**
113
+ * @param column
114
+ * @param suffix
115
+ * @param type
116
+ * @return
117
+ */
118
+ private String convert(String string, String suffix, String type, List<Transliterator> transliterators) {
119
+ for (Transliterator transliterator : transliterators) {
120
+ string = transliterator.transliterate(string);
121
+ }
122
+ if ("upper".equals(type)) {
123
+ string = string.toUpperCase();
124
+ } else if ("lower".equals(type)) {
125
+ string = string.toLowerCase();
126
+ }
127
+ return string;
128
+ }
129
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-icu4j
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-10 00:00:00.000000000 Z
11
+ date: 2016-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -38,7 +38,7 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: Icu4j filter plugin for Embulk. see http://site.icu-project.org/
41
+ description: Unicode normalize string value. see http://site.icu-project.org/
42
42
  email:
43
43
  - toyama0919@gmail.com
44
44
  executables: []
@@ -58,8 +58,9 @@ files:
58
58
  - gradlew.bat
59
59
  - lib/embulk/filter/icu4j.rb
60
60
  - src/main/java/org/embulk/filter/icu4j/Icu4jFilterPlugin.java
61
+ - src/main/java/org/embulk/filter/icu4j/Icu4jPageOutput.java
61
62
  - src/test/java/org/embulk/filter/icu4j/TestIcu4jFilterPlugin.java
62
- - classpath/embulk-filter-icu4j-0.3.0.jar
63
+ - classpath/embulk-filter-icu4j-0.3.1.jar
63
64
  - classpath/icu4j-56.1.jar
64
65
  homepage: https://github.com/toyama0919/embulk-filter-icu4j
65
66
  licenses:
@@ -84,6 +85,6 @@ rubyforge_project:
84
85
  rubygems_version: 2.1.9
85
86
  signing_key:
86
87
  specification_version: 4
87
- summary: Icu4j filter plugin for Embulk
88
+ summary: Unicode normalize string value.
88
89
  test_files: []
89
90
  has_rdoc: