embulk-filter-kuromoji 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 39f5fef3dee7c66100e57b525ffa310c693cbad6
4
- data.tar.gz: 40bc8e9a948364242d091ceb84e9b10b8ec57392
3
+ metadata.gz: 2fcf8d7ebe4ba3b82de6056dad530ed8254bf358
4
+ data.tar.gz: 1af3161a76c6f26429f56c09522f3329e1db3745
5
5
  SHA512:
6
- metadata.gz: af95448ce60356db7f65ad3d27c5fbb6441a46d0dc18df64e47f527ccc2fb92052acbb21ce292e94ab26f6b060f9ac0e4c60976a9764d85681f86b6b70bbf0d6
7
- data.tar.gz: 4d0f23b0bee7ea2aff3cdc2785a2bfb4102aa9b390976902f2d0405e6c17ae6dc9cccffbc9efb3043fd97e7f8410f787158f55b6dd887effb8e6f874df5c532b
6
+ metadata.gz: 38de81bb46e81d51ae79d809d862f5323ac1d6301c7802ef3a51b55a0a19df4c2ce333585672ee71e9eb11a5e9509045d08f527421fe748e670028848b6985f7
7
+ data.tar.gz: 32e55429d7045a2f62a8dac4cab4a44fa2a373e858c3bdcdb2044f810512ce2c2af1c7ffb314c6c65660658816aed408f4dd8e2d468344b02cc4e51eef918121
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.3.1"
17
+ version = "0.3.2"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java CHANGED
@@ -1,7 +1,5 @@
1
1
  package org.embulk.filter.kuromoji;
2
2
 
3
- import java.io.FileNotFoundException;
4
- import java.io.IOException;
5
3
  import java.util.List;
6
4
  import java.util.Map;
7
5
 
@@ -13,28 +11,21 @@ import org.embulk.config.TaskSource;
13
11
  import org.embulk.spi.Column;
14
12
  import org.embulk.spi.Exec;
15
13
  import org.embulk.spi.FilterPlugin;
16
- import org.embulk.spi.Page;
17
- import org.embulk.spi.PageBuilder;
18
14
  import org.embulk.spi.PageOutput;
19
- import org.embulk.spi.PageReader;
20
15
  import org.embulk.spi.Schema;
21
16
  import org.embulk.spi.type.Type;
22
17
  import org.embulk.spi.type.Types;
23
- import org.msgpack.value.Value;
24
- import org.msgpack.value.ValueFactory;
18
+ import org.slf4j.Logger;
25
19
 
26
- import com.atilika.kuromoji.ipadic.Token;
27
- import com.atilika.kuromoji.ipadic.Tokenizer;
28
- import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
29
- import com.google.common.base.Joiner;
30
20
  import com.google.common.base.MoreObjects;
31
21
  import com.google.common.base.Optional;
32
22
  import com.google.common.collect.ImmutableList;
33
- import com.google.common.collect.Lists;
34
23
  import com.google.common.collect.Maps;
35
24
 
36
25
  public class KuromojiFilterPlugin implements FilterPlugin
37
26
  {
27
+ private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
28
+
38
29
  public interface PluginTask extends Task
39
30
  {
40
31
  @Config("key_names")
@@ -62,6 +53,34 @@ public class KuromojiFilterPlugin implements FilterPlugin
62
53
  {
63
54
  PluginTask task = config.loadConfig(PluginTask.class);
64
55
 
56
+ Schema outputSchema = buildOutputSchema(task, inputSchema);
57
+
58
+ control.run(task.dump(), outputSchema);
59
+ }
60
+
61
+ @Override
62
+ public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
63
+ {
64
+ return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
65
+ }
66
+
67
+ /**
68
+ * @param inputSchema
69
+ * @param task
70
+ * @return
71
+ */
72
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
73
+ final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
74
+ logger.debug("outputColumns => {}", outputColumns);
75
+ return new Schema(outputColumns);
76
+ }
77
+
78
+ /**
79
+ * @param task
80
+ * @param inputSchema
81
+ * @return
82
+ */
83
+ private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
65
84
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
66
85
  Map<String, Column> map = Maps.newHashMap();
67
86
  int i = 0;
@@ -85,120 +104,6 @@ public class KuromojiFilterPlugin implements FilterPlugin
85
104
  final Column column = e.getValue();
86
105
  builder.add(new Column(i++, column.getName(), column.getType()));
87
106
  }
88
-
89
- Schema outputSchema = new Schema(builder.build());
90
- control.run(task.dump(), outputSchema);
91
- }
92
-
93
- @Override
94
- public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
95
- {
96
- final PluginTask task = taskSource.loadTask(PluginTask.class);
97
-
98
- Builder builder = new Tokenizer.Builder();
99
- if (task.getDictionaryPath().isPresent()) {
100
- try {
101
- builder.userDictionary(task.getDictionaryPath().get());
102
- } catch (FileNotFoundException e) {
103
- e.printStackTrace();
104
- } catch (IOException e) {
105
- e.printStackTrace();
106
- }
107
- }
108
- final Tokenizer tokenizer = builder.build();
109
- final List<Column> keyNameColumns = Lists.newArrayList();
110
-
111
- for (String keyName : task.getKeyNames()) {
112
- keyNameColumns.add(outputSchema.lookupColumn(keyName));
113
- }
114
-
115
- return new PageOutput() {
116
- private PageReader reader = new PageReader(inputSchema);
117
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
118
-
119
- @Override
120
- public void finish() {
121
- builder.finish();
122
- }
123
-
124
- @Override
125
- public void close() {
126
- builder.close();
127
- }
128
-
129
- @Override
130
- public void add(Page page) {
131
- reader.setPage(page);
132
- while (reader.nextRecord()) {
133
- setValue(builder);
134
- builder.addRecord();
135
- }
136
- }
137
-
138
- /**
139
- * @param builder
140
- */
141
- private void setValue(PageBuilder builder) {
142
- if (task.getKeepInput()) {
143
- for (Column inputColumn : inputSchema.getColumns()) {
144
- if (reader.isNull(inputColumn)) {
145
- builder.setNull(inputColumn);
146
- continue;
147
- }
148
- if (Types.STRING.equals(inputColumn.getType())) {
149
- builder.setString(inputColumn, reader.getString(inputColumn));
150
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
151
- builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
152
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
153
- builder.setDouble(inputColumn, reader.getDouble(inputColumn));
154
- } else if (Types.LONG.equals(inputColumn.getType())) {
155
- builder.setLong(inputColumn, reader.getLong(inputColumn));
156
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
157
- builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
158
- } else if (Types.JSON.equals(inputColumn.getType())) {
159
- builder.setJson(inputColumn, reader.getJson(inputColumn));
160
- }
161
- }
162
- }
163
-
164
- for (Column column : keyNameColumns) {
165
- List<Token> tokens = tokenizer.tokenize(reader.getString(column));
166
- for (Map<String, String> setting : task.getSettings()) {
167
- String suffix = setting.get("suffix");
168
- String method = setting.get("method");
169
- Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
170
- List<Value> outputs = Lists.newArrayList();
171
- for (Token token : tokens) {
172
- if (!isOkPartsOfSpeech(token)) { continue; }
173
- String word = null;
174
- if ("base_form".equals(method)) {
175
- word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
176
- } else if ("reading".equals(method)) {
177
- word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
178
- } else if ("surface_form".equals(method)) {
179
- word = token.getSurface();
180
- }
181
- outputs.add(ValueFactory.newString(word));
182
- }
183
- if (outputColumn.getType().equals(Types.STRING)) {
184
- Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
185
- builder.setString(outputColumn, joiner.join(outputs));
186
- } else if (outputColumn.getType().equals(Types.JSON)) {
187
- builder.setJson(outputColumn, ValueFactory.newArray(outputs));
188
- }
189
- }
190
- }
191
- }
192
-
193
- private boolean isOkPartsOfSpeech(Token token) {
194
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
195
- for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
196
- if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
197
- return true;
198
- }
199
- }
200
- return false;
201
- }
202
- };
107
+ return builder.build();
203
108
  }
204
109
  }
data/src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java ADDED
@@ -0,0 +1,151 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ import java.io.FileNotFoundException;
4
+ import java.io.IOException;
5
+ import java.util.List;
6
+ import java.util.Map;
7
+
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
10
+ import org.embulk.spi.Column;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.Page;
13
+ import org.embulk.spi.PageBuilder;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.PageReader;
16
+ import org.embulk.spi.Schema;
17
+ import org.embulk.spi.type.Types;
18
+ import org.msgpack.value.Value;
19
+ import org.msgpack.value.ValueFactory;
20
+ import org.slf4j.Logger;
21
+
22
+ import com.atilika.kuromoji.ipadic.Token;
23
+ import com.atilika.kuromoji.ipadic.Tokenizer;
24
+ import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
25
+ import com.google.common.base.Joiner;
26
+ import com.google.common.base.MoreObjects;
27
+ import com.google.common.collect.Lists;
28
+
29
+ public class KuromojiPageOutput implements PageOutput
30
+ {
31
+ private final PluginTask task;
32
+ private final Tokenizer tokenizer;
33
+ private final List<Column> keyNameColumns;
34
+ private final PageReader reader;
35
+ private final PageBuilder builder;
36
+ private final Schema inputSchema;
37
+ private final Schema outputSchema;
38
+ private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
39
+
40
+ public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
41
+ this.task = taskSource.loadTask(PluginTask.class);
42
+ this.inputSchema = inputSchema;
43
+ this.outputSchema = outputSchema;
44
+
45
+ Builder builder = new Tokenizer.Builder();
46
+ if (task.getDictionaryPath().isPresent()) {
47
+ try {
48
+ builder.userDictionary(task.getDictionaryPath().get());
49
+ } catch (FileNotFoundException e) {
50
+ e.printStackTrace();
51
+ } catch (IOException e) {
52
+ e.printStackTrace();
53
+ }
54
+ }
55
+ this.tokenizer = builder.build();
56
+ this.keyNameColumns = Lists.newArrayList();
57
+
58
+ for (String keyName : task.getKeyNames()) {
59
+ this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
60
+ }
61
+ this.reader = new PageReader(inputSchema);
62
+ this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
63
+ }
64
+
65
+ @Override
66
+ public void finish() {
67
+ builder.finish();
68
+ }
69
+
70
+ @Override
71
+ public void close() {
72
+ builder.close();
73
+ }
74
+
75
+ @Override
76
+ public void add(Page page) {
77
+ reader.setPage(page);
78
+ while (reader.nextRecord()) {
79
+ setValue(builder);
80
+ builder.addRecord();
81
+ }
82
+ }
83
+
84
+ /**
85
+ * @param builder
86
+ */
87
+ private void setValue(PageBuilder builder) {
88
+ if (task.getKeepInput()) {
89
+ for (Column inputColumn : inputSchema.getColumns()) {
90
+ if (reader.isNull(inputColumn)) {
91
+ builder.setNull(inputColumn);
92
+ continue;
93
+ }
94
+ if (Types.STRING.equals(inputColumn.getType())) {
95
+ builder.setString(inputColumn, reader.getString(inputColumn));
96
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
97
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
98
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
99
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
100
+ } else if (Types.LONG.equals(inputColumn.getType())) {
101
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
102
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
103
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
104
+ } else if (Types.JSON.equals(inputColumn.getType())) {
105
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
106
+ }
107
+ }
108
+ }
109
+
110
+ for (Column column : keyNameColumns) {
111
+ final String source = reader.getString(column);
112
+ List<Token> tokens = tokenizer.tokenize(source);
113
+ logger.debug("{} => {}", source, tokens);
114
+ for (Map<String, String> setting : task.getSettings()) {
115
+ String suffix = setting.get("suffix");
116
+ String method = setting.get("method");
117
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
118
+ List<Value> outputs = Lists.newArrayList();
119
+ for (Token token : tokens) {
120
+ logger.debug("token => {}, {}", token, token.getAllFeatures());
121
+ if (!isOkPartsOfSpeech(token)) { continue; }
122
+ String word = null;
123
+ if ("base_form".equals(method)) {
124
+ word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
125
+ } else if ("reading".equals(method)) {
126
+ word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
127
+ } else if ("surface_form".equals(method)) {
128
+ word = token.getSurface();
129
+ }
130
+ outputs.add(ValueFactory.newString(word));
131
+ }
132
+ if (outputColumn.getType().equals(Types.STRING)) {
133
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
134
+ builder.setString(outputColumn, joiner.join(outputs));
135
+ } else if (outputColumn.getType().equals(Types.JSON)) {
136
+ builder.setJson(outputColumn, ValueFactory.newArray(outputs));
137
+ }
138
+ }
139
+ }
140
+ }
141
+
142
+ private boolean isOkPartsOfSpeech(Token token) {
143
+ if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
144
+ for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
145
+ if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
146
+ return true;
147
+ }
148
+ }
149
+ return false;
150
+ }
151
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-13 00:00:00.000000000 Z
11
+ date: 2016-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -57,8 +57,9 @@ files:
57
57
  - gradlew.bat
58
58
  - lib/embulk/filter/kuromoji.rb
59
59
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
60
+ - src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
60
61
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
61
- - classpath/embulk-filter-kuromoji-0.3.1.jar
62
+ - classpath/embulk-filter-kuromoji-0.3.2.jar
62
63
  - classpath/kuromoji-core-0.9.0.jar
63
64
  - classpath/kuromoji-ipadic-0.9.0.jar
64
65
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji