embulk-filter-kuromoji 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 39f5fef3dee7c66100e57b525ffa310c693cbad6
4
- data.tar.gz: 40bc8e9a948364242d091ceb84e9b10b8ec57392
3
+ metadata.gz: 2fcf8d7ebe4ba3b82de6056dad530ed8254bf358
4
+ data.tar.gz: 1af3161a76c6f26429f56c09522f3329e1db3745
5
5
  SHA512:
6
- metadata.gz: af95448ce60356db7f65ad3d27c5fbb6441a46d0dc18df64e47f527ccc2fb92052acbb21ce292e94ab26f6b060f9ac0e4c60976a9764d85681f86b6b70bbf0d6
7
- data.tar.gz: 4d0f23b0bee7ea2aff3cdc2785a2bfb4102aa9b390976902f2d0405e6c17ae6dc9cccffbc9efb3043fd97e7f8410f787158f55b6dd887effb8e6f874df5c532b
6
+ metadata.gz: 38de81bb46e81d51ae79d809d862f5323ac1d6301c7802ef3a51b55a0a19df4c2ce333585672ee71e9eb11a5e9509045d08f527421fe748e670028848b6985f7
7
+ data.tar.gz: 32e55429d7045a2f62a8dac4cab4a44fa2a373e858c3bdcdb2044f810512ce2c2af1c7ffb314c6c65660658816aed408f4dd8e2d468344b02cc4e51eef918121
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.3.1"
17
+ version = "0.3.2"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -1,7 +1,5 @@
1
1
  package org.embulk.filter.kuromoji;
2
2
 
3
- import java.io.FileNotFoundException;
4
- import java.io.IOException;
5
3
  import java.util.List;
6
4
  import java.util.Map;
7
5
 
@@ -13,28 +11,21 @@ import org.embulk.config.TaskSource;
13
11
  import org.embulk.spi.Column;
14
12
  import org.embulk.spi.Exec;
15
13
  import org.embulk.spi.FilterPlugin;
16
- import org.embulk.spi.Page;
17
- import org.embulk.spi.PageBuilder;
18
14
  import org.embulk.spi.PageOutput;
19
- import org.embulk.spi.PageReader;
20
15
  import org.embulk.spi.Schema;
21
16
  import org.embulk.spi.type.Type;
22
17
  import org.embulk.spi.type.Types;
23
- import org.msgpack.value.Value;
24
- import org.msgpack.value.ValueFactory;
18
+ import org.slf4j.Logger;
25
19
 
26
- import com.atilika.kuromoji.ipadic.Token;
27
- import com.atilika.kuromoji.ipadic.Tokenizer;
28
- import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
29
- import com.google.common.base.Joiner;
30
20
  import com.google.common.base.MoreObjects;
31
21
  import com.google.common.base.Optional;
32
22
  import com.google.common.collect.ImmutableList;
33
- import com.google.common.collect.Lists;
34
23
  import com.google.common.collect.Maps;
35
24
 
36
25
  public class KuromojiFilterPlugin implements FilterPlugin
37
26
  {
27
+ private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
28
+
38
29
  public interface PluginTask extends Task
39
30
  {
40
31
  @Config("key_names")
@@ -62,6 +53,34 @@ public class KuromojiFilterPlugin implements FilterPlugin
62
53
  {
63
54
  PluginTask task = config.loadConfig(PluginTask.class);
64
55
 
56
+ Schema outputSchema = buildOutputSchema(task, inputSchema);
57
+
58
+ control.run(task.dump(), outputSchema);
59
+ }
60
+
61
+ @Override
62
+ public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
63
+ {
64
+ return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
65
+ }
66
+
67
+ /**
68
+ * @param inputSchema
69
+ * @param task
70
+ * @return
71
+ */
72
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
73
+ final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
74
+ logger.debug("outputColumns => {}", outputColumns);
75
+ return new Schema(outputColumns);
76
+ }
77
+
78
+ /**
79
+ * @param task
80
+ * @param inputSchema
81
+ * @return
82
+ */
83
+ private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
65
84
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
66
85
  Map<String, Column> map = Maps.newHashMap();
67
86
  int i = 0;
@@ -85,120 +104,6 @@ public class KuromojiFilterPlugin implements FilterPlugin
85
104
  final Column column = e.getValue();
86
105
  builder.add(new Column(i++, column.getName(), column.getType()));
87
106
  }
88
-
89
- Schema outputSchema = new Schema(builder.build());
90
- control.run(task.dump(), outputSchema);
91
- }
92
-
93
- @Override
94
- public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
95
- {
96
- final PluginTask task = taskSource.loadTask(PluginTask.class);
97
-
98
- Builder builder = new Tokenizer.Builder();
99
- if (task.getDictionaryPath().isPresent()) {
100
- try {
101
- builder.userDictionary(task.getDictionaryPath().get());
102
- } catch (FileNotFoundException e) {
103
- e.printStackTrace();
104
- } catch (IOException e) {
105
- e.printStackTrace();
106
- }
107
- }
108
- final Tokenizer tokenizer = builder.build();
109
- final List<Column> keyNameColumns = Lists.newArrayList();
110
-
111
- for (String keyName : task.getKeyNames()) {
112
- keyNameColumns.add(outputSchema.lookupColumn(keyName));
113
- }
114
-
115
- return new PageOutput() {
116
- private PageReader reader = new PageReader(inputSchema);
117
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
118
-
119
- @Override
120
- public void finish() {
121
- builder.finish();
122
- }
123
-
124
- @Override
125
- public void close() {
126
- builder.close();
127
- }
128
-
129
- @Override
130
- public void add(Page page) {
131
- reader.setPage(page);
132
- while (reader.nextRecord()) {
133
- setValue(builder);
134
- builder.addRecord();
135
- }
136
- }
137
-
138
- /**
139
- * @param builder
140
- */
141
- private void setValue(PageBuilder builder) {
142
- if (task.getKeepInput()) {
143
- for (Column inputColumn : inputSchema.getColumns()) {
144
- if (reader.isNull(inputColumn)) {
145
- builder.setNull(inputColumn);
146
- continue;
147
- }
148
- if (Types.STRING.equals(inputColumn.getType())) {
149
- builder.setString(inputColumn, reader.getString(inputColumn));
150
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
151
- builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
152
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
153
- builder.setDouble(inputColumn, reader.getDouble(inputColumn));
154
- } else if (Types.LONG.equals(inputColumn.getType())) {
155
- builder.setLong(inputColumn, reader.getLong(inputColumn));
156
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
157
- builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
158
- } else if (Types.JSON.equals(inputColumn.getType())) {
159
- builder.setJson(inputColumn, reader.getJson(inputColumn));
160
- }
161
- }
162
- }
163
-
164
- for (Column column : keyNameColumns) {
165
- List<Token> tokens = tokenizer.tokenize(reader.getString(column));
166
- for (Map<String, String> setting : task.getSettings()) {
167
- String suffix = setting.get("suffix");
168
- String method = setting.get("method");
169
- Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
170
- List<Value> outputs = Lists.newArrayList();
171
- for (Token token : tokens) {
172
- if (!isOkPartsOfSpeech(token)) { continue; }
173
- String word = null;
174
- if ("base_form".equals(method)) {
175
- word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
176
- } else if ("reading".equals(method)) {
177
- word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
178
- } else if ("surface_form".equals(method)) {
179
- word = token.getSurface();
180
- }
181
- outputs.add(ValueFactory.newString(word));
182
- }
183
- if (outputColumn.getType().equals(Types.STRING)) {
184
- Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
185
- builder.setString(outputColumn, joiner.join(outputs));
186
- } else if (outputColumn.getType().equals(Types.JSON)) {
187
- builder.setJson(outputColumn, ValueFactory.newArray(outputs));
188
- }
189
- }
190
- }
191
- }
192
-
193
- private boolean isOkPartsOfSpeech(Token token) {
194
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
195
- for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
196
- if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
197
- return true;
198
- }
199
- }
200
- return false;
201
- }
202
- };
107
+ return builder.build();
203
108
  }
204
109
  }
@@ -0,0 +1,151 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ import java.io.FileNotFoundException;
4
+ import java.io.IOException;
5
+ import java.util.List;
6
+ import java.util.Map;
7
+
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
10
+ import org.embulk.spi.Column;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.Page;
13
+ import org.embulk.spi.PageBuilder;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.PageReader;
16
+ import org.embulk.spi.Schema;
17
+ import org.embulk.spi.type.Types;
18
+ import org.msgpack.value.Value;
19
+ import org.msgpack.value.ValueFactory;
20
+ import org.slf4j.Logger;
21
+
22
+ import com.atilika.kuromoji.ipadic.Token;
23
+ import com.atilika.kuromoji.ipadic.Tokenizer;
24
+ import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
25
+ import com.google.common.base.Joiner;
26
+ import com.google.common.base.MoreObjects;
27
+ import com.google.common.collect.Lists;
28
+
29
+ public class KuromojiPageOutput implements PageOutput
30
+ {
31
+ private final PluginTask task;
32
+ private final Tokenizer tokenizer;
33
+ private final List<Column> keyNameColumns;
34
+ private final PageReader reader;
35
+ private final PageBuilder builder;
36
+ private final Schema inputSchema;
37
+ private final Schema outputSchema;
38
+ private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
39
+
40
+ public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
41
+ this.task = taskSource.loadTask(PluginTask.class);
42
+ this.inputSchema = inputSchema;
43
+ this.outputSchema = outputSchema;
44
+
45
+ Builder builder = new Tokenizer.Builder();
46
+ if (task.getDictionaryPath().isPresent()) {
47
+ try {
48
+ builder.userDictionary(task.getDictionaryPath().get());
49
+ } catch (FileNotFoundException e) {
50
+ e.printStackTrace();
51
+ } catch (IOException e) {
52
+ e.printStackTrace();
53
+ }
54
+ }
55
+ this.tokenizer = builder.build();
56
+ this.keyNameColumns = Lists.newArrayList();
57
+
58
+ for (String keyName : task.getKeyNames()) {
59
+ this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
60
+ }
61
+ this.reader = new PageReader(inputSchema);
62
+ this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
63
+ }
64
+
65
+ @Override
66
+ public void finish() {
67
+ builder.finish();
68
+ }
69
+
70
+ @Override
71
+ public void close() {
72
+ builder.close();
73
+ }
74
+
75
+ @Override
76
+ public void add(Page page) {
77
+ reader.setPage(page);
78
+ while (reader.nextRecord()) {
79
+ setValue(builder);
80
+ builder.addRecord();
81
+ }
82
+ }
83
+
84
+ /**
85
+ * @param builder
86
+ */
87
+ private void setValue(PageBuilder builder) {
88
+ if (task.getKeepInput()) {
89
+ for (Column inputColumn : inputSchema.getColumns()) {
90
+ if (reader.isNull(inputColumn)) {
91
+ builder.setNull(inputColumn);
92
+ continue;
93
+ }
94
+ if (Types.STRING.equals(inputColumn.getType())) {
95
+ builder.setString(inputColumn, reader.getString(inputColumn));
96
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
97
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
98
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
99
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
100
+ } else if (Types.LONG.equals(inputColumn.getType())) {
101
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
102
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
103
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
104
+ } else if (Types.JSON.equals(inputColumn.getType())) {
105
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
106
+ }
107
+ }
108
+ }
109
+
110
+ for (Column column : keyNameColumns) {
111
+ final String source = reader.getString(column);
112
+ List<Token> tokens = tokenizer.tokenize(source);
113
+ logger.debug("{} => {}", source, tokens);
114
+ for (Map<String, String> setting : task.getSettings()) {
115
+ String suffix = setting.get("suffix");
116
+ String method = setting.get("method");
117
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
118
+ List<Value> outputs = Lists.newArrayList();
119
+ for (Token token : tokens) {
120
+ logger.debug("token => {}, {}", token, token.getAllFeatures());
121
+ if (!isOkPartsOfSpeech(token)) { continue; }
122
+ String word = null;
123
+ if ("base_form".equals(method)) {
124
+ word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
125
+ } else if ("reading".equals(method)) {
126
+ word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
127
+ } else if ("surface_form".equals(method)) {
128
+ word = token.getSurface();
129
+ }
130
+ outputs.add(ValueFactory.newString(word));
131
+ }
132
+ if (outputColumn.getType().equals(Types.STRING)) {
133
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
134
+ builder.setString(outputColumn, joiner.join(outputs));
135
+ } else if (outputColumn.getType().equals(Types.JSON)) {
136
+ builder.setJson(outputColumn, ValueFactory.newArray(outputs));
137
+ }
138
+ }
139
+ }
140
+ }
141
+
142
+ private boolean isOkPartsOfSpeech(Token token) {
143
+ if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
144
+ for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
145
+ if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
146
+ return true;
147
+ }
148
+ }
149
+ return false;
150
+ }
151
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-13 00:00:00.000000000 Z
11
+ date: 2016-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -57,8 +57,9 @@ files:
57
57
  - gradlew.bat
58
58
  - lib/embulk/filter/kuromoji.rb
59
59
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
60
+ - src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
60
61
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
61
- - classpath/embulk-filter-kuromoji-0.3.1.jar
62
+ - classpath/embulk-filter-kuromoji-0.3.2.jar
62
63
  - classpath/kuromoji-core-0.9.0.jar
63
64
  - classpath/kuromoji-ipadic-0.9.0.jar
64
65
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji