embulk-filter-kuromoji 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fcf8d7ebe4ba3b82de6056dad530ed8254bf358
|
4
|
+
data.tar.gz: 1af3161a76c6f26429f56c09522f3329e1db3745
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38de81bb46e81d51ae79d809d862f5323ac1d6301c7802ef3a51b55a0a19df4c2ce333585672ee71e9eb11a5e9509045d08f527421fe748e670028848b6985f7
|
7
|
+
data.tar.gz: 32e55429d7045a2f62a8dac4cab4a44fa2a373e858c3bdcdb2044f810512ce2c2af1c7ffb314c6c65660658816aed408f4dd8e2d468344b02cc4e51eef918121
|
data/build.gradle
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
package org.embulk.filter.kuromoji;
|
2
2
|
|
3
|
-
import java.io.FileNotFoundException;
|
4
|
-
import java.io.IOException;
|
5
3
|
import java.util.List;
|
6
4
|
import java.util.Map;
|
7
5
|
|
@@ -13,28 +11,21 @@ import org.embulk.config.TaskSource;
|
|
13
11
|
import org.embulk.spi.Column;
|
14
12
|
import org.embulk.spi.Exec;
|
15
13
|
import org.embulk.spi.FilterPlugin;
|
16
|
-
import org.embulk.spi.Page;
|
17
|
-
import org.embulk.spi.PageBuilder;
|
18
14
|
import org.embulk.spi.PageOutput;
|
19
|
-
import org.embulk.spi.PageReader;
|
20
15
|
import org.embulk.spi.Schema;
|
21
16
|
import org.embulk.spi.type.Type;
|
22
17
|
import org.embulk.spi.type.Types;
|
23
|
-
import org.msgpack.value.Value;
|
24
|
-
import org.msgpack.value.ValueFactory;
|
18
|
+
import org.slf4j.Logger;
|
25
19
|
|
26
|
-
import com.atilika.kuromoji.ipadic.Token;
|
27
|
-
import com.atilika.kuromoji.ipadic.Tokenizer;
|
28
|
-
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
29
|
-
import com.google.common.base.Joiner;
|
30
20
|
import com.google.common.base.MoreObjects;
|
31
21
|
import com.google.common.base.Optional;
|
32
22
|
import com.google.common.collect.ImmutableList;
|
33
|
-
import com.google.common.collect.Lists;
|
34
23
|
import com.google.common.collect.Maps;
|
35
24
|
|
36
25
|
public class KuromojiFilterPlugin implements FilterPlugin
|
37
26
|
{
|
27
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
28
|
+
|
38
29
|
public interface PluginTask extends Task
|
39
30
|
{
|
40
31
|
@Config("key_names")
|
@@ -62,6 +53,34 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
62
53
|
{
|
63
54
|
PluginTask task = config.loadConfig(PluginTask.class);
|
64
55
|
|
56
|
+
Schema outputSchema = buildOutputSchema(task, inputSchema);
|
57
|
+
|
58
|
+
control.run(task.dump(), outputSchema);
|
59
|
+
}
|
60
|
+
|
61
|
+
@Override
|
62
|
+
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
63
|
+
{
|
64
|
+
return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
|
65
|
+
}
|
66
|
+
|
67
|
+
/**
|
68
|
+
* @param inputSchema
|
69
|
+
* @param task
|
70
|
+
* @return
|
71
|
+
*/
|
72
|
+
private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
|
73
|
+
final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
|
74
|
+
logger.debug("outputColumns => {}", outputColumns);
|
75
|
+
return new Schema(outputColumns);
|
76
|
+
}
|
77
|
+
|
78
|
+
/**
|
79
|
+
* @param task
|
80
|
+
* @param inputSchema
|
81
|
+
* @return
|
82
|
+
*/
|
83
|
+
private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
|
65
84
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
66
85
|
Map<String, Column> map = Maps.newHashMap();
|
67
86
|
int i = 0;
|
@@ -85,120 +104,6 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
85
104
|
final Column column = e.getValue();
|
86
105
|
builder.add(new Column(i++, column.getName(), column.getType()));
|
87
106
|
}
|
88
|
-
|
89
|
-
Schema outputSchema = new Schema(builder.build());
|
90
|
-
control.run(task.dump(), outputSchema);
|
91
|
-
}
|
92
|
-
|
93
|
-
@Override
|
94
|
-
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
95
|
-
{
|
96
|
-
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
97
|
-
|
98
|
-
Builder builder = new Tokenizer.Builder();
|
99
|
-
if (task.getDictionaryPath().isPresent()) {
|
100
|
-
try {
|
101
|
-
builder.userDictionary(task.getDictionaryPath().get());
|
102
|
-
} catch (FileNotFoundException e) {
|
103
|
-
e.printStackTrace();
|
104
|
-
} catch (IOException e) {
|
105
|
-
e.printStackTrace();
|
106
|
-
}
|
107
|
-
}
|
108
|
-
final Tokenizer tokenizer = builder.build();
|
109
|
-
final List<Column> keyNameColumns = Lists.newArrayList();
|
110
|
-
|
111
|
-
for (String keyName : task.getKeyNames()) {
|
112
|
-
keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
113
|
-
}
|
114
|
-
|
115
|
-
return new PageOutput() {
|
116
|
-
private PageReader reader = new PageReader(inputSchema);
|
117
|
-
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
118
|
-
|
119
|
-
@Override
|
120
|
-
public void finish() {
|
121
|
-
builder.finish();
|
122
|
-
}
|
123
|
-
|
124
|
-
@Override
|
125
|
-
public void close() {
|
126
|
-
builder.close();
|
127
|
-
}
|
128
|
-
|
129
|
-
@Override
|
130
|
-
public void add(Page page) {
|
131
|
-
reader.setPage(page);
|
132
|
-
while (reader.nextRecord()) {
|
133
|
-
setValue(builder);
|
134
|
-
builder.addRecord();
|
135
|
-
}
|
136
|
-
}
|
137
|
-
|
138
|
-
/**
|
139
|
-
* @param builder
|
140
|
-
*/
|
141
|
-
private void setValue(PageBuilder builder) {
|
142
|
-
if (task.getKeepInput()) {
|
143
|
-
for (Column inputColumn : inputSchema.getColumns()) {
|
144
|
-
if (reader.isNull(inputColumn)) {
|
145
|
-
builder.setNull(inputColumn);
|
146
|
-
continue;
|
147
|
-
}
|
148
|
-
if (Types.STRING.equals(inputColumn.getType())) {
|
149
|
-
builder.setString(inputColumn, reader.getString(inputColumn));
|
150
|
-
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
151
|
-
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
152
|
-
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
153
|
-
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
154
|
-
} else if (Types.LONG.equals(inputColumn.getType())) {
|
155
|
-
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
156
|
-
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
157
|
-
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
158
|
-
} else if (Types.JSON.equals(inputColumn.getType())) {
|
159
|
-
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
160
|
-
}
|
161
|
-
}
|
162
|
-
}
|
163
|
-
|
164
|
-
for (Column column : keyNameColumns) {
|
165
|
-
List<Token> tokens = tokenizer.tokenize(reader.getString(column));
|
166
|
-
for (Map<String, String> setting : task.getSettings()) {
|
167
|
-
String suffix = setting.get("suffix");
|
168
|
-
String method = setting.get("method");
|
169
|
-
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
170
|
-
List<Value> outputs = Lists.newArrayList();
|
171
|
-
for (Token token : tokens) {
|
172
|
-
if (!isOkPartsOfSpeech(token)) { continue; }
|
173
|
-
String word = null;
|
174
|
-
if ("base_form".equals(method)) {
|
175
|
-
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
176
|
-
} else if ("reading".equals(method)) {
|
177
|
-
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
178
|
-
} else if ("surface_form".equals(method)) {
|
179
|
-
word = token.getSurface();
|
180
|
-
}
|
181
|
-
outputs.add(ValueFactory.newString(word));
|
182
|
-
}
|
183
|
-
if (outputColumn.getType().equals(Types.STRING)) {
|
184
|
-
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
185
|
-
builder.setString(outputColumn, joiner.join(outputs));
|
186
|
-
} else if (outputColumn.getType().equals(Types.JSON)) {
|
187
|
-
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
188
|
-
}
|
189
|
-
}
|
190
|
-
}
|
191
|
-
}
|
192
|
-
|
193
|
-
private boolean isOkPartsOfSpeech(Token token) {
|
194
|
-
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
195
|
-
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
196
|
-
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
197
|
-
return true;
|
198
|
-
}
|
199
|
-
}
|
200
|
-
return false;
|
201
|
-
}
|
202
|
-
};
|
107
|
+
return builder.build();
|
203
108
|
}
|
204
109
|
}
|
@@ -0,0 +1,151 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
import java.io.FileNotFoundException;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.util.List;
|
6
|
+
import java.util.Map;
|
7
|
+
|
8
|
+
import org.embulk.config.TaskSource;
|
9
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
10
|
+
import org.embulk.spi.Column;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.embulk.spi.Page;
|
13
|
+
import org.embulk.spi.PageBuilder;
|
14
|
+
import org.embulk.spi.PageOutput;
|
15
|
+
import org.embulk.spi.PageReader;
|
16
|
+
import org.embulk.spi.Schema;
|
17
|
+
import org.embulk.spi.type.Types;
|
18
|
+
import org.msgpack.value.Value;
|
19
|
+
import org.msgpack.value.ValueFactory;
|
20
|
+
import org.slf4j.Logger;
|
21
|
+
|
22
|
+
import com.atilika.kuromoji.ipadic.Token;
|
23
|
+
import com.atilika.kuromoji.ipadic.Tokenizer;
|
24
|
+
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
25
|
+
import com.google.common.base.Joiner;
|
26
|
+
import com.google.common.base.MoreObjects;
|
27
|
+
import com.google.common.collect.Lists;
|
28
|
+
|
29
|
+
public class KuromojiPageOutput implements PageOutput
|
30
|
+
{
|
31
|
+
private final PluginTask task;
|
32
|
+
private final Tokenizer tokenizer;
|
33
|
+
private final List<Column> keyNameColumns;
|
34
|
+
private final PageReader reader;
|
35
|
+
private final PageBuilder builder;
|
36
|
+
private final Schema inputSchema;
|
37
|
+
private final Schema outputSchema;
|
38
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
39
|
+
|
40
|
+
public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
41
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
42
|
+
this.inputSchema = inputSchema;
|
43
|
+
this.outputSchema = outputSchema;
|
44
|
+
|
45
|
+
Builder builder = new Tokenizer.Builder();
|
46
|
+
if (task.getDictionaryPath().isPresent()) {
|
47
|
+
try {
|
48
|
+
builder.userDictionary(task.getDictionaryPath().get());
|
49
|
+
} catch (FileNotFoundException e) {
|
50
|
+
e.printStackTrace();
|
51
|
+
} catch (IOException e) {
|
52
|
+
e.printStackTrace();
|
53
|
+
}
|
54
|
+
}
|
55
|
+
this.tokenizer = builder.build();
|
56
|
+
this.keyNameColumns = Lists.newArrayList();
|
57
|
+
|
58
|
+
for (String keyName : task.getKeyNames()) {
|
59
|
+
this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
60
|
+
}
|
61
|
+
this.reader = new PageReader(inputSchema);
|
62
|
+
this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
63
|
+
}
|
64
|
+
|
65
|
+
@Override
|
66
|
+
public void finish() {
|
67
|
+
builder.finish();
|
68
|
+
}
|
69
|
+
|
70
|
+
@Override
|
71
|
+
public void close() {
|
72
|
+
builder.close();
|
73
|
+
}
|
74
|
+
|
75
|
+
@Override
|
76
|
+
public void add(Page page) {
|
77
|
+
reader.setPage(page);
|
78
|
+
while (reader.nextRecord()) {
|
79
|
+
setValue(builder);
|
80
|
+
builder.addRecord();
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
/**
|
85
|
+
* @param builder
|
86
|
+
*/
|
87
|
+
private void setValue(PageBuilder builder) {
|
88
|
+
if (task.getKeepInput()) {
|
89
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
90
|
+
if (reader.isNull(inputColumn)) {
|
91
|
+
builder.setNull(inputColumn);
|
92
|
+
continue;
|
93
|
+
}
|
94
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
95
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
96
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
97
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
98
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
99
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
100
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
101
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
102
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
103
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
104
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
105
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
for (Column column : keyNameColumns) {
|
111
|
+
final String source = reader.getString(column);
|
112
|
+
List<Token> tokens = tokenizer.tokenize(source);
|
113
|
+
logger.debug("{} => {}", source, tokens);
|
114
|
+
for (Map<String, String> setting : task.getSettings()) {
|
115
|
+
String suffix = setting.get("suffix");
|
116
|
+
String method = setting.get("method");
|
117
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
118
|
+
List<Value> outputs = Lists.newArrayList();
|
119
|
+
for (Token token : tokens) {
|
120
|
+
logger.debug("token => {}, {}", token, token.getAllFeatures());
|
121
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
122
|
+
String word = null;
|
123
|
+
if ("base_form".equals(method)) {
|
124
|
+
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
125
|
+
} else if ("reading".equals(method)) {
|
126
|
+
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
127
|
+
} else if ("surface_form".equals(method)) {
|
128
|
+
word = token.getSurface();
|
129
|
+
}
|
130
|
+
outputs.add(ValueFactory.newString(word));
|
131
|
+
}
|
132
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
133
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
134
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
135
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
136
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
137
|
+
}
|
138
|
+
}
|
139
|
+
}
|
140
|
+
}
|
141
|
+
|
142
|
+
private boolean isOkPartsOfSpeech(Token token) {
|
143
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
144
|
+
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
145
|
+
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
146
|
+
return true;
|
147
|
+
}
|
148
|
+
}
|
149
|
+
return false;
|
150
|
+
}
|
151
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,8 +57,9 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/filter/kuromoji.rb
|
59
59
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
60
|
+
- src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
|
60
61
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
61
|
-
- classpath/embulk-filter-kuromoji-0.3.1.jar
|
62
|
+
- classpath/embulk-filter-kuromoji-0.3.2.jar
|
62
63
|
- classpath/kuromoji-core-0.9.0.jar
|
63
64
|
- classpath/kuromoji-ipadic-0.9.0.jar
|
64
65
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|