embulk-filter-kuromoji 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fcf8d7ebe4ba3b82de6056dad530ed8254bf358
|
4
|
+
data.tar.gz: 1af3161a76c6f26429f56c09522f3329e1db3745
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38de81bb46e81d51ae79d809d862f5323ac1d6301c7802ef3a51b55a0a19df4c2ce333585672ee71e9eb11a5e9509045d08f527421fe748e670028848b6985f7
|
7
|
+
data.tar.gz: 32e55429d7045a2f62a8dac4cab4a44fa2a373e858c3bdcdb2044f810512ce2c2af1c7ffb314c6c65660658816aed408f4dd8e2d468344b02cc4e51eef918121
|
data/build.gradle
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
package org.embulk.filter.kuromoji;
|
2
2
|
|
3
|
-
import java.io.FileNotFoundException;
|
4
|
-
import java.io.IOException;
|
5
3
|
import java.util.List;
|
6
4
|
import java.util.Map;
|
7
5
|
|
@@ -13,28 +11,21 @@ import org.embulk.config.TaskSource;
|
|
13
11
|
import org.embulk.spi.Column;
|
14
12
|
import org.embulk.spi.Exec;
|
15
13
|
import org.embulk.spi.FilterPlugin;
|
16
|
-
import org.embulk.spi.Page;
|
17
|
-
import org.embulk.spi.PageBuilder;
|
18
14
|
import org.embulk.spi.PageOutput;
|
19
|
-
import org.embulk.spi.PageReader;
|
20
15
|
import org.embulk.spi.Schema;
|
21
16
|
import org.embulk.spi.type.Type;
|
22
17
|
import org.embulk.spi.type.Types;
|
23
|
-
import org.msgpack.value.Value;
|
24
|
-
import org.msgpack.value.ValueFactory;
|
18
|
+
import org.slf4j.Logger;
|
25
19
|
|
26
|
-
import com.atilika.kuromoji.ipadic.Token;
|
27
|
-
import com.atilika.kuromoji.ipadic.Tokenizer;
|
28
|
-
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
29
|
-
import com.google.common.base.Joiner;
|
30
20
|
import com.google.common.base.MoreObjects;
|
31
21
|
import com.google.common.base.Optional;
|
32
22
|
import com.google.common.collect.ImmutableList;
|
33
|
-
import com.google.common.collect.Lists;
|
34
23
|
import com.google.common.collect.Maps;
|
35
24
|
|
36
25
|
public class KuromojiFilterPlugin implements FilterPlugin
|
37
26
|
{
|
27
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
28
|
+
|
38
29
|
public interface PluginTask extends Task
|
39
30
|
{
|
40
31
|
@Config("key_names")
|
@@ -62,6 +53,34 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
62
53
|
{
|
63
54
|
PluginTask task = config.loadConfig(PluginTask.class);
|
64
55
|
|
56
|
+
Schema outputSchema = buildOutputSchema(task, inputSchema);
|
57
|
+
|
58
|
+
control.run(task.dump(), outputSchema);
|
59
|
+
}
|
60
|
+
|
61
|
+
@Override
|
62
|
+
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
63
|
+
{
|
64
|
+
return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
|
65
|
+
}
|
66
|
+
|
67
|
+
/**
|
68
|
+
* @param inputSchema
|
69
|
+
* @param task
|
70
|
+
* @return
|
71
|
+
*/
|
72
|
+
private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
|
73
|
+
final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
|
74
|
+
logger.debug("outputColumns => {}", outputColumns);
|
75
|
+
return new Schema(outputColumns);
|
76
|
+
}
|
77
|
+
|
78
|
+
/**
|
79
|
+
* @param task
|
80
|
+
* @param inputSchema
|
81
|
+
* @return
|
82
|
+
*/
|
83
|
+
private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
|
65
84
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
66
85
|
Map<String, Column> map = Maps.newHashMap();
|
67
86
|
int i = 0;
|
@@ -85,120 +104,6 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
85
104
|
final Column column = e.getValue();
|
86
105
|
builder.add(new Column(i++, column.getName(), column.getType()));
|
87
106
|
}
|
88
|
-
|
89
|
-
Schema outputSchema = new Schema(builder.build());
|
90
|
-
control.run(task.dump(), outputSchema);
|
91
|
-
}
|
92
|
-
|
93
|
-
@Override
|
94
|
-
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
95
|
-
{
|
96
|
-
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
97
|
-
|
98
|
-
Builder builder = new Tokenizer.Builder();
|
99
|
-
if (task.getDictionaryPath().isPresent()) {
|
100
|
-
try {
|
101
|
-
builder.userDictionary(task.getDictionaryPath().get());
|
102
|
-
} catch (FileNotFoundException e) {
|
103
|
-
e.printStackTrace();
|
104
|
-
} catch (IOException e) {
|
105
|
-
e.printStackTrace();
|
106
|
-
}
|
107
|
-
}
|
108
|
-
final Tokenizer tokenizer = builder.build();
|
109
|
-
final List<Column> keyNameColumns = Lists.newArrayList();
|
110
|
-
|
111
|
-
for (String keyName : task.getKeyNames()) {
|
112
|
-
keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
113
|
-
}
|
114
|
-
|
115
|
-
return new PageOutput() {
|
116
|
-
private PageReader reader = new PageReader(inputSchema);
|
117
|
-
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
118
|
-
|
119
|
-
@Override
|
120
|
-
public void finish() {
|
121
|
-
builder.finish();
|
122
|
-
}
|
123
|
-
|
124
|
-
@Override
|
125
|
-
public void close() {
|
126
|
-
builder.close();
|
127
|
-
}
|
128
|
-
|
129
|
-
@Override
|
130
|
-
public void add(Page page) {
|
131
|
-
reader.setPage(page);
|
132
|
-
while (reader.nextRecord()) {
|
133
|
-
setValue(builder);
|
134
|
-
builder.addRecord();
|
135
|
-
}
|
136
|
-
}
|
137
|
-
|
138
|
-
/**
|
139
|
-
* @param builder
|
140
|
-
*/
|
141
|
-
private void setValue(PageBuilder builder) {
|
142
|
-
if (task.getKeepInput()) {
|
143
|
-
for (Column inputColumn : inputSchema.getColumns()) {
|
144
|
-
if (reader.isNull(inputColumn)) {
|
145
|
-
builder.setNull(inputColumn);
|
146
|
-
continue;
|
147
|
-
}
|
148
|
-
if (Types.STRING.equals(inputColumn.getType())) {
|
149
|
-
builder.setString(inputColumn, reader.getString(inputColumn));
|
150
|
-
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
151
|
-
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
152
|
-
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
153
|
-
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
154
|
-
} else if (Types.LONG.equals(inputColumn.getType())) {
|
155
|
-
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
156
|
-
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
157
|
-
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
158
|
-
} else if (Types.JSON.equals(inputColumn.getType())) {
|
159
|
-
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
160
|
-
}
|
161
|
-
}
|
162
|
-
}
|
163
|
-
|
164
|
-
for (Column column : keyNameColumns) {
|
165
|
-
List<Token> tokens = tokenizer.tokenize(reader.getString(column));
|
166
|
-
for (Map<String, String> setting : task.getSettings()) {
|
167
|
-
String suffix = setting.get("suffix");
|
168
|
-
String method = setting.get("method");
|
169
|
-
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
170
|
-
List<Value> outputs = Lists.newArrayList();
|
171
|
-
for (Token token : tokens) {
|
172
|
-
if (!isOkPartsOfSpeech(token)) { continue; }
|
173
|
-
String word = null;
|
174
|
-
if ("base_form".equals(method)) {
|
175
|
-
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
176
|
-
} else if ("reading".equals(method)) {
|
177
|
-
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
178
|
-
} else if ("surface_form".equals(method)) {
|
179
|
-
word = token.getSurface();
|
180
|
-
}
|
181
|
-
outputs.add(ValueFactory.newString(word));
|
182
|
-
}
|
183
|
-
if (outputColumn.getType().equals(Types.STRING)) {
|
184
|
-
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
185
|
-
builder.setString(outputColumn, joiner.join(outputs));
|
186
|
-
} else if (outputColumn.getType().equals(Types.JSON)) {
|
187
|
-
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
188
|
-
}
|
189
|
-
}
|
190
|
-
}
|
191
|
-
}
|
192
|
-
|
193
|
-
private boolean isOkPartsOfSpeech(Token token) {
|
194
|
-
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
195
|
-
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
196
|
-
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
197
|
-
return true;
|
198
|
-
}
|
199
|
-
}
|
200
|
-
return false;
|
201
|
-
}
|
202
|
-
};
|
107
|
+
return builder.build();
|
203
108
|
}
|
204
109
|
}
|
@@ -0,0 +1,151 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
import java.io.FileNotFoundException;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.util.List;
|
6
|
+
import java.util.Map;
|
7
|
+
|
8
|
+
import org.embulk.config.TaskSource;
|
9
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
10
|
+
import org.embulk.spi.Column;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.embulk.spi.Page;
|
13
|
+
import org.embulk.spi.PageBuilder;
|
14
|
+
import org.embulk.spi.PageOutput;
|
15
|
+
import org.embulk.spi.PageReader;
|
16
|
+
import org.embulk.spi.Schema;
|
17
|
+
import org.embulk.spi.type.Types;
|
18
|
+
import org.msgpack.value.Value;
|
19
|
+
import org.msgpack.value.ValueFactory;
|
20
|
+
import org.slf4j.Logger;
|
21
|
+
|
22
|
+
import com.atilika.kuromoji.ipadic.Token;
|
23
|
+
import com.atilika.kuromoji.ipadic.Tokenizer;
|
24
|
+
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
25
|
+
import com.google.common.base.Joiner;
|
26
|
+
import com.google.common.base.MoreObjects;
|
27
|
+
import com.google.common.collect.Lists;
|
28
|
+
|
29
|
+
public class KuromojiPageOutput implements PageOutput
|
30
|
+
{
|
31
|
+
private final PluginTask task;
|
32
|
+
private final Tokenizer tokenizer;
|
33
|
+
private final List<Column> keyNameColumns;
|
34
|
+
private final PageReader reader;
|
35
|
+
private final PageBuilder builder;
|
36
|
+
private final Schema inputSchema;
|
37
|
+
private final Schema outputSchema;
|
38
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
39
|
+
|
40
|
+
public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
41
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
42
|
+
this.inputSchema = inputSchema;
|
43
|
+
this.outputSchema = outputSchema;
|
44
|
+
|
45
|
+
Builder builder = new Tokenizer.Builder();
|
46
|
+
if (task.getDictionaryPath().isPresent()) {
|
47
|
+
try {
|
48
|
+
builder.userDictionary(task.getDictionaryPath().get());
|
49
|
+
} catch (FileNotFoundException e) {
|
50
|
+
e.printStackTrace();
|
51
|
+
} catch (IOException e) {
|
52
|
+
e.printStackTrace();
|
53
|
+
}
|
54
|
+
}
|
55
|
+
this.tokenizer = builder.build();
|
56
|
+
this.keyNameColumns = Lists.newArrayList();
|
57
|
+
|
58
|
+
for (String keyName : task.getKeyNames()) {
|
59
|
+
this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
60
|
+
}
|
61
|
+
this.reader = new PageReader(inputSchema);
|
62
|
+
this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
63
|
+
}
|
64
|
+
|
65
|
+
@Override
|
66
|
+
public void finish() {
|
67
|
+
builder.finish();
|
68
|
+
}
|
69
|
+
|
70
|
+
@Override
|
71
|
+
public void close() {
|
72
|
+
builder.close();
|
73
|
+
}
|
74
|
+
|
75
|
+
@Override
|
76
|
+
public void add(Page page) {
|
77
|
+
reader.setPage(page);
|
78
|
+
while (reader.nextRecord()) {
|
79
|
+
setValue(builder);
|
80
|
+
builder.addRecord();
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
/**
|
85
|
+
* @param builder
|
86
|
+
*/
|
87
|
+
private void setValue(PageBuilder builder) {
|
88
|
+
if (task.getKeepInput()) {
|
89
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
90
|
+
if (reader.isNull(inputColumn)) {
|
91
|
+
builder.setNull(inputColumn);
|
92
|
+
continue;
|
93
|
+
}
|
94
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
95
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
96
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
97
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
98
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
99
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
100
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
101
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
102
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
103
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
104
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
105
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
for (Column column : keyNameColumns) {
|
111
|
+
final String source = reader.getString(column);
|
112
|
+
List<Token> tokens = tokenizer.tokenize(source);
|
113
|
+
logger.debug("{} => {}", source, tokens);
|
114
|
+
for (Map<String, String> setting : task.getSettings()) {
|
115
|
+
String suffix = setting.get("suffix");
|
116
|
+
String method = setting.get("method");
|
117
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
118
|
+
List<Value> outputs = Lists.newArrayList();
|
119
|
+
for (Token token : tokens) {
|
120
|
+
logger.debug("token => {}, {}", token, token.getAllFeatures());
|
121
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
122
|
+
String word = null;
|
123
|
+
if ("base_form".equals(method)) {
|
124
|
+
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
125
|
+
} else if ("reading".equals(method)) {
|
126
|
+
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
127
|
+
} else if ("surface_form".equals(method)) {
|
128
|
+
word = token.getSurface();
|
129
|
+
}
|
130
|
+
outputs.add(ValueFactory.newString(word));
|
131
|
+
}
|
132
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
133
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
134
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
135
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
136
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
137
|
+
}
|
138
|
+
}
|
139
|
+
}
|
140
|
+
}
|
141
|
+
|
142
|
+
private boolean isOkPartsOfSpeech(Token token) {
|
143
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
144
|
+
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
145
|
+
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
146
|
+
return true;
|
147
|
+
}
|
148
|
+
}
|
149
|
+
return false;
|
150
|
+
}
|
151
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,8 +57,9 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/filter/kuromoji.rb
|
59
59
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
60
|
+
- src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
|
60
61
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
61
|
-
- classpath/embulk-filter-kuromoji-0.3.1.jar
|
62
|
+
- classpath/embulk-filter-kuromoji-0.3.2.jar
|
62
63
|
- classpath/kuromoji-core-0.9.0.jar
|
63
64
|
- classpath/kuromoji-ipadic-0.9.0.jar
|
64
65
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|