embulk-filter-kuromoji 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +35 -2
- data/build.gradle +7 -3
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java +17 -0
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java +14 -1
- data/src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java +205 -0
- data/src/main/java/org/embulk/filter/kuromoji/Token.java +41 -0
- metadata +10 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
|
4
|
+
data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
|
7
|
+
data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
|
data/README.md
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
# Kuromoji filter plugin for Embulk
|
2
2
|
|
3
3
|
Kuromoji filter plugin for Embulk.
|
4
|
+
Neologd support.
|
4
5
|
|
5
|
-
|
6
|
+
## Reference
|
7
|
+
|
8
|
+
* [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/kuromoji.html)
|
9
|
+
* [Home · neologd/mecab-ipadic-neologd Wiki](https://github.com/neologd/mecab-ipadic-neologd/wiki)
|
6
10
|
|
7
11
|
## Overview
|
8
12
|
|
@@ -10,6 +14,9 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
|
|
10
14
|
|
11
15
|
## Configuration
|
12
16
|
|
17
|
+
- **tokenizer**: tokenizer to use, either `kuromoji` or `neologd`. (string, default: kuromoji)
|
18
|
+
- **mode**: tokenization mode, one of `normal`, `search` or `extended`. (string, default: normal)
|
19
|
+
- **use_stop_tag**: apply the analyzer's default stop words and stop tags; neologd only. (bool, default: false)
|
13
20
|
- **key_names**: names of the columns to tokenize. (list, required)
|
14
21
|
- **keep_input**: keep input columns. (bool, default: `true`)
|
15
22
|
- **ok_parts_of_speech**: ok parts of speech. (list, default: null)
|
@@ -20,12 +27,30 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
|
|
20
27
|
- **delimiter**: delimiter (string, default: ",")
|
21
28
|
- **type**: extract data type, array or string. array is json type. (string, default: "string")
|
22
29
|
|
23
|
-
## Example
|
30
|
+
## Neologd Example
|
31
|
+
|
32
|
+
```yaml
|
33
|
+
filters:
|
34
|
+
- type: kuromoji
|
35
|
+
tokenizer: neologd
|
36
|
+
use_stop_tag: true
|
37
|
+
key_names:
|
38
|
+
- catchcopy
|
39
|
+
settings:
|
40
|
+
- { method: 'reading', delimiter: '' }
|
41
|
+
- { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
|
42
|
+
- { suffix: _base_form, method: 'base_form', delimiter: '###' }
|
43
|
+
- { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
|
44
|
+
- { suffix: _array, method: 'surface_form', type: 'array' }
|
45
|
+
```
|
46
|
+
|
47
|
+
## Pure kuromoji Example
|
24
48
|
|
25
49
|
```yaml
|
26
50
|
filters:
|
27
51
|
- type: kuromoji
|
28
52
|
keep_input: false
|
53
|
+
mode: search
|
29
54
|
ok_parts_of_speech:
|
30
55
|
- 名詞
|
31
56
|
key_names:
|
@@ -75,6 +100,14 @@ As below
|
|
75
100
|
- { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
|
76
101
|
```
|
77
102
|
|
103
|
+
## User dictionary example
|
104
|
+
|
105
|
+
```
|
106
|
+
西国分寺,西国分寺,ニシコクブンジ,駅名
|
107
|
+
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
|
108
|
+
```
|
109
|
+
|
110
|
+
|
78
111
|
## Build
|
79
112
|
|
80
113
|
```
|
data/build.gradle
CHANGED
@@ -9,12 +9,15 @@ import com.github.jrubygradle.JRubyExec
|
|
9
9
|
repositories {
|
10
10
|
mavenCentral()
|
11
11
|
jcenter()
|
12
|
+
maven {
|
13
|
+
url "http://maven.codelibs.org"
|
14
|
+
}
|
12
15
|
}
|
13
16
|
configurations {
|
14
17
|
provided
|
15
18
|
}
|
16
19
|
|
17
|
-
version = "0.3.3"
|
20
|
+
version = "0.4.0"
|
18
21
|
|
19
22
|
sourceCompatibility = 1.7
|
20
23
|
targetCompatibility = 1.7
|
@@ -22,6 +25,7 @@ targetCompatibility = 1.7
|
|
22
25
|
dependencies {
|
23
26
|
compile "org.embulk:embulk-core:0.8.9"
|
24
27
|
compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
|
28
|
+
compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
|
25
29
|
provided "org.embulk:embulk-core:0.8.9"
|
26
30
|
testCompile "junit:junit:4.+"
|
27
31
|
testCompile "org.embulk:embulk-core:0.8.9"
|
@@ -76,8 +80,8 @@ Gem::Specification.new do |spec|
|
|
76
80
|
spec.name = "${project.name}"
|
77
81
|
spec.version = "${project.version}"
|
78
82
|
spec.authors = ["toyama0919"]
|
79
|
-
spec.summary = %[Kuromoji filter plugin for Embulk]
|
80
|
-
spec.description = %[Kuromoji filter plugin for Embulk]
|
83
|
+
spec.summary = %[Kuromoji filter plugin for Embulk. Neologd support.]
|
84
|
+
spec.description = %[Kuromoji filter plugin for Embulk. Neologd support.]
|
81
85
|
spec.email = ["toyama0919@gmail.com"]
|
82
86
|
spec.licenses = ["MIT"]
|
83
87
|
spec.homepage = "https://github.com/toyama0919/embulk-filter-kuromoji"
|
@@ -31,6 +31,18 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
31
31
|
@Config("key_names")
|
32
32
|
public List<String> getKeyNames();
|
33
33
|
|
34
|
+
@Config("tokenizer")
|
35
|
+
@ConfigDefault("\"kuromoji\"")
|
36
|
+
public String getTokenizer();
|
37
|
+
|
38
|
+
@Config("mode")
|
39
|
+
@ConfigDefault("\"normal\"")
|
40
|
+
public String getMode();
|
41
|
+
|
42
|
+
@Config("use_stop_tag")
|
43
|
+
@ConfigDefault("false")
|
44
|
+
public boolean getUseStopTag();
|
45
|
+
|
34
46
|
@Config("dictionary_path")
|
35
47
|
@ConfigDefault("null")
|
36
48
|
public Optional<String> getDictionaryPath();
|
@@ -61,6 +73,11 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
61
73
|
@Override
|
62
74
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
63
75
|
{
|
76
|
+
final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
|
77
|
+
logger.info("Tokenizer => {}", tokenizer);
|
78
|
+
if (tokenizer.equals("neologd")){
|
79
|
+
return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
|
80
|
+
}
|
64
81
|
return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
|
65
82
|
}
|
66
83
|
|
@@ -6,6 +6,7 @@ import java.util.List;
|
|
6
6
|
import java.util.Map;
|
7
7
|
|
8
8
|
import org.embulk.config.TaskSource;
|
9
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin;
|
9
10
|
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
10
11
|
import org.embulk.spi.Column;
|
11
12
|
import org.embulk.spi.Exec;
|
@@ -19,6 +20,7 @@ import org.msgpack.value.Value;
|
|
19
20
|
import org.msgpack.value.ValueFactory;
|
20
21
|
import org.slf4j.Logger;
|
21
22
|
|
23
|
+
import com.atilika.kuromoji.TokenizerBase.Mode;
|
22
24
|
import com.atilika.kuromoji.ipadic.Token;
|
23
25
|
import com.atilika.kuromoji.ipadic.Tokenizer;
|
24
26
|
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
@@ -28,7 +30,7 @@ import com.google.common.collect.Lists;
|
|
28
30
|
|
29
31
|
public class KuromojiPageOutput implements PageOutput
|
30
32
|
{
|
31
|
-
private final PluginTask task;
|
33
|
+
private final KuromojiFilterPlugin.PluginTask task;
|
32
34
|
private final Tokenizer tokenizer;
|
33
35
|
private final List<Column> keyNameColumns;
|
34
36
|
private final PageReader reader;
|
@@ -52,6 +54,17 @@ public class KuromojiPageOutput implements PageOutput
|
|
52
54
|
e.printStackTrace();
|
53
55
|
}
|
54
56
|
}
|
57
|
+
|
58
|
+
Mode mode = null;
|
59
|
+
if (task.getMode().equals("normal")) {
|
60
|
+
mode = Mode.NORMAL;
|
61
|
+
} else if (task.getMode().equals("search")) {
|
62
|
+
mode = Mode.SEARCH;
|
63
|
+
} else if (task.getMode().equals("extended")) {
|
64
|
+
mode = Mode.EXTENDED;
|
65
|
+
}
|
66
|
+
|
67
|
+
builder.mode(mode);
|
55
68
|
this.tokenizer = builder.build();
|
56
69
|
this.keyNameColumns = Lists.newArrayList();
|
57
70
|
|
@@ -0,0 +1,205 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
import java.io.File;
|
4
|
+
import java.io.FileInputStream;
|
5
|
+
import java.io.InputStreamReader;
|
6
|
+
import java.io.Reader;
|
7
|
+
import java.io.StringReader;
|
8
|
+
import java.util.List;
|
9
|
+
import java.util.Map;
|
10
|
+
import java.util.Set;
|
11
|
+
|
12
|
+
import org.apache.lucene.analysis.TokenStream;
|
13
|
+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
14
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
15
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
|
16
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
|
17
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
18
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.dict.UserDictionary;
|
19
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
|
20
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
|
21
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;
|
22
|
+
import org.embulk.config.TaskSource;
|
23
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
24
|
+
import org.embulk.spi.Column;
|
25
|
+
import org.embulk.spi.Exec;
|
26
|
+
import org.embulk.spi.Page;
|
27
|
+
import org.embulk.spi.PageBuilder;
|
28
|
+
import org.embulk.spi.PageOutput;
|
29
|
+
import org.embulk.spi.PageReader;
|
30
|
+
import org.embulk.spi.Schema;
|
31
|
+
import org.embulk.spi.type.Types;
|
32
|
+
import org.msgpack.value.Value;
|
33
|
+
import org.msgpack.value.ValueFactory;
|
34
|
+
import org.slf4j.Logger;
|
35
|
+
|
36
|
+
import com.google.common.base.Charsets;
|
37
|
+
import com.google.common.base.Joiner;
|
38
|
+
import com.google.common.base.MoreObjects;
|
39
|
+
import com.google.common.collect.Lists;
|
40
|
+
import com.google.common.collect.Sets;
|
41
|
+
|
42
|
+
|
43
|
+
public class NeologdPageOutput implements PageOutput
|
44
|
+
{
|
45
|
+
private final KuromojiFilterPlugin.PluginTask task;
|
46
|
+
private final List<Column> keyNameColumns;
|
47
|
+
private final PageReader reader;
|
48
|
+
private final PageBuilder builder;
|
49
|
+
private final Schema inputSchema;
|
50
|
+
private final Schema outputSchema;
|
51
|
+
private final JapaneseAnalyzer japaneseAnalyzer;
|
52
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
53
|
+
|
54
|
+
public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
55
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
56
|
+
this.inputSchema = inputSchema;
|
57
|
+
this.outputSchema = outputSchema;
|
58
|
+
this.keyNameColumns = Lists.newArrayList();
|
59
|
+
|
60
|
+
for (String keyName : task.getKeyNames()) {
|
61
|
+
this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
62
|
+
}
|
63
|
+
this.reader = new PageReader(inputSchema);
|
64
|
+
this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
65
|
+
|
66
|
+
UserDictionary userDict = null;
|
67
|
+
if (task.getDictionaryPath().isPresent()) {
|
68
|
+
try {
|
69
|
+
File file = new File(task.getDictionaryPath().get());
|
70
|
+
Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
|
71
|
+
userDict = UserDictionary.open(reader);
|
72
|
+
} catch (Exception e) {
|
73
|
+
logger.error("neologd error", e);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
Mode mode = null;
|
78
|
+
if (task.getMode().equals("normal")) {
|
79
|
+
mode = JapaneseTokenizer.Mode.NORMAL;
|
80
|
+
} else if (task.getMode().equals("search")) {
|
81
|
+
mode = JapaneseTokenizer.Mode.SEARCH;
|
82
|
+
} else if (task.getMode().equals("extended")) {
|
83
|
+
mode = JapaneseTokenizer.Mode.EXTENDED;
|
84
|
+
}
|
85
|
+
|
86
|
+
CharArraySet stopSet = null;
|
87
|
+
Set<String> stopTags = Sets.newHashSet();
|
88
|
+
if (task.getUseStopTag()) {
|
89
|
+
stopSet = JapaneseAnalyzer.getDefaultStopSet();
|
90
|
+
stopTags = JapaneseAnalyzer.getDefaultStopTags();
|
91
|
+
}
|
92
|
+
this.japaneseAnalyzer = new JapaneseAnalyzer(userDict, mode, stopSet, stopTags);
|
93
|
+
}
|
94
|
+
|
95
|
+
@Override
|
96
|
+
public void finish() {
|
97
|
+
builder.finish();
|
98
|
+
}
|
99
|
+
|
100
|
+
@Override
|
101
|
+
public void close() {
|
102
|
+
builder.close();
|
103
|
+
}
|
104
|
+
|
105
|
+
@Override
|
106
|
+
public void add(Page page) {
|
107
|
+
reader.setPage(page);
|
108
|
+
while (reader.nextRecord()) {
|
109
|
+
setValue(builder);
|
110
|
+
builder.addRecord();
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
/**
|
115
|
+
* @param builder
|
116
|
+
*/
|
117
|
+
private void setValue(PageBuilder builder) {
|
118
|
+
if (task.getKeepInput()) {
|
119
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
120
|
+
if (reader.isNull(inputColumn)) {
|
121
|
+
builder.setNull(inputColumn);
|
122
|
+
continue;
|
123
|
+
}
|
124
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
125
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
126
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
127
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
128
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
129
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
130
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
131
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
132
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
133
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
134
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
135
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
136
|
+
}
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
for (Column column : keyNameColumns) {
|
141
|
+
final String source = reader.getString(column);
|
142
|
+
List<Token> tokens = tokenize(new StringReader(source));
|
143
|
+
for (Map<String, String> setting : task.getSettings()) {
|
144
|
+
String suffix = setting.get("suffix");
|
145
|
+
String method = setting.get("method");
|
146
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
147
|
+
List<Value> outputs = Lists.newArrayList();
|
148
|
+
for (Token token : tokens) {
|
149
|
+
String word = null;
|
150
|
+
if ("base_form".equals(method)) {
|
151
|
+
word = token.getBaseForm();
|
152
|
+
} else if ("reading".equals(method)) {
|
153
|
+
word = token.getReading();
|
154
|
+
} else if ("surface_form".equals(method)) {
|
155
|
+
word = token.getCharTerm();
|
156
|
+
}
|
157
|
+
if (word != null) {
|
158
|
+
outputs.add(ValueFactory.newString(word));
|
159
|
+
}
|
160
|
+
}
|
161
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
162
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
163
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
164
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
165
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
166
|
+
}
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
private boolean isOkPartsOfSpeech(Token token) {
|
172
|
+
logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
|
173
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
174
|
+
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
175
|
+
if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
|
176
|
+
return true;
|
177
|
+
}
|
178
|
+
}
|
179
|
+
return false;
|
180
|
+
}
|
181
|
+
|
182
|
+
private List<Token> tokenize(Reader reader) {
|
183
|
+
List<Token> list = Lists.newArrayList();
|
184
|
+
try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
|
185
|
+
BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
|
186
|
+
CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
|
187
|
+
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
|
188
|
+
ReadingAttribute readAttr = tokenStream.addAttribute(ReadingAttribute.class);
|
189
|
+
|
190
|
+
tokenStream.reset();
|
191
|
+
while (tokenStream.incrementToken()) {
|
192
|
+
Token token = new Token();
|
193
|
+
token.setCharTerm(charAttr.toString());
|
194
|
+
token.setBaseForm(baseAttr.getBaseForm());
|
195
|
+
token.setReading(readAttr.getReading());
|
196
|
+
token.setPartOfSpeech(posAttr.getPartOfSpeech());
|
197
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
198
|
+
list.add(token);
|
199
|
+
}
|
200
|
+
} catch (Exception e) {
|
201
|
+
logger.error("neologd error", e);
|
202
|
+
}
|
203
|
+
return list;
|
204
|
+
}
|
205
|
+
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
public class Token
|
4
|
+
{
|
5
|
+
private String charTerm;
|
6
|
+
private String baseForm;
|
7
|
+
private String partOfSpeech;
|
8
|
+
private String reading;
|
9
|
+
private String inflection;
|
10
|
+
|
11
|
+
public String getCharTerm() {
|
12
|
+
return charTerm;
|
13
|
+
}
|
14
|
+
public String getBaseForm() {
|
15
|
+
return baseForm;
|
16
|
+
}
|
17
|
+
public String getPartOfSpeech() {
|
18
|
+
return partOfSpeech;
|
19
|
+
}
|
20
|
+
public void setCharTerm(String charTerm) {
|
21
|
+
this.charTerm = charTerm;
|
22
|
+
}
|
23
|
+
public void setBaseForm(String baseForm) {
|
24
|
+
this.baseForm = baseForm;
|
25
|
+
}
|
26
|
+
public void setPartOfSpeech(String partOfSpeech) {
|
27
|
+
this.partOfSpeech = partOfSpeech;
|
28
|
+
}
|
29
|
+
public void setReading(String reading) {
|
30
|
+
this.reading = reading;
|
31
|
+
}
|
32
|
+
public String getReading() {
|
33
|
+
return reading;
|
34
|
+
}
|
35
|
+
public String getInflection() {
|
36
|
+
return inflection;
|
37
|
+
}
|
38
|
+
public void setInflection(String inflection) {
|
39
|
+
this.inflection = inflection;
|
40
|
+
}
|
41
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description: Kuromoji filter plugin for Embulk
|
41
|
+
description: Kuromoji filter plugin for Embulk. Neologd support.
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
44
44
|
executables: []
|
@@ -58,10 +58,15 @@ files:
|
|
58
58
|
- lib/embulk/filter/kuromoji.rb
|
59
59
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
60
60
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
|
61
|
+
- src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
|
62
|
+
- src/main/java/org/embulk/filter/kuromoji/Token.java
|
61
63
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
62
|
-
- classpath/embulk-filter-kuromoji-0.3.3.jar
|
64
|
+
- classpath/embulk-filter-kuromoji-0.4.0.jar
|
63
65
|
- classpath/kuromoji-core-0.9.0.jar
|
64
66
|
- classpath/kuromoji-ipadic-0.9.0.jar
|
67
|
+
- classpath/lucene-analyzers-common-5.4.1.jar
|
68
|
+
- classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
|
69
|
+
- classpath/lucene-core-5.4.1.jar
|
65
70
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|
66
71
|
licenses:
|
67
72
|
- MIT
|
@@ -85,6 +90,6 @@ rubyforge_project:
|
|
85
90
|
rubygems_version: 2.1.9
|
86
91
|
signing_key:
|
87
92
|
specification_version: 4
|
88
|
-
summary: Kuromoji filter plugin for Embulk
|
93
|
+
summary: Kuromoji filter plugin for Embulk. Neologd support.
|
89
94
|
test_files: []
|
90
95
|
has_rdoc:
|