embulk-filter-kuromoji 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +35 -2
- data/build.gradle +7 -3
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java +17 -0
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java +14 -1
- data/src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java +205 -0
- data/src/main/java/org/embulk/filter/kuromoji/Token.java +41 -0
- metadata +10 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
|
4
|
+
data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
|
7
|
+
data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
|
data/README.md
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
# Kuromoji filter plugin for Embulk
|
2
2
|
|
3
3
|
Kuromoji filter plugin for Embulk.
|
4
|
+
It also supports the mecab-ipadic-NEologd dictionary as an alternative tokenizer.
|
4
5
|
|
5
|
-
|
6
|
+
## Reference
|
7
|
+
|
8
|
+
* [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/kuromoji.html)
|
9
|
+
* [Home · neologd/mecab-ipadic-neologd Wiki](https://github.com/neologd/mecab-ipadic-neologd/wiki)
|
6
10
|
|
7
11
|
## Overview
|
8
12
|
|
@@ -10,6 +14,9 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
|
|
10
14
|
|
11
15
|
## Configuration
|
12
16
|
|
17
|
+
- **tokenizer**: tokenizer to use, either `kuromoji` or `neologd`. (string, default: kuromoji)
|
18
|
+
- **mode**: tokenization mode, one of `normal`, `search` or `extended`. (string, default: normal)
|
19
|
+
- **use_stop_tag**: apply the analyzer's default stop words and stop tags; neologd only. (bool, default: false)
|
13
20
|
- **key_names**: names of the string columns to tokenize. (list, required)
|
14
21
|
- **keep_input**: keep input columns. (bool, default: `true`)
|
15
22
|
- **ok_parts_of_speech**: ok parts of speech. (list, default: null)
|
@@ -20,12 +27,30 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
|
|
20
27
|
- **delimiter**: delimiter (string, default: ",")
|
21
28
|
- **type**: extract data type, array or string. array is json type. (string, default: "string")
|
22
29
|
|
23
|
-
## Example
|
30
|
+
## Neologd Example
|
31
|
+
|
32
|
+
```yaml
|
33
|
+
filters:
|
34
|
+
- type: kuromoji
|
35
|
+
tokenizer: neologd
|
36
|
+
use_stop_tag: true
|
37
|
+
key_names:
|
38
|
+
- catchcopy
|
39
|
+
settings:
|
40
|
+
- { method: 'reading', delimiter: '' }
|
41
|
+
- { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
|
42
|
+
- { suffix: _base_form, method: 'base_form', delimiter: '###' }
|
43
|
+
- { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
|
44
|
+
- { suffix: _array, method: 'surface_form', type: 'array' }
|
45
|
+
```
|
46
|
+
|
47
|
+
## Pure kuromoji Example
|
24
48
|
|
25
49
|
```yaml
|
26
50
|
filters:
|
27
51
|
- type: kuromoji
|
28
52
|
keep_input: false
|
53
|
+
mode: search
|
29
54
|
ok_parts_of_speech:
|
30
55
|
- 名詞
|
31
56
|
key_names:
|
@@ -75,6 +100,14 @@ As below
|
|
75
100
|
- { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
|
76
101
|
```
|
77
102
|
|
103
|
+
## User dictionary example
|
104
|
+
|
105
|
+
```
|
106
|
+
西国分寺,西国分寺,ニシコクブンジ,駅名
|
107
|
+
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
|
108
|
+
```
|
109
|
+
|
110
|
+
|
78
111
|
## Build
|
79
112
|
|
80
113
|
```
|
data/build.gradle
CHANGED
@@ -9,12 +9,15 @@ import com.github.jrubygradle.JRubyExec
|
|
9
9
|
repositories {
|
10
10
|
mavenCentral()
|
11
11
|
jcenter()
|
12
|
+
maven {
|
13
|
+
url "http://maven.codelibs.org"
|
14
|
+
}
|
12
15
|
}
|
13
16
|
configurations {
|
14
17
|
provided
|
15
18
|
}
|
16
19
|
|
17
|
-
version = "0.3.3"
|
20
|
+
version = "0.4.0"
|
18
21
|
|
19
22
|
sourceCompatibility = 1.7
|
20
23
|
targetCompatibility = 1.7
|
@@ -22,6 +25,7 @@ targetCompatibility = 1.7
|
|
22
25
|
dependencies {
|
23
26
|
compile "org.embulk:embulk-core:0.8.9"
|
24
27
|
compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
|
28
|
+
compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
|
25
29
|
provided "org.embulk:embulk-core:0.8.9"
|
26
30
|
testCompile "junit:junit:4.+"
|
27
31
|
testCompile "org.embulk:embulk-core:0.8.9"
|
@@ -76,8 +80,8 @@ Gem::Specification.new do |spec|
|
|
76
80
|
spec.name = "${project.name}"
|
77
81
|
spec.version = "${project.version}"
|
78
82
|
spec.authors = ["toyama0919"]
|
79
|
-
spec.summary = %[Kuromoji filter plugin for Embulk]
|
80
|
-
spec.description = %[Kuromoji filter plugin for Embulk]
|
83
|
+
spec.summary = %[Kuromoji filter plugin for Embulk. Neologd support.]
|
84
|
+
spec.description = %[Kuromoji filter plugin for Embulk. Neologd support.]
|
81
85
|
spec.email = ["toyama0919@gmail.com"]
|
82
86
|
spec.licenses = ["MIT"]
|
83
87
|
spec.homepage = "https://github.com/toyama0919/embulk-filter-kuromoji"
|
@@ -31,6 +31,18 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
31
31
|
@Config("key_names")
|
32
32
|
public List<String> getKeyNames();
|
33
33
|
|
34
|
+
@Config("tokenizer")
|
35
|
+
@ConfigDefault("\"kuromoji\"")
|
36
|
+
public String getTokenizer();
|
37
|
+
|
38
|
+
@Config("mode")
|
39
|
+
@ConfigDefault("\"normal\"")
|
40
|
+
public String getMode();
|
41
|
+
|
42
|
+
@Config("use_stop_tag")
|
43
|
+
@ConfigDefault("false")
|
44
|
+
public boolean getUseStopTag();
|
45
|
+
|
34
46
|
@Config("dictionary_path")
|
35
47
|
@ConfigDefault("null")
|
36
48
|
public Optional<String> getDictionaryPath();
|
@@ -61,6 +73,11 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
61
73
|
@Override
|
62
74
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
63
75
|
{
|
76
|
+
final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
|
77
|
+
logger.info("Tokenizer => {}", tokenizer);
|
78
|
+
if (tokenizer.equals("neologd")){
|
79
|
+
return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
|
80
|
+
}
|
64
81
|
return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
|
65
82
|
}
|
66
83
|
|
@@ -6,6 +6,7 @@ import java.util.List;
|
|
6
6
|
import java.util.Map;
|
7
7
|
|
8
8
|
import org.embulk.config.TaskSource;
|
9
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin;
|
9
10
|
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
10
11
|
import org.embulk.spi.Column;
|
11
12
|
import org.embulk.spi.Exec;
|
@@ -19,6 +20,7 @@ import org.msgpack.value.Value;
|
|
19
20
|
import org.msgpack.value.ValueFactory;
|
20
21
|
import org.slf4j.Logger;
|
21
22
|
|
23
|
+
import com.atilika.kuromoji.TokenizerBase.Mode;
|
22
24
|
import com.atilika.kuromoji.ipadic.Token;
|
23
25
|
import com.atilika.kuromoji.ipadic.Tokenizer;
|
24
26
|
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
@@ -28,7 +30,7 @@ import com.google.common.collect.Lists;
|
|
28
30
|
|
29
31
|
public class KuromojiPageOutput implements PageOutput
|
30
32
|
{
|
31
|
-
private final PluginTask task;
|
33
|
+
private final KuromojiFilterPlugin.PluginTask task;
|
32
34
|
private final Tokenizer tokenizer;
|
33
35
|
private final List<Column> keyNameColumns;
|
34
36
|
private final PageReader reader;
|
@@ -52,6 +54,17 @@ public class KuromojiPageOutput implements PageOutput
|
|
52
54
|
e.printStackTrace();
|
53
55
|
}
|
54
56
|
}
|
57
|
+
|
58
|
+
Mode mode = null;
|
59
|
+
if (task.getMode().equals("normal")) {
|
60
|
+
mode = Mode.NORMAL;
|
61
|
+
} else if (task.getMode().equals("search")) {
|
62
|
+
mode = Mode.SEARCH;
|
63
|
+
} else if (task.getMode().equals("extended")) {
|
64
|
+
mode = Mode.EXTENDED;
|
65
|
+
}
|
66
|
+
|
67
|
+
builder.mode(mode);
|
55
68
|
this.tokenizer = builder.build();
|
56
69
|
this.keyNameColumns = Lists.newArrayList();
|
57
70
|
|
@@ -0,0 +1,205 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
import java.io.File;
|
4
|
+
import java.io.FileInputStream;
|
5
|
+
import java.io.InputStreamReader;
|
6
|
+
import java.io.Reader;
|
7
|
+
import java.io.StringReader;
|
8
|
+
import java.util.List;
|
9
|
+
import java.util.Map;
|
10
|
+
import java.util.Set;
|
11
|
+
|
12
|
+
import org.apache.lucene.analysis.TokenStream;
|
13
|
+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
14
|
+
import org.apache.lucene.analysis.util.CharArraySet;
|
15
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
|
16
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
|
17
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
18
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.dict.UserDictionary;
|
19
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
|
20
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
|
21
|
+
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;
|
22
|
+
import org.embulk.config.TaskSource;
|
23
|
+
import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
|
24
|
+
import org.embulk.spi.Column;
|
25
|
+
import org.embulk.spi.Exec;
|
26
|
+
import org.embulk.spi.Page;
|
27
|
+
import org.embulk.spi.PageBuilder;
|
28
|
+
import org.embulk.spi.PageOutput;
|
29
|
+
import org.embulk.spi.PageReader;
|
30
|
+
import org.embulk.spi.Schema;
|
31
|
+
import org.embulk.spi.type.Types;
|
32
|
+
import org.msgpack.value.Value;
|
33
|
+
import org.msgpack.value.ValueFactory;
|
34
|
+
import org.slf4j.Logger;
|
35
|
+
|
36
|
+
import com.google.common.base.Charsets;
|
37
|
+
import com.google.common.base.Joiner;
|
38
|
+
import com.google.common.base.MoreObjects;
|
39
|
+
import com.google.common.collect.Lists;
|
40
|
+
import com.google.common.collect.Sets;
|
41
|
+
|
42
|
+
|
43
|
+
public class NeologdPageOutput implements PageOutput
|
44
|
+
{
|
45
|
+
private final KuromojiFilterPlugin.PluginTask task;
|
46
|
+
private final List<Column> keyNameColumns;
|
47
|
+
private final PageReader reader;
|
48
|
+
private final PageBuilder builder;
|
49
|
+
private final Schema inputSchema;
|
50
|
+
private final Schema outputSchema;
|
51
|
+
private final JapaneseAnalyzer japaneseAnalyzer;
|
52
|
+
private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
|
53
|
+
|
54
|
+
public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
|
55
|
+
this.task = taskSource.loadTask(PluginTask.class);
|
56
|
+
this.inputSchema = inputSchema;
|
57
|
+
this.outputSchema = outputSchema;
|
58
|
+
this.keyNameColumns = Lists.newArrayList();
|
59
|
+
|
60
|
+
for (String keyName : task.getKeyNames()) {
|
61
|
+
this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
62
|
+
}
|
63
|
+
this.reader = new PageReader(inputSchema);
|
64
|
+
this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
65
|
+
|
66
|
+
UserDictionary userDict = null;
|
67
|
+
if (task.getDictionaryPath().isPresent()) {
|
68
|
+
try {
|
69
|
+
File file = new File(task.getDictionaryPath().get());
|
70
|
+
Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
|
71
|
+
userDict = UserDictionary.open(reader);
|
72
|
+
} catch (Exception e) {
|
73
|
+
logger.error("neologd error", e);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
Mode mode = null;
|
78
|
+
if (task.getMode().equals("normal")) {
|
79
|
+
mode = JapaneseTokenizer.Mode.NORMAL;
|
80
|
+
} else if (task.getMode().equals("search")) {
|
81
|
+
mode = JapaneseTokenizer.Mode.SEARCH;
|
82
|
+
} else if (task.getMode().equals("extended")) {
|
83
|
+
mode = JapaneseTokenizer.Mode.EXTENDED;
|
84
|
+
}
|
85
|
+
|
86
|
+
CharArraySet stopSet = null;
|
87
|
+
Set<String> stopTags = Sets.newHashSet();
|
88
|
+
if (task.getUseStopTag()) {
|
89
|
+
stopSet = JapaneseAnalyzer.getDefaultStopSet();
|
90
|
+
stopTags = JapaneseAnalyzer.getDefaultStopTags();
|
91
|
+
}
|
92
|
+
this.japaneseAnalyzer = new JapaneseAnalyzer(userDict, mode, stopSet, stopTags);
|
93
|
+
}
|
94
|
+
|
95
|
+
@Override
|
96
|
+
public void finish() {
|
97
|
+
builder.finish();
|
98
|
+
}
|
99
|
+
|
100
|
+
@Override
|
101
|
+
public void close() {
|
102
|
+
builder.close();
|
103
|
+
}
|
104
|
+
|
105
|
+
@Override
|
106
|
+
public void add(Page page) {
|
107
|
+
reader.setPage(page);
|
108
|
+
while (reader.nextRecord()) {
|
109
|
+
setValue(builder);
|
110
|
+
builder.addRecord();
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
/**
|
115
|
+
* @param builder
|
116
|
+
*/
|
117
|
+
private void setValue(PageBuilder builder) {
|
118
|
+
if (task.getKeepInput()) {
|
119
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
120
|
+
if (reader.isNull(inputColumn)) {
|
121
|
+
builder.setNull(inputColumn);
|
122
|
+
continue;
|
123
|
+
}
|
124
|
+
if (Types.STRING.equals(inputColumn.getType())) {
|
125
|
+
builder.setString(inputColumn, reader.getString(inputColumn));
|
126
|
+
} else if (Types.BOOLEAN.equals(inputColumn.getType())) {
|
127
|
+
builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
|
128
|
+
} else if (Types.DOUBLE.equals(inputColumn.getType())) {
|
129
|
+
builder.setDouble(inputColumn, reader.getDouble(inputColumn));
|
130
|
+
} else if (Types.LONG.equals(inputColumn.getType())) {
|
131
|
+
builder.setLong(inputColumn, reader.getLong(inputColumn));
|
132
|
+
} else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
|
133
|
+
builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
|
134
|
+
} else if (Types.JSON.equals(inputColumn.getType())) {
|
135
|
+
builder.setJson(inputColumn, reader.getJson(inputColumn));
|
136
|
+
}
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
for (Column column : keyNameColumns) {
|
141
|
+
final String source = reader.getString(column);
|
142
|
+
List<Token> tokens = tokenize(new StringReader(source));
|
143
|
+
for (Map<String, String> setting : task.getSettings()) {
|
144
|
+
String suffix = setting.get("suffix");
|
145
|
+
String method = setting.get("method");
|
146
|
+
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
147
|
+
List<Value> outputs = Lists.newArrayList();
|
148
|
+
for (Token token : tokens) {
|
149
|
+
String word = null;
|
150
|
+
if ("base_form".equals(method)) {
|
151
|
+
word = token.getBaseForm();
|
152
|
+
} else if ("reading".equals(method)) {
|
153
|
+
word = token.getReading();
|
154
|
+
} else if ("surface_form".equals(method)) {
|
155
|
+
word = token.getCharTerm();
|
156
|
+
}
|
157
|
+
if (word != null) {
|
158
|
+
outputs.add(ValueFactory.newString(word));
|
159
|
+
}
|
160
|
+
}
|
161
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
162
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
163
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
164
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
165
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
166
|
+
}
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
private boolean isOkPartsOfSpeech(Token token) {
|
172
|
+
logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
|
173
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
174
|
+
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
175
|
+
if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
|
176
|
+
return true;
|
177
|
+
}
|
178
|
+
}
|
179
|
+
return false;
|
180
|
+
}
|
181
|
+
|
182
|
+
private List<Token> tokenize(Reader reader) {
|
183
|
+
List<Token> list = Lists.newArrayList();
|
184
|
+
try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
|
185
|
+
BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
|
186
|
+
CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
|
187
|
+
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
|
188
|
+
ReadingAttribute readAttr = tokenStream.addAttribute(ReadingAttribute.class);
|
189
|
+
|
190
|
+
tokenStream.reset();
|
191
|
+
while (tokenStream.incrementToken()) {
|
192
|
+
Token token = new Token();
|
193
|
+
token.setCharTerm(charAttr.toString());
|
194
|
+
token.setBaseForm(baseAttr.getBaseForm());
|
195
|
+
token.setReading(readAttr.getReading());
|
196
|
+
token.setPartOfSpeech(posAttr.getPartOfSpeech());
|
197
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
198
|
+
list.add(token);
|
199
|
+
}
|
200
|
+
} catch (Exception e) {
|
201
|
+
logger.error("neologd error", e);
|
202
|
+
}
|
203
|
+
return list;
|
204
|
+
}
|
205
|
+
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
package org.embulk.filter.kuromoji;
|
2
|
+
|
3
|
+
public class Token
|
4
|
+
{
|
5
|
+
private String charTerm;
|
6
|
+
private String baseForm;
|
7
|
+
private String partOfSpeech;
|
8
|
+
private String reading;
|
9
|
+
private String inflection;
|
10
|
+
|
11
|
+
public String getCharTerm() {
|
12
|
+
return charTerm;
|
13
|
+
}
|
14
|
+
public String getBaseForm() {
|
15
|
+
return baseForm;
|
16
|
+
}
|
17
|
+
public String getPartOfSpeech() {
|
18
|
+
return partOfSpeech;
|
19
|
+
}
|
20
|
+
public void setCharTerm(String charTerm) {
|
21
|
+
this.charTerm = charTerm;
|
22
|
+
}
|
23
|
+
public void setBaseForm(String baseForm) {
|
24
|
+
this.baseForm = baseForm;
|
25
|
+
}
|
26
|
+
public void setPartOfSpeech(String partOfSpeech) {
|
27
|
+
this.partOfSpeech = partOfSpeech;
|
28
|
+
}
|
29
|
+
public void setReading(String reading) {
|
30
|
+
this.reading = reading;
|
31
|
+
}
|
32
|
+
public String getReading() {
|
33
|
+
return reading;
|
34
|
+
}
|
35
|
+
public String getInflection() {
|
36
|
+
return inflection;
|
37
|
+
}
|
38
|
+
public void setInflection(String inflection) {
|
39
|
+
this.inflection = inflection;
|
40
|
+
}
|
41
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description: Kuromoji filter plugin for Embulk
|
41
|
+
description: Kuromoji filter plugin for Embulk. Neologd support.
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
44
44
|
executables: []
|
@@ -58,10 +58,15 @@ files:
|
|
58
58
|
- lib/embulk/filter/kuromoji.rb
|
59
59
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
60
60
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
|
61
|
+
- src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
|
62
|
+
- src/main/java/org/embulk/filter/kuromoji/Token.java
|
61
63
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
62
|
-
- classpath/embulk-filter-kuromoji-0.3.3.jar
|
64
|
+
- classpath/embulk-filter-kuromoji-0.4.0.jar
|
63
65
|
- classpath/kuromoji-core-0.9.0.jar
|
64
66
|
- classpath/kuromoji-ipadic-0.9.0.jar
|
67
|
+
- classpath/lucene-analyzers-common-5.4.1.jar
|
68
|
+
- classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
|
69
|
+
- classpath/lucene-core-5.4.1.jar
|
65
70
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|
66
71
|
licenses:
|
67
72
|
- MIT
|
@@ -85,6 +90,6 @@ rubyforge_project:
|
|
85
90
|
rubygems_version: 2.1.9
|
86
91
|
signing_key:
|
87
92
|
specification_version: 4
|
88
|
-
summary: Kuromoji filter plugin for Embulk
|
93
|
+
summary: Kuromoji filter plugin for Embulk. Neologd support.
|
89
94
|
test_files: []
|
90
95
|
has_rdoc:
|