embulk-filter-kuromoji 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ae73f70a252f75cff653883c3410115220b0516a
4
- data.tar.gz: 239fdb7c9aa444898f660dd280645521c9e1368e
3
+ metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
4
+ data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
5
5
  SHA512:
6
- metadata.gz: 347e0a417a8131a2b504b53ccd4e943bd1e4afd125ee95180efde77c100005b02010a9d9dafa77cfd67b4d0b244b4691b9e64199dddcd0975b14ff42af1f731e
7
- data.tar.gz: d5fadea360f8ce2e0d25de2e0a825f60d34eae1b60b88fb50e361eb6e13e8261f7264162c91e94921a73ba18e2c727cf22426a7b1c9b0f38043760e58fc06ca5
6
+ metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
7
+ data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
data/README.md CHANGED
@@ -1,8 +1,12 @@
1
1
  # Kuromoji filter plugin for Embulk
2
2
 
3
3
  Kuromoji filter plugin for Embulk.
4
+ Neologd support.
4
5
 
5
- see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/kuromoji.html)
6
+ ## Reference
7
+
8
+ * [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/kuromoji.html)
9
+ * [Home · neologd/mecab-ipadic-neologd Wiki](https://github.com/neologd/mecab-ipadic-neologd/wiki)
6
10
 
7
11
  ## Overview
8
12
 
@@ -10,6 +14,9 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
10
14
 
11
15
  ## Configuration
12
16
 
17
+ - **tokenizer**: tokenizer to use, `kuromoji` or `neologd`. (string, default: `kuromoji`)
18
+ - **mode**: tokenization mode, `normal`, `search`, or `extended`. (string, default: `normal`)
19
+ - **use_stop_tag**: apply the analyzer's default stop words and stop tags; neologd tokenizer only. (bool, default: `false`)
13
20
  - **key_names**: names of the columns to tokenize. (list, required)
14
21
  - **keep_input**: keep input columns. (bool, default: `true`)
15
22
  - **ok_parts_of_speech**: ok parts of speech. (list, default: null)
@@ -20,12 +27,30 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
20
27
  - **delimiter**: delimiter (string, default: ",")
21
28
  - **type**: extract data type, array or string. array is json type. (string, default: "string")
22
29
 
23
- ## Example
30
+ ## Neologd Example
31
+
32
+ ```yaml
33
+ filters:
34
+ - type: kuromoji
35
+ tokenizer: neologd
36
+ use_stop_tag: true
37
+ key_names:
38
+ - catchcopy
39
+ settings:
40
+ - { method: 'reading', delimiter: '' }
41
+ - { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
42
+ - { suffix: _base_form, method: 'base_form', delimiter: '###' }
43
+ - { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
44
+ - { suffix: _array, method: 'surface_form', type: 'array' }
45
+ ```
46
+
47
+ ## Pure kuromoji Example
24
48
 
25
49
  ```yaml
26
50
  filters:
27
51
  - type: kuromoji
28
52
  keep_input: false
53
+ mode: search
29
54
  ok_parts_of_speech:
30
55
  - 名詞
31
56
  key_names:
@@ -75,6 +100,14 @@ As below
75
100
  - { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
76
101
  ```
77
102
 
103
+ ## User dictionary example
104
+
105
+ ```
106
+ 西国分寺,西国分寺,ニシコクブンジ,駅名
107
+ 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
108
+ ```
109
+
110
+
78
111
  ## Build
79
112
 
80
113
  ```
data/build.gradle CHANGED
@@ -9,12 +9,15 @@ import com.github.jrubygradle.JRubyExec
9
9
  repositories {
10
10
  mavenCentral()
11
11
  jcenter()
12
+ maven {
13
+ url "http://maven.codelibs.org"
14
+ }
12
15
  }
13
16
  configurations {
14
17
  provided
15
18
  }
16
19
 
17
- version = "0.3.3"
20
+ version = "0.4.0"
18
21
 
19
22
  sourceCompatibility = 1.7
20
23
  targetCompatibility = 1.7
@@ -22,6 +25,7 @@ targetCompatibility = 1.7
22
25
  dependencies {
23
26
  compile "org.embulk:embulk-core:0.8.9"
24
27
  compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
28
+ compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
25
29
  provided "org.embulk:embulk-core:0.8.9"
26
30
  testCompile "junit:junit:4.+"
27
31
  testCompile "org.embulk:embulk-core:0.8.9"
@@ -76,8 +80,8 @@ Gem::Specification.new do |spec|
76
80
  spec.name = "${project.name}"
77
81
  spec.version = "${project.version}"
78
82
  spec.authors = ["toyama0919"]
79
- spec.summary = %[Kuromoji filter plugin for Embulk]
80
- spec.description = %[Kuromoji filter plugin for Embulk]
83
+ spec.summary = %[Kuromoji filter plugin for Embulk. Neologd support.]
84
+ spec.description = %[Kuromoji filter plugin for Embulk. Neologd support.]
81
85
  spec.email = ["toyama0919@gmail.com"]
82
86
  spec.licenses = ["MIT"]
83
87
  spec.homepage = "https://github.com/toyama0919/embulk-filter-kuromoji"
@@ -31,6 +31,18 @@ public class KuromojiFilterPlugin implements FilterPlugin
31
31
  @Config("key_names")
32
32
  public List<String> getKeyNames();
33
33
 
34
+ @Config("tokenizer")
35
+ @ConfigDefault("\"kuromoji\"")
36
+ public String getTokenizer();
37
+
38
+ @Config("mode")
39
+ @ConfigDefault("\"normal\"")
40
+ public String getMode();
41
+
42
+ @Config("use_stop_tag")
43
+ @ConfigDefault("false")
44
+ public boolean getUseStopTag();
45
+
34
46
  @Config("dictionary_path")
35
47
  @ConfigDefault("null")
36
48
  public Optional<String> getDictionaryPath();
@@ -61,6 +73,11 @@ public class KuromojiFilterPlugin implements FilterPlugin
61
73
  @Override
62
74
  public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
63
75
  {
76
+ final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
77
+ logger.info("Tokenizer => {}", tokenizer);
78
+ if (tokenizer.equals("neologd")){
79
+ return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
80
+ }
64
81
  return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
65
82
  }
66
83
 
@@ -6,6 +6,7 @@ import java.util.List;
6
6
  import java.util.Map;
7
7
 
8
8
  import org.embulk.config.TaskSource;
9
+ import org.embulk.filter.kuromoji.KuromojiFilterPlugin;
9
10
  import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
10
11
  import org.embulk.spi.Column;
11
12
  import org.embulk.spi.Exec;
@@ -19,6 +20,7 @@ import org.msgpack.value.Value;
19
20
  import org.msgpack.value.ValueFactory;
20
21
  import org.slf4j.Logger;
21
22
 
23
+ import com.atilika.kuromoji.TokenizerBase.Mode;
22
24
  import com.atilika.kuromoji.ipadic.Token;
23
25
  import com.atilika.kuromoji.ipadic.Tokenizer;
24
26
  import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
@@ -28,7 +30,7 @@ import com.google.common.collect.Lists;
28
30
 
29
31
  public class KuromojiPageOutput implements PageOutput
30
32
  {
31
- private final PluginTask task;
33
+ private final KuromojiFilterPlugin.PluginTask task;
32
34
  private final Tokenizer tokenizer;
33
35
  private final List<Column> keyNameColumns;
34
36
  private final PageReader reader;
@@ -52,6 +54,17 @@ public class KuromojiPageOutput implements PageOutput
52
54
  e.printStackTrace();
53
55
  }
54
56
  }
57
+
58
+ Mode mode = null;
59
+ if (task.getMode().equals("normal")) {
60
+ mode = Mode.NORMAL;
61
+ } else if (task.getMode().equals("search")) {
62
+ mode = Mode.SEARCH;
63
+ } else if (task.getMode().equals("extended")) {
64
+ mode = Mode.EXTENDED;
65
+ }
66
+
67
+ builder.mode(mode);
55
68
  this.tokenizer = builder.build();
56
69
  this.keyNameColumns = Lists.newArrayList();
57
70
 
@@ -0,0 +1,205 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ import java.io.File;
4
+ import java.io.FileInputStream;
5
+ import java.io.InputStreamReader;
6
+ import java.io.Reader;
7
+ import java.io.StringReader;
8
+ import java.util.List;
9
+ import java.util.Map;
10
+ import java.util.Set;
11
+
12
+ import org.apache.lucene.analysis.TokenStream;
13
+ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14
+ import org.apache.lucene.analysis.util.CharArraySet;
15
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
16
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
17
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
18
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.dict.UserDictionary;
19
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
20
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
21
+ import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;
22
+ import org.embulk.config.TaskSource;
23
+ import org.embulk.filter.kuromoji.KuromojiFilterPlugin.PluginTask;
24
+ import org.embulk.spi.Column;
25
+ import org.embulk.spi.Exec;
26
+ import org.embulk.spi.Page;
27
+ import org.embulk.spi.PageBuilder;
28
+ import org.embulk.spi.PageOutput;
29
+ import org.embulk.spi.PageReader;
30
+ import org.embulk.spi.Schema;
31
+ import org.embulk.spi.type.Types;
32
+ import org.msgpack.value.Value;
33
+ import org.msgpack.value.ValueFactory;
34
+ import org.slf4j.Logger;
35
+
36
+ import com.google.common.base.Charsets;
37
+ import com.google.common.base.Joiner;
38
+ import com.google.common.base.MoreObjects;
39
+ import com.google.common.collect.Lists;
40
+ import com.google.common.collect.Sets;
41
+
42
+
43
+ public class NeologdPageOutput implements PageOutput
44
+ {
45
+ private final KuromojiFilterPlugin.PluginTask task;
46
+ private final List<Column> keyNameColumns;
47
+ private final PageReader reader;
48
+ private final PageBuilder builder;
49
+ private final Schema inputSchema;
50
+ private final Schema outputSchema;
51
+ private final JapaneseAnalyzer japaneseAnalyzer;
52
+ private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
53
+
54
+ public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
55
+ this.task = taskSource.loadTask(PluginTask.class);
56
+ this.inputSchema = inputSchema;
57
+ this.outputSchema = outputSchema;
58
+ this.keyNameColumns = Lists.newArrayList();
59
+
60
+ for (String keyName : task.getKeyNames()) {
61
+ this.keyNameColumns.add(outputSchema.lookupColumn(keyName));
62
+ }
63
+ this.reader = new PageReader(inputSchema);
64
+ this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
65
+
66
+ UserDictionary userDict = null;
67
+ if (task.getDictionaryPath().isPresent()) {
68
+ try {
69
+ File file = new File(task.getDictionaryPath().get());
70
+ Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
71
+ userDict = UserDictionary.open(reader);
72
+ } catch (Exception e) {
73
+ logger.error("neologd error", e);
74
+ }
75
+ }
76
+
77
+ Mode mode = null;
78
+ if (task.getMode().equals("normal")) {
79
+ mode = JapaneseTokenizer.Mode.NORMAL;
80
+ } else if (task.getMode().equals("search")) {
81
+ mode = JapaneseTokenizer.Mode.SEARCH;
82
+ } else if (task.getMode().equals("extended")) {
83
+ mode = JapaneseTokenizer.Mode.EXTENDED;
84
+ }
85
+
86
+ CharArraySet stopSet = null;
87
+ Set<String> stopTags = Sets.newHashSet();
88
+ if (task.getUseStopTag()) {
89
+ stopSet = JapaneseAnalyzer.getDefaultStopSet();
90
+ stopTags = JapaneseAnalyzer.getDefaultStopTags();
91
+ }
92
+ this.japaneseAnalyzer = new JapaneseAnalyzer(userDict, mode, stopSet, stopTags);
93
+ }
94
+
95
+ @Override
96
+ public void finish() {
97
+ builder.finish();
98
+ }
99
+
100
+ @Override
101
+ public void close() {
102
+ builder.close();
103
+ }
104
+
105
+ @Override
106
+ public void add(Page page) {
107
+ reader.setPage(page);
108
+ while (reader.nextRecord()) {
109
+ setValue(builder);
110
+ builder.addRecord();
111
+ }
112
+ }
113
+
114
+ /**
115
+ * @param builder
116
+ */
117
+ private void setValue(PageBuilder builder) {
118
+ if (task.getKeepInput()) {
119
+ for (Column inputColumn : inputSchema.getColumns()) {
120
+ if (reader.isNull(inputColumn)) {
121
+ builder.setNull(inputColumn);
122
+ continue;
123
+ }
124
+ if (Types.STRING.equals(inputColumn.getType())) {
125
+ builder.setString(inputColumn, reader.getString(inputColumn));
126
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
127
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
128
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
129
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
130
+ } else if (Types.LONG.equals(inputColumn.getType())) {
131
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
132
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
133
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
134
+ } else if (Types.JSON.equals(inputColumn.getType())) {
135
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
136
+ }
137
+ }
138
+ }
139
+
140
+ for (Column column : keyNameColumns) {
141
+ final String source = reader.getString(column);
142
+ List<Token> tokens = tokenize(new StringReader(source));
143
+ for (Map<String, String> setting : task.getSettings()) {
144
+ String suffix = setting.get("suffix");
145
+ String method = setting.get("method");
146
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
147
+ List<Value> outputs = Lists.newArrayList();
148
+ for (Token token : tokens) {
149
+ String word = null;
150
+ if ("base_form".equals(method)) {
151
+ word = token.getBaseForm();
152
+ } else if ("reading".equals(method)) {
153
+ word = token.getReading();
154
+ } else if ("surface_form".equals(method)) {
155
+ word = token.getCharTerm();
156
+ }
157
+ if (word != null) {
158
+ outputs.add(ValueFactory.newString(word));
159
+ }
160
+ }
161
+ if (outputColumn.getType().equals(Types.STRING)) {
162
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
163
+ builder.setString(outputColumn, joiner.join(outputs));
164
+ } else if (outputColumn.getType().equals(Types.JSON)) {
165
+ builder.setJson(outputColumn, ValueFactory.newArray(outputs));
166
+ }
167
+ }
168
+ }
169
+ }
170
+
171
+ private boolean isOkPartsOfSpeech(Token token) {
172
+ logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
173
+ if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
174
+ for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
175
+ if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
176
+ return true;
177
+ }
178
+ }
179
+ return false;
180
+ }
181
+
182
+ private List<Token> tokenize(Reader reader) {
183
+ List<Token> list = Lists.newArrayList();
184
+ try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
185
+ BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
186
+ CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
187
+ PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
188
+ ReadingAttribute readAttr = tokenStream.addAttribute(ReadingAttribute.class);
189
+
190
+ tokenStream.reset();
191
+ while (tokenStream.incrementToken()) {
192
+ Token token = new Token();
193
+ token.setCharTerm(charAttr.toString());
194
+ token.setBaseForm(baseAttr.getBaseForm());
195
+ token.setReading(readAttr.getReading());
196
+ token.setPartOfSpeech(posAttr.getPartOfSpeech());
197
+ if (!isOkPartsOfSpeech(token)) { continue; }
198
+ list.add(token);
199
+ }
200
+ } catch (Exception e) {
201
+ logger.error("neologd error", e);
202
+ }
203
+ return list;
204
+ }
205
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ public class Token
4
+ {
5
+ private String charTerm;
6
+ private String baseForm;
7
+ private String partOfSpeech;
8
+ private String reading;
9
+ private String inflection;
10
+
11
+ public String getCharTerm() {
12
+ return charTerm;
13
+ }
14
+ public String getBaseForm() {
15
+ return baseForm;
16
+ }
17
+ public String getPartOfSpeech() {
18
+ return partOfSpeech;
19
+ }
20
+ public void setCharTerm(String charTerm) {
21
+ this.charTerm = charTerm;
22
+ }
23
+ public void setBaseForm(String baseForm) {
24
+ this.baseForm = baseForm;
25
+ }
26
+ public void setPartOfSpeech(String partOfSpeech) {
27
+ this.partOfSpeech = partOfSpeech;
28
+ }
29
+ public void setReading(String reading) {
30
+ this.reading = reading;
31
+ }
32
+ public String getReading() {
33
+ return reading;
34
+ }
35
+ public String getInflection() {
36
+ return inflection;
37
+ }
38
+ public void setInflection(String inflection) {
39
+ this.inflection = inflection;
40
+ }
41
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-21 00:00:00.000000000 Z
11
+ date: 2016-07-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -38,7 +38,7 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: Kuromoji filter plugin for Embulk
41
+ description: Kuromoji filter plugin for Embulk. Neologd support.
42
42
  email:
43
43
  - toyama0919@gmail.com
44
44
  executables: []
@@ -58,10 +58,15 @@ files:
58
58
  - lib/embulk/filter/kuromoji.rb
59
59
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
60
60
  - src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java
61
+ - src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
62
+ - src/main/java/org/embulk/filter/kuromoji/Token.java
61
63
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
62
- - classpath/embulk-filter-kuromoji-0.3.3.jar
64
+ - classpath/embulk-filter-kuromoji-0.4.0.jar
63
65
  - classpath/kuromoji-core-0.9.0.jar
64
66
  - classpath/kuromoji-ipadic-0.9.0.jar
67
+ - classpath/lucene-analyzers-common-5.4.1.jar
68
+ - classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
69
+ - classpath/lucene-core-5.4.1.jar
65
70
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
66
71
  licenses:
67
72
  - MIT
@@ -85,6 +90,6 @@ rubyforge_project:
85
90
  rubygems_version: 2.1.9
86
91
  signing_key:
87
92
  specification_version: 4
88
- summary: Kuromoji filter plugin for Embulk
93
+ summary: Kuromoji filter plugin for Embulk. Neologd support.
89
94
  test_files: []
90
95
  has_rdoc: