embulk-filter-kuromoji 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
4
- data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
3
+ metadata.gz: f761d94b92551164712a27d55503e5fb44bf9530
4
+ data.tar.gz: 71d3c24850363da272c604e1bbffe62821c1e1f9
5
5
  SHA512:
6
- metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
7
- data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
6
+ metadata.gz: 95ffe8e33a1b6c0d2be2c7985dcc8474e786dcbc82899a89e5091155f4a2e3b4141a5308bd38807e4644f839a0ee91fae73d5a498f0e16b64a2cab98607534f9
7
+ data.tar.gz: ac70dbc1e7f830758a5da0ee46df782f3b95af2367184b78f511405621d9273b7e86b907f33987dbfaed4cdcf9ffdd3c62381d16c07401ab8df346c0023c01a5
@@ -17,18 +17,18 @@ configurations {
17
17
  provided
18
18
  }
19
19
 
20
- version = "0.4.0"
20
+ version = "0.5.0"
21
21
 
22
22
  sourceCompatibility = 1.7
23
23
  targetCompatibility = 1.7
24
24
 
25
25
  dependencies {
26
- compile "org.embulk:embulk-core:0.8.9"
26
+ compile "org.embulk:embulk-core:0.8.15"
27
27
  compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
28
- compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
29
- provided "org.embulk:embulk-core:0.8.9"
28
+ compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:6.2.1-20161201"
29
+ provided "org.embulk:embulk-core:0.8.15"
30
30
  testCompile "junit:junit:4.+"
31
- testCompile "org.embulk:embulk-core:0.8.9"
31
+ testCompile "org.embulk:embulk-core:0.8.15"
32
32
  }
33
33
 
34
34
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -75,7 +75,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
75
75
  {
76
76
  final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
77
77
  logger.info("Tokenizer => {}", tokenizer);
78
- if (tokenizer.equals("neologd")){
78
+ if (tokenizer.equals("neologd")) {
79
79
  return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
80
80
  }
81
81
  return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
@@ -86,7 +86,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
86
86
  * @param task
87
87
  * @return
88
88
  */
89
- private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
89
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
90
+ {
90
91
  final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
91
92
  logger.debug("outputColumns => {}", outputColumns);
92
93
  return new Schema(outputColumns);
@@ -97,7 +98,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
97
98
  * @param inputSchema
98
99
  * @return
99
100
  */
100
- private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
101
+ private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema)
102
+ {
101
103
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
102
104
  Map<String, Column> map = Maps.newLinkedHashMap();
103
105
  int i = 0;
@@ -117,7 +119,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
117
119
  }
118
120
 
119
121
  i = 0;
120
- for(Map.Entry<String, Column> e : map.entrySet()) {
122
+ for (Map.Entry<String, Column> e : map.entrySet()) {
121
123
  final Column column = e.getValue();
122
124
  builder.add(new Column(i++, column.getName(), column.getType()));
123
125
  }
@@ -39,7 +39,8 @@ public class KuromojiPageOutput implements PageOutput
39
39
  private final Schema outputSchema;
40
40
  private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
41
41
 
42
- public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
42
+ public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
43
+ {
43
44
  this.task = taskSource.loadTask(PluginTask.class);
44
45
  this.inputSchema = inputSchema;
45
46
  this.outputSchema = outputSchema;
@@ -48,9 +49,11 @@ public class KuromojiPageOutput implements PageOutput
48
49
  if (task.getDictionaryPath().isPresent()) {
49
50
  try {
50
51
  builder.userDictionary(task.getDictionaryPath().get());
51
- } catch (FileNotFoundException e) {
52
+ }
53
+ catch (FileNotFoundException e) {
52
54
  e.printStackTrace();
53
- } catch (IOException e) {
55
+ }
56
+ catch (IOException e) {
54
57
  e.printStackTrace();
55
58
  }
56
59
  }
@@ -58,9 +61,11 @@ public class KuromojiPageOutput implements PageOutput
58
61
  Mode mode = null;
59
62
  if (task.getMode().equals("normal")) {
60
63
  mode = Mode.NORMAL;
61
- } else if (task.getMode().equals("search")) {
64
+ }
65
+ else if (task.getMode().equals("search")) {
62
66
  mode = Mode.SEARCH;
63
- } else if (task.getMode().equals("extended")) {
67
+ }
68
+ else if (task.getMode().equals("extended")) {
64
69
  mode = Mode.EXTENDED;
65
70
  }
66
71
 
@@ -76,17 +81,20 @@ public class KuromojiPageOutput implements PageOutput
76
81
  }
77
82
 
78
83
  @Override
79
- public void finish() {
84
+ public void finish()
85
+ {
80
86
  builder.finish();
81
87
  }
82
88
 
83
89
  @Override
84
- public void close() {
90
+ public void close()
91
+ {
85
92
  builder.close();
86
93
  }
87
94
 
88
95
  @Override
89
- public void add(Page page) {
96
+ public void add(Page page)
97
+ {
90
98
  reader.setPage(page);
91
99
  while (reader.nextRecord()) {
92
100
  setValue(builder);
@@ -97,7 +105,8 @@ public class KuromojiPageOutput implements PageOutput
97
105
  /**
98
106
  * @param builder
99
107
  */
100
- private void setValue(PageBuilder builder) {
108
+ private void setValue(PageBuilder builder)
109
+ {
101
110
  if (task.getKeepInput()) {
102
111
  for (Column inputColumn : inputSchema.getColumns()) {
103
112
  if (reader.isNull(inputColumn)) {
@@ -106,15 +115,20 @@ public class KuromojiPageOutput implements PageOutput
106
115
  }
107
116
  if (Types.STRING.equals(inputColumn.getType())) {
108
117
  builder.setString(inputColumn, reader.getString(inputColumn));
109
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
118
+ }
119
+ else if (Types.BOOLEAN.equals(inputColumn.getType())) {
110
120
  builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
111
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
121
+ }
122
+ else if (Types.DOUBLE.equals(inputColumn.getType())) {
112
123
  builder.setDouble(inputColumn, reader.getDouble(inputColumn));
113
- } else if (Types.LONG.equals(inputColumn.getType())) {
124
+ }
125
+ else if (Types.LONG.equals(inputColumn.getType())) {
114
126
  builder.setLong(inputColumn, reader.getLong(inputColumn));
115
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
127
+ }
128
+ else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
116
129
  builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
117
- } else if (Types.JSON.equals(inputColumn.getType())) {
130
+ }
131
+ else if (Types.JSON.equals(inputColumn.getType())) {
118
132
  builder.setJson(inputColumn, reader.getJson(inputColumn));
119
133
  }
120
134
  }
@@ -131,13 +145,17 @@ public class KuromojiPageOutput implements PageOutput
131
145
  List<Value> outputs = Lists.newArrayList();
132
146
  for (Token token : tokens) {
133
147
  logger.debug("token => {}, {}", token, token.getAllFeatures());
134
- if (!isOkPartsOfSpeech(token)) { continue; }
148
+ if (!isOkPartsOfSpeech(token)) {
149
+ continue;
150
+ }
135
151
  String word = null;
136
152
  if ("base_form".equals(method)) {
137
153
  word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
138
- } else if ("reading".equals(method)) {
154
+ }
155
+ else if ("reading".equals(method)) {
139
156
  word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
140
- } else if ("surface_form".equals(method)) {
157
+ }
158
+ else if ("surface_form".equals(method)) {
141
159
  word = token.getSurface();
142
160
  }
143
161
  outputs.add(ValueFactory.newString(word));
@@ -145,15 +163,19 @@ public class KuromojiPageOutput implements PageOutput
145
163
  if (outputColumn.getType().equals(Types.STRING)) {
146
164
  Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
147
165
  builder.setString(outputColumn, joiner.join(outputs));
148
- } else if (outputColumn.getType().equals(Types.JSON)) {
166
+ }
167
+ else if (outputColumn.getType().equals(Types.JSON)) {
149
168
  builder.setJson(outputColumn, ValueFactory.newArray(outputs));
150
169
  }
151
170
  }
152
171
  }
153
172
  }
154
173
 
155
- private boolean isOkPartsOfSpeech(Token token) {
156
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
174
+ private boolean isOkPartsOfSpeech(Token token)
175
+ {
176
+ if (!task.getOkPartsOfSpeech().isPresent()) {
177
+ return true;
178
+ }
157
179
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
158
180
  if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
159
181
  return true;
@@ -9,9 +9,9 @@ import java.util.List;
9
9
  import java.util.Map;
10
10
  import java.util.Set;
11
11
 
12
+ import org.apache.lucene.analysis.CharArraySet;
12
13
  import org.apache.lucene.analysis.TokenStream;
13
14
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14
- import org.apache.lucene.analysis.util.CharArraySet;
15
15
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
16
16
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
17
17
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
@@ -39,7 +39,6 @@ import com.google.common.base.MoreObjects;
39
39
  import com.google.common.collect.Lists;
40
40
  import com.google.common.collect.Sets;
41
41
 
42
-
43
42
  public class NeologdPageOutput implements PageOutput
44
43
  {
45
44
  private final KuromojiFilterPlugin.PluginTask task;
@@ -51,7 +50,8 @@ public class NeologdPageOutput implements PageOutput
51
50
  private final JapaneseAnalyzer japaneseAnalyzer;
52
51
  private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
53
52
 
54
- public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
53
+ public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
54
+ {
55
55
  this.task = taskSource.loadTask(PluginTask.class);
56
56
  this.inputSchema = inputSchema;
57
57
  this.outputSchema = outputSchema;
@@ -69,7 +69,8 @@ public class NeologdPageOutput implements PageOutput
69
69
  File file = new File(task.getDictionaryPath().get());
70
70
  Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
71
71
  userDict = UserDictionary.open(reader);
72
- } catch (Exception e) {
72
+ }
73
+ catch (Exception e) {
73
74
  logger.error("neologd error", e);
74
75
  }
75
76
  }
@@ -77,9 +78,11 @@ public class NeologdPageOutput implements PageOutput
77
78
  Mode mode = null;
78
79
  if (task.getMode().equals("normal")) {
79
80
  mode = JapaneseTokenizer.Mode.NORMAL;
80
- } else if (task.getMode().equals("search")) {
81
+ }
82
+ else if (task.getMode().equals("search")) {
81
83
  mode = JapaneseTokenizer.Mode.SEARCH;
82
- } else if (task.getMode().equals("extended")) {
84
+ }
85
+ else if (task.getMode().equals("extended")) {
83
86
  mode = JapaneseTokenizer.Mode.EXTENDED;
84
87
  }
85
88
 
@@ -93,17 +96,20 @@ public class NeologdPageOutput implements PageOutput
93
96
  }
94
97
 
95
98
  @Override
96
- public void finish() {
99
+ public void finish()
100
+ {
97
101
  builder.finish();
98
102
  }
99
103
 
100
104
  @Override
101
- public void close() {
105
+ public void close()
106
+ {
102
107
  builder.close();
103
108
  }
104
109
 
105
110
  @Override
106
- public void add(Page page) {
111
+ public void add(Page page)
112
+ {
107
113
  reader.setPage(page);
108
114
  while (reader.nextRecord()) {
109
115
  setValue(builder);
@@ -114,7 +120,8 @@ public class NeologdPageOutput implements PageOutput
114
120
  /**
115
121
  * @param builder
116
122
  */
117
- private void setValue(PageBuilder builder) {
123
+ private void setValue(PageBuilder builder)
124
+ {
118
125
  if (task.getKeepInput()) {
119
126
  for (Column inputColumn : inputSchema.getColumns()) {
120
127
  if (reader.isNull(inputColumn)) {
@@ -123,15 +130,20 @@ public class NeologdPageOutput implements PageOutput
123
130
  }
124
131
  if (Types.STRING.equals(inputColumn.getType())) {
125
132
  builder.setString(inputColumn, reader.getString(inputColumn));
126
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
133
+ }
134
+ else if (Types.BOOLEAN.equals(inputColumn.getType())) {
127
135
  builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
128
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
136
+ }
137
+ else if (Types.DOUBLE.equals(inputColumn.getType())) {
129
138
  builder.setDouble(inputColumn, reader.getDouble(inputColumn));
130
- } else if (Types.LONG.equals(inputColumn.getType())) {
139
+ }
140
+ else if (Types.LONG.equals(inputColumn.getType())) {
131
141
  builder.setLong(inputColumn, reader.getLong(inputColumn));
132
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
142
+ }
143
+ else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
133
144
  builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
134
- } else if (Types.JSON.equals(inputColumn.getType())) {
145
+ }
146
+ else if (Types.JSON.equals(inputColumn.getType())) {
135
147
  builder.setJson(inputColumn, reader.getJson(inputColumn));
136
148
  }
137
149
  }
@@ -149,9 +161,11 @@ public class NeologdPageOutput implements PageOutput
149
161
  String word = null;
150
162
  if ("base_form".equals(method)) {
151
163
  word = token.getBaseForm();
152
- } else if ("reading".equals(method)) {
164
+ }
165
+ else if ("reading".equals(method)) {
153
166
  word = token.getReading();
154
- } else if ("surface_form".equals(method)) {
167
+ }
168
+ else if ("surface_form".equals(method)) {
155
169
  word = token.getCharTerm();
156
170
  }
157
171
  if (word != null) {
@@ -161,16 +175,20 @@ public class NeologdPageOutput implements PageOutput
161
175
  if (outputColumn.getType().equals(Types.STRING)) {
162
176
  Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
163
177
  builder.setString(outputColumn, joiner.join(outputs));
164
- } else if (outputColumn.getType().equals(Types.JSON)) {
178
+ }
179
+ else if (outputColumn.getType().equals(Types.JSON)) {
165
180
  builder.setJson(outputColumn, ValueFactory.newArray(outputs));
166
181
  }
167
182
  }
168
183
  }
169
184
  }
170
185
 
171
- private boolean isOkPartsOfSpeech(Token token) {
186
+ private boolean isOkPartsOfSpeech(Token token)
187
+ {
172
188
  logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
173
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
189
+ if (!task.getOkPartsOfSpeech().isPresent()) {
190
+ return true;
191
+ }
174
192
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
175
193
  if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
176
194
  return true;
@@ -179,9 +197,10 @@ public class NeologdPageOutput implements PageOutput
179
197
  return false;
180
198
  }
181
199
 
182
- private List<Token> tokenize(Reader reader) {
200
+ private List<Token> tokenize(Reader reader)
201
+ {
183
202
  List<Token> list = Lists.newArrayList();
184
- try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
203
+ try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader)) {
185
204
  BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
186
205
  CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
187
206
  PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
@@ -194,10 +213,13 @@ public class NeologdPageOutput implements PageOutput
194
213
  token.setBaseForm(baseAttr.getBaseForm());
195
214
  token.setReading(readAttr.getReading());
196
215
  token.setPartOfSpeech(posAttr.getPartOfSpeech());
197
- if (!isOkPartsOfSpeech(token)) { continue; }
216
+ if (!isOkPartsOfSpeech(token)) {
217
+ continue;
218
+ }
198
219
  list.add(token);
199
220
  }
200
- } catch (Exception e) {
221
+ }
222
+ catch (Exception e) {
201
223
  logger.error("neologd error", e);
202
224
  }
203
225
  return list;
@@ -8,34 +8,44 @@ public class Token
8
8
  private String reading;
9
9
  private String inflection;
10
10
 
11
- public String getCharTerm() {
11
+ public String getCharTerm()
12
+ {
12
13
  return charTerm;
13
14
  }
14
- public String getBaseForm() {
15
+ public String getBaseForm()
16
+ {
15
17
  return baseForm;
16
18
  }
17
- public String getPartOfSpeech() {
19
+ public String getPartOfSpeech()
20
+ {
18
21
  return partOfSpeech;
19
22
  }
20
- public void setCharTerm(String charTerm) {
23
+ public void setCharTerm(String charTerm)
24
+ {
21
25
  this.charTerm = charTerm;
22
26
  }
23
- public void setBaseForm(String baseForm) {
27
+ public void setBaseForm(String baseForm)
28
+ {
24
29
  this.baseForm = baseForm;
25
30
  }
26
- public void setPartOfSpeech(String partOfSpeech) {
31
+ public void setPartOfSpeech(String partOfSpeech)
32
+ {
27
33
  this.partOfSpeech = partOfSpeech;
28
34
  }
29
- public void setReading(String reading) {
35
+ public void setReading(String reading)
36
+ {
30
37
  this.reading = reading;
31
38
  }
32
- public String getReading() {
39
+ public String getReading()
40
+ {
33
41
  return reading;
34
42
  }
35
- public String getInflection() {
43
+ public String getInflection()
44
+ {
36
45
  return inflection;
37
46
  }
38
- public void setInflection(String inflection) {
47
+ public void setInflection(String inflection)
48
+ {
39
49
  this.inflection = inflection;
40
50
  }
41
51
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-12 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -61,12 +61,12 @@ files:
61
61
  - src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
62
62
  - src/main/java/org/embulk/filter/kuromoji/Token.java
63
63
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
64
- - classpath/embulk-filter-kuromoji-0.4.0.jar
64
+ - classpath/embulk-filter-kuromoji-0.5.0.jar
65
65
  - classpath/kuromoji-core-0.9.0.jar
66
66
  - classpath/kuromoji-ipadic-0.9.0.jar
67
- - classpath/lucene-analyzers-common-5.4.1.jar
68
- - classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
69
- - classpath/lucene-core-5.4.1.jar
67
+ - classpath/lucene-analyzers-common-6.2.1.jar
68
+ - classpath/lucene-analyzers-kuromoji-ipadic-neologd-6.2.1-20161201.jar
69
+ - classpath/lucene-core-6.2.1.jar
70
70
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
71
71
  licenses:
72
72
  - MIT
@@ -92,4 +92,3 @@ signing_key:
92
92
  specification_version: 4
93
93
  summary: Kuromoji filter plugin for Embulk. Neologd support.
94
94
  test_files: []
95
- has_rdoc: