embulk-filter-kuromoji 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
4
- data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
3
+ metadata.gz: f761d94b92551164712a27d55503e5fb44bf9530
4
+ data.tar.gz: 71d3c24850363da272c604e1bbffe62821c1e1f9
5
5
  SHA512:
6
- metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
7
- data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
6
+ metadata.gz: 95ffe8e33a1b6c0d2be2c7985dcc8474e786dcbc82899a89e5091155f4a2e3b4141a5308bd38807e4644f839a0ee91fae73d5a498f0e16b64a2cab98607534f9
7
+ data.tar.gz: ac70dbc1e7f830758a5da0ee46df782f3b95af2367184b78f511405621d9273b7e86b907f33987dbfaed4cdcf9ffdd3c62381d16c07401ab8df346c0023c01a5
@@ -17,18 +17,18 @@ configurations {
17
17
  provided
18
18
  }
19
19
 
20
- version = "0.4.0"
20
+ version = "0.5.0"
21
21
 
22
22
  sourceCompatibility = 1.7
23
23
  targetCompatibility = 1.7
24
24
 
25
25
  dependencies {
26
- compile "org.embulk:embulk-core:0.8.9"
26
+ compile "org.embulk:embulk-core:0.8.15"
27
27
  compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
28
- compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
29
- provided "org.embulk:embulk-core:0.8.9"
28
+ compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:6.2.1-20161201"
29
+ provided "org.embulk:embulk-core:0.8.15"
30
30
  testCompile "junit:junit:4.+"
31
- testCompile "org.embulk:embulk-core:0.8.9"
31
+ testCompile "org.embulk:embulk-core:0.8.15"
32
32
  }
33
33
 
34
34
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -75,7 +75,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
75
75
  {
76
76
  final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
77
77
  logger.info("Tokenizer => {}", tokenizer);
78
- if (tokenizer.equals("neologd")){
78
+ if (tokenizer.equals("neologd")) {
79
79
  return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
80
80
  }
81
81
  return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
@@ -86,7 +86,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
86
86
  * @param task
87
87
  * @return
88
88
  */
89
- private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
89
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
90
+ {
90
91
  final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
91
92
  logger.debug("outputColumns => {}", outputColumns);
92
93
  return new Schema(outputColumns);
@@ -97,7 +98,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
97
98
  * @param inputSchema
98
99
  * @return
99
100
  */
100
- private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
101
+ private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema)
102
+ {
101
103
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
102
104
  Map<String, Column> map = Maps.newLinkedHashMap();
103
105
  int i = 0;
@@ -117,7 +119,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
117
119
  }
118
120
 
119
121
  i = 0;
120
- for(Map.Entry<String, Column> e : map.entrySet()) {
122
+ for (Map.Entry<String, Column> e : map.entrySet()) {
121
123
  final Column column = e.getValue();
122
124
  builder.add(new Column(i++, column.getName(), column.getType()));
123
125
  }
@@ -39,7 +39,8 @@ public class KuromojiPageOutput implements PageOutput
39
39
  private final Schema outputSchema;
40
40
  private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
41
41
 
42
- public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
42
+ public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
43
+ {
43
44
  this.task = taskSource.loadTask(PluginTask.class);
44
45
  this.inputSchema = inputSchema;
45
46
  this.outputSchema = outputSchema;
@@ -48,9 +49,11 @@ public class KuromojiPageOutput implements PageOutput
48
49
  if (task.getDictionaryPath().isPresent()) {
49
50
  try {
50
51
  builder.userDictionary(task.getDictionaryPath().get());
51
- } catch (FileNotFoundException e) {
52
+ }
53
+ catch (FileNotFoundException e) {
52
54
  e.printStackTrace();
53
- } catch (IOException e) {
55
+ }
56
+ catch (IOException e) {
54
57
  e.printStackTrace();
55
58
  }
56
59
  }
@@ -58,9 +61,11 @@ public class KuromojiPageOutput implements PageOutput
58
61
  Mode mode = null;
59
62
  if (task.getMode().equals("normal")) {
60
63
  mode = Mode.NORMAL;
61
- } else if (task.getMode().equals("search")) {
64
+ }
65
+ else if (task.getMode().equals("search")) {
62
66
  mode = Mode.SEARCH;
63
- } else if (task.getMode().equals("extended")) {
67
+ }
68
+ else if (task.getMode().equals("extended")) {
64
69
  mode = Mode.EXTENDED;
65
70
  }
66
71
 
@@ -76,17 +81,20 @@ public class KuromojiPageOutput implements PageOutput
76
81
  }
77
82
 
78
83
  @Override
79
- public void finish() {
84
+ public void finish()
85
+ {
80
86
  builder.finish();
81
87
  }
82
88
 
83
89
  @Override
84
- public void close() {
90
+ public void close()
91
+ {
85
92
  builder.close();
86
93
  }
87
94
 
88
95
  @Override
89
- public void add(Page page) {
96
+ public void add(Page page)
97
+ {
90
98
  reader.setPage(page);
91
99
  while (reader.nextRecord()) {
92
100
  setValue(builder);
@@ -97,7 +105,8 @@ public class KuromojiPageOutput implements PageOutput
97
105
  /**
98
106
  * @param builder
99
107
  */
100
- private void setValue(PageBuilder builder) {
108
+ private void setValue(PageBuilder builder)
109
+ {
101
110
  if (task.getKeepInput()) {
102
111
  for (Column inputColumn : inputSchema.getColumns()) {
103
112
  if (reader.isNull(inputColumn)) {
@@ -106,15 +115,20 @@ public class KuromojiPageOutput implements PageOutput
106
115
  }
107
116
  if (Types.STRING.equals(inputColumn.getType())) {
108
117
  builder.setString(inputColumn, reader.getString(inputColumn));
109
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
118
+ }
119
+ else if (Types.BOOLEAN.equals(inputColumn.getType())) {
110
120
  builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
111
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
121
+ }
122
+ else if (Types.DOUBLE.equals(inputColumn.getType())) {
112
123
  builder.setDouble(inputColumn, reader.getDouble(inputColumn));
113
- } else if (Types.LONG.equals(inputColumn.getType())) {
124
+ }
125
+ else if (Types.LONG.equals(inputColumn.getType())) {
114
126
  builder.setLong(inputColumn, reader.getLong(inputColumn));
115
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
127
+ }
128
+ else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
116
129
  builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
117
- } else if (Types.JSON.equals(inputColumn.getType())) {
130
+ }
131
+ else if (Types.JSON.equals(inputColumn.getType())) {
118
132
  builder.setJson(inputColumn, reader.getJson(inputColumn));
119
133
  }
120
134
  }
@@ -131,13 +145,17 @@ public class KuromojiPageOutput implements PageOutput
131
145
  List<Value> outputs = Lists.newArrayList();
132
146
  for (Token token : tokens) {
133
147
  logger.debug("token => {}, {}", token, token.getAllFeatures());
134
- if (!isOkPartsOfSpeech(token)) { continue; }
148
+ if (!isOkPartsOfSpeech(token)) {
149
+ continue;
150
+ }
135
151
  String word = null;
136
152
  if ("base_form".equals(method)) {
137
153
  word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
138
- } else if ("reading".equals(method)) {
154
+ }
155
+ else if ("reading".equals(method)) {
139
156
  word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
140
- } else if ("surface_form".equals(method)) {
157
+ }
158
+ else if ("surface_form".equals(method)) {
141
159
  word = token.getSurface();
142
160
  }
143
161
  outputs.add(ValueFactory.newString(word));
@@ -145,15 +163,19 @@ public class KuromojiPageOutput implements PageOutput
145
163
  if (outputColumn.getType().equals(Types.STRING)) {
146
164
  Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
147
165
  builder.setString(outputColumn, joiner.join(outputs));
148
- } else if (outputColumn.getType().equals(Types.JSON)) {
166
+ }
167
+ else if (outputColumn.getType().equals(Types.JSON)) {
149
168
  builder.setJson(outputColumn, ValueFactory.newArray(outputs));
150
169
  }
151
170
  }
152
171
  }
153
172
  }
154
173
 
155
- private boolean isOkPartsOfSpeech(Token token) {
156
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
174
+ private boolean isOkPartsOfSpeech(Token token)
175
+ {
176
+ if (!task.getOkPartsOfSpeech().isPresent()) {
177
+ return true;
178
+ }
157
179
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
158
180
  if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
159
181
  return true;
@@ -9,9 +9,9 @@ import java.util.List;
9
9
  import java.util.Map;
10
10
  import java.util.Set;
11
11
 
12
+ import org.apache.lucene.analysis.CharArraySet;
12
13
  import org.apache.lucene.analysis.TokenStream;
13
14
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14
- import org.apache.lucene.analysis.util.CharArraySet;
15
15
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
16
16
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
17
17
  import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
@@ -39,7 +39,6 @@ import com.google.common.base.MoreObjects;
39
39
  import com.google.common.collect.Lists;
40
40
  import com.google.common.collect.Sets;
41
41
 
42
-
43
42
  public class NeologdPageOutput implements PageOutput
44
43
  {
45
44
  private final KuromojiFilterPlugin.PluginTask task;
@@ -51,7 +50,8 @@ public class NeologdPageOutput implements PageOutput
51
50
  private final JapaneseAnalyzer japaneseAnalyzer;
52
51
  private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
53
52
 
54
- public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
53
+ public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
54
+ {
55
55
  this.task = taskSource.loadTask(PluginTask.class);
56
56
  this.inputSchema = inputSchema;
57
57
  this.outputSchema = outputSchema;
@@ -69,7 +69,8 @@ public class NeologdPageOutput implements PageOutput
69
69
  File file = new File(task.getDictionaryPath().get());
70
70
  Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
71
71
  userDict = UserDictionary.open(reader);
72
- } catch (Exception e) {
72
+ }
73
+ catch (Exception e) {
73
74
  logger.error("neologd error", e);
74
75
  }
75
76
  }
@@ -77,9 +78,11 @@ public class NeologdPageOutput implements PageOutput
77
78
  Mode mode = null;
78
79
  if (task.getMode().equals("normal")) {
79
80
  mode = JapaneseTokenizer.Mode.NORMAL;
80
- } else if (task.getMode().equals("search")) {
81
+ }
82
+ else if (task.getMode().equals("search")) {
81
83
  mode = JapaneseTokenizer.Mode.SEARCH;
82
- } else if (task.getMode().equals("extended")) {
84
+ }
85
+ else if (task.getMode().equals("extended")) {
83
86
  mode = JapaneseTokenizer.Mode.EXTENDED;
84
87
  }
85
88
 
@@ -93,17 +96,20 @@ public class NeologdPageOutput implements PageOutput
93
96
  }
94
97
 
95
98
  @Override
96
- public void finish() {
99
+ public void finish()
100
+ {
97
101
  builder.finish();
98
102
  }
99
103
 
100
104
  @Override
101
- public void close() {
105
+ public void close()
106
+ {
102
107
  builder.close();
103
108
  }
104
109
 
105
110
  @Override
106
- public void add(Page page) {
111
+ public void add(Page page)
112
+ {
107
113
  reader.setPage(page);
108
114
  while (reader.nextRecord()) {
109
115
  setValue(builder);
@@ -114,7 +120,8 @@ public class NeologdPageOutput implements PageOutput
114
120
  /**
115
121
  * @param builder
116
122
  */
117
- private void setValue(PageBuilder builder) {
123
+ private void setValue(PageBuilder builder)
124
+ {
118
125
  if (task.getKeepInput()) {
119
126
  for (Column inputColumn : inputSchema.getColumns()) {
120
127
  if (reader.isNull(inputColumn)) {
@@ -123,15 +130,20 @@ public class NeologdPageOutput implements PageOutput
123
130
  }
124
131
  if (Types.STRING.equals(inputColumn.getType())) {
125
132
  builder.setString(inputColumn, reader.getString(inputColumn));
126
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
133
+ }
134
+ else if (Types.BOOLEAN.equals(inputColumn.getType())) {
127
135
  builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
128
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
136
+ }
137
+ else if (Types.DOUBLE.equals(inputColumn.getType())) {
129
138
  builder.setDouble(inputColumn, reader.getDouble(inputColumn));
130
- } else if (Types.LONG.equals(inputColumn.getType())) {
139
+ }
140
+ else if (Types.LONG.equals(inputColumn.getType())) {
131
141
  builder.setLong(inputColumn, reader.getLong(inputColumn));
132
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
142
+ }
143
+ else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
133
144
  builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
134
- } else if (Types.JSON.equals(inputColumn.getType())) {
145
+ }
146
+ else if (Types.JSON.equals(inputColumn.getType())) {
135
147
  builder.setJson(inputColumn, reader.getJson(inputColumn));
136
148
  }
137
149
  }
@@ -149,9 +161,11 @@ public class NeologdPageOutput implements PageOutput
149
161
  String word = null;
150
162
  if ("base_form".equals(method)) {
151
163
  word = token.getBaseForm();
152
- } else if ("reading".equals(method)) {
164
+ }
165
+ else if ("reading".equals(method)) {
153
166
  word = token.getReading();
154
- } else if ("surface_form".equals(method)) {
167
+ }
168
+ else if ("surface_form".equals(method)) {
155
169
  word = token.getCharTerm();
156
170
  }
157
171
  if (word != null) {
@@ -161,16 +175,20 @@ public class NeologdPageOutput implements PageOutput
161
175
  if (outputColumn.getType().equals(Types.STRING)) {
162
176
  Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
163
177
  builder.setString(outputColumn, joiner.join(outputs));
164
- } else if (outputColumn.getType().equals(Types.JSON)) {
178
+ }
179
+ else if (outputColumn.getType().equals(Types.JSON)) {
165
180
  builder.setJson(outputColumn, ValueFactory.newArray(outputs));
166
181
  }
167
182
  }
168
183
  }
169
184
  }
170
185
 
171
- private boolean isOkPartsOfSpeech(Token token) {
186
+ private boolean isOkPartsOfSpeech(Token token)
187
+ {
172
188
  logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
173
- if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
189
+ if (!task.getOkPartsOfSpeech().isPresent()) {
190
+ return true;
191
+ }
174
192
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
175
193
  if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
176
194
  return true;
@@ -179,9 +197,10 @@ public class NeologdPageOutput implements PageOutput
179
197
  return false;
180
198
  }
181
199
 
182
- private List<Token> tokenize(Reader reader) {
200
+ private List<Token> tokenize(Reader reader)
201
+ {
183
202
  List<Token> list = Lists.newArrayList();
184
- try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
203
+ try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader)) {
185
204
  BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
186
205
  CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
187
206
  PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
@@ -194,10 +213,13 @@ public class NeologdPageOutput implements PageOutput
194
213
  token.setBaseForm(baseAttr.getBaseForm());
195
214
  token.setReading(readAttr.getReading());
196
215
  token.setPartOfSpeech(posAttr.getPartOfSpeech());
197
- if (!isOkPartsOfSpeech(token)) { continue; }
216
+ if (!isOkPartsOfSpeech(token)) {
217
+ continue;
218
+ }
198
219
  list.add(token);
199
220
  }
200
- } catch (Exception e) {
221
+ }
222
+ catch (Exception e) {
201
223
  logger.error("neologd error", e);
202
224
  }
203
225
  return list;
@@ -8,34 +8,44 @@ public class Token
8
8
  private String reading;
9
9
  private String inflection;
10
10
 
11
- public String getCharTerm() {
11
+ public String getCharTerm()
12
+ {
12
13
  return charTerm;
13
14
  }
14
- public String getBaseForm() {
15
+ public String getBaseForm()
16
+ {
15
17
  return baseForm;
16
18
  }
17
- public String getPartOfSpeech() {
19
+ public String getPartOfSpeech()
20
+ {
18
21
  return partOfSpeech;
19
22
  }
20
- public void setCharTerm(String charTerm) {
23
+ public void setCharTerm(String charTerm)
24
+ {
21
25
  this.charTerm = charTerm;
22
26
  }
23
- public void setBaseForm(String baseForm) {
27
+ public void setBaseForm(String baseForm)
28
+ {
24
29
  this.baseForm = baseForm;
25
30
  }
26
- public void setPartOfSpeech(String partOfSpeech) {
31
+ public void setPartOfSpeech(String partOfSpeech)
32
+ {
27
33
  this.partOfSpeech = partOfSpeech;
28
34
  }
29
- public void setReading(String reading) {
35
+ public void setReading(String reading)
36
+ {
30
37
  this.reading = reading;
31
38
  }
32
- public String getReading() {
39
+ public String getReading()
40
+ {
33
41
  return reading;
34
42
  }
35
- public String getInflection() {
43
+ public String getInflection()
44
+ {
36
45
  return inflection;
37
46
  }
38
- public void setInflection(String inflection) {
47
+ public void setInflection(String inflection)
48
+ {
39
49
  this.inflection = inflection;
40
50
  }
41
51
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-12 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -61,12 +61,12 @@ files:
61
61
  - src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
62
62
  - src/main/java/org/embulk/filter/kuromoji/Token.java
63
63
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
64
- - classpath/embulk-filter-kuromoji-0.4.0.jar
64
+ - classpath/embulk-filter-kuromoji-0.5.0.jar
65
65
  - classpath/kuromoji-core-0.9.0.jar
66
66
  - classpath/kuromoji-ipadic-0.9.0.jar
67
- - classpath/lucene-analyzers-common-5.4.1.jar
68
- - classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
69
- - classpath/lucene-core-5.4.1.jar
67
+ - classpath/lucene-analyzers-common-6.2.1.jar
68
+ - classpath/lucene-analyzers-kuromoji-ipadic-neologd-6.2.1-20161201.jar
69
+ - classpath/lucene-core-6.2.1.jar
70
70
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
71
71
  licenses:
72
72
  - MIT
@@ -92,4 +92,3 @@ signing_key:
92
92
  specification_version: 4
93
93
  summary: Kuromoji filter plugin for Embulk. Neologd support.
94
94
  test_files: []
95
- has_rdoc: