RubyGems - embulk-filter-kuromoji - Versions diffs - 0.4.0 → 0.5.0 - Mend

embulk-filter-kuromoji 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/build.gradle +5 -5
data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java +6 -4
data/src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java +42 -20
data/src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java +46 -24
data/src/main/java/org/embulk/filter/kuromoji/Token.java +20 -10
metadata +6 -7

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d34a0b4db85a5b6954b4bb957398267e385a4503
-  data.tar.gz: 88ff125a93503f4d0961d85270278f2c666fead1
+  metadata.gz: f761d94b92551164712a27d55503e5fb44bf9530
+  data.tar.gz: 71d3c24850363da272c604e1bbffe62821c1e1f9
 SHA512:
-  metadata.gz: ca6c983fc956ba600c5ba89c45ab06826f903c68ef689eabfaeffb29df574d6e7e149a743e197bbfefc8127845397051f39fe4ecebed829ec18ce6633373657a
-  data.tar.gz: b072ee765942a7c72569e435d14322b172cd6acd395e2bc13e99d4103a2a7518039a0c2ee79111475a3880451180f94c47bb9a5103c741b0ed106134197afe39
+  metadata.gz: 95ffe8e33a1b6c0d2be2c7985dcc8474e786dcbc82899a89e5091155f4a2e3b4141a5308bd38807e4644f839a0ee91fae73d5a498f0e16b64a2cab98607534f9
+  data.tar.gz: ac70dbc1e7f830758a5da0ee46df782f3b95af2367184b78f511405621d9273b7e86b907f33987dbfaed4cdcf9ffdd3c62381d16c07401ab8df346c0023c01a5

data/build.gradle CHANGED

@@ -17,18 +17,18 @@ configurations {
     provided
 }
-version = "0.4.0"
+version = "0.5.0"
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 dependencies {
-    compile  "org.embulk:embulk-core:0.8.9"
+    compile  "org.embulk:embulk-core:0.8.15"
     compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
-    compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:5.4.1-20160218"
-    provided "org.embulk:embulk-core:0.8.9"
+    compile "org.codelibs:lucene-analyzers-kuromoji-ipadic-neologd:6.2.1-20161201"
+    provided "org.embulk:embulk-core:0.8.15"
     testCompile "junit:junit:4.+"
-    testCompile  "org.embulk:embulk-core:0.8.9"
+    testCompile  "org.embulk:embulk-core:0.8.15"
 }
 task classpath(type: Copy, dependsOn: ["jar"]) {

data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java CHANGED

@@ -75,7 +75,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
     {
         final String tokenizer = taskSource.loadTask(PluginTask.class).getTokenizer();
         logger.info("Tokenizer => {}", tokenizer);
-        if (tokenizer.equals("neologd")){
+        if (tokenizer.equals("neologd")) {
             return new NeologdPageOutput(taskSource, inputSchema, outputSchema, output);
         }
         return new KuromojiPageOutput(taskSource, inputSchema, outputSchema, output);
@@ -86,7 +86,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
      * @param task
      * @return
      */
-    private Schema buildOutputSchema(PluginTask task, Schema inputSchema) {
+    private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
+    {
         final List<Column> outputColumns = buildOutputColumns(task, inputSchema);
         logger.debug("outputColumns => {}", outputColumns);
         return new Schema(outputColumns);
@@ -97,7 +98,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
      * @param inputSchema
      * @return
      */
-    private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema) {
+    private List<Column> buildOutputColumns(PluginTask task, Schema inputSchema)
+    {
         ImmutableList.Builder<Column> builder = ImmutableList.builder();
         Map<String, Column> map = Maps.newLinkedHashMap();
         int i = 0;
@@ -117,7 +119,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
         }
         i = 0;
-        for(Map.Entry<String, Column> e : map.entrySet()) {
+        for (Map.Entry<String, Column> e : map.entrySet()) {
             final Column column = e.getValue();
             builder.add(new Column(i++, column.getName(), column.getType()));
         }

data/src/main/java/org/embulk/filter/kuromoji/KuromojiPageOutput.java CHANGED

@@ -39,7 +39,8 @@ public class KuromojiPageOutput implements PageOutput
     private final Schema outputSchema;
     private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
-    public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
+    public KuromojiPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
+    {
         this.task = taskSource.loadTask(PluginTask.class);
         this.inputSchema = inputSchema;
         this.outputSchema = outputSchema;
@@ -48,9 +49,11 @@ public class KuromojiPageOutput implements PageOutput
         if (task.getDictionaryPath().isPresent()) {
             try {
                 builder.userDictionary(task.getDictionaryPath().get());
-            } catch (FileNotFoundException e) {
+            }
+            catch (FileNotFoundException e) {
                 e.printStackTrace();
-            } catch (IOException e) {
+            }
+            catch (IOException e) {
                 e.printStackTrace();
             }
         }
@@ -58,9 +61,11 @@ public class KuromojiPageOutput implements PageOutput
         Mode mode = null;
         if (task.getMode().equals("normal")) {
             mode = Mode.NORMAL;
-        } else if (task.getMode().equals("search")) {
+        }
+        else if (task.getMode().equals("search")) {
             mode = Mode.SEARCH;
-        } else if (task.getMode().equals("extended")) {
+        }
+        else if (task.getMode().equals("extended")) {
             mode = Mode.EXTENDED;
         }
@@ -76,17 +81,20 @@ public class KuromojiPageOutput implements PageOutput
     }
     @Override
-    public void finish() {
+    public void finish()
+    {
         builder.finish();
     }
     @Override
-    public void close() {
+    public void close()
+    {
         builder.close();
     }
     @Override
-    public void add(Page page) {
+    public void add(Page page)
+    {
         reader.setPage(page);
         while (reader.nextRecord()) {
             setValue(builder);
@@ -97,7 +105,8 @@ public class KuromojiPageOutput implements PageOutput
     /**
      * @param builder
      */
-    private void setValue(PageBuilder builder) {
+    private void setValue(PageBuilder builder)
+    {
         if (task.getKeepInput()) {
             for (Column inputColumn : inputSchema.getColumns()) {
                 if (reader.isNull(inputColumn)) {
@@ -106,15 +115,20 @@ public class KuromojiPageOutput implements PageOutput
                 }
                 if (Types.STRING.equals(inputColumn.getType())) {
                     builder.setString(inputColumn, reader.getString(inputColumn));
-                } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
+                }
+                else if (Types.BOOLEAN.equals(inputColumn.getType())) {
                     builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
-                } else if (Types.DOUBLE.equals(inputColumn.getType())) {
+                }
+                else if (Types.DOUBLE.equals(inputColumn.getType())) {
                     builder.setDouble(inputColumn, reader.getDouble(inputColumn));
-                } else if (Types.LONG.equals(inputColumn.getType())) {
+                }
+                else if (Types.LONG.equals(inputColumn.getType())) {
                     builder.setLong(inputColumn, reader.getLong(inputColumn));
-                } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
+                }
+                else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
                     builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
-                } else if (Types.JSON.equals(inputColumn.getType())) {
+                }
+                else if (Types.JSON.equals(inputColumn.getType())) {
                     builder.setJson(inputColumn, reader.getJson(inputColumn));
                 }
             }
@@ -131,13 +145,17 @@ public class KuromojiPageOutput implements PageOutput
                 List<Value> outputs = Lists.newArrayList();
                 for (Token token : tokens) {
                     logger.debug("token => {}, {}", token, token.getAllFeatures());
-                    if (!isOkPartsOfSpeech(token)) { continue; }
+                    if (!isOkPartsOfSpeech(token)) {
+                        continue;
+                    }
                     String word = null;
                     if ("base_form".equals(method)) {
                         word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
-                    } else if ("reading".equals(method)) {
+                    }
+                    else if ("reading".equals(method)) {
                         word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
-                    } else if ("surface_form".equals(method)) {
+                    }
+                    else if ("surface_form".equals(method)) {
                         word = token.getSurface();
                     }
                     outputs.add(ValueFactory.newString(word));
@@ -145,15 +163,19 @@ public class KuromojiPageOutput implements PageOutput
                 if (outputColumn.getType().equals(Types.STRING)) {
                     Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
                     builder.setString(outputColumn, joiner.join(outputs));
-                } else if (outputColumn.getType().equals(Types.JSON)) {
+                }
+                else if (outputColumn.getType().equals(Types.JSON)) {
                     builder.setJson(outputColumn, ValueFactory.newArray(outputs));
                 }
             }
         }
     }
-    private boolean isOkPartsOfSpeech(Token token) {
-        if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
+    private boolean isOkPartsOfSpeech(Token token)
+    {
+        if (!task.getOkPartsOfSpeech().isPresent()) {
+            return true;
+        }
         for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
             if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
                 return true;

data/src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java CHANGED

@@ -9,9 +9,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer.Mode;
@@ -39,7 +39,6 @@ import com.google.common.base.MoreObjects;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
 public class NeologdPageOutput implements PageOutput
 {
     private final KuromojiFilterPlugin.PluginTask task;
@@ -51,7 +50,8 @@ public class NeologdPageOutput implements PageOutput
     private final JapaneseAnalyzer japaneseAnalyzer;
     private static final Logger logger = Exec.getLogger(KuromojiFilterPlugin.class);
-    public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
+    public NeologdPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output)
+    {
         this.task = taskSource.loadTask(PluginTask.class);
         this.inputSchema = inputSchema;
         this.outputSchema = outputSchema;
@@ -69,7 +69,8 @@ public class NeologdPageOutput implements PageOutput
                 File file = new File(task.getDictionaryPath().get());
                 Reader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
                 userDict = UserDictionary.open(reader);
-            } catch (Exception e) {
+            }
+            catch (Exception e) {
                 logger.error("neologd error", e);
             }
         }
@@ -77,9 +78,11 @@ public class NeologdPageOutput implements PageOutput
         Mode mode = null;
         if (task.getMode().equals("normal")) {
             mode = JapaneseTokenizer.Mode.NORMAL;
-        } else if (task.getMode().equals("search")) {
+        }
+        else if (task.getMode().equals("search")) {
             mode = JapaneseTokenizer.Mode.SEARCH;
-        } else if (task.getMode().equals("extended")) {
+        }
+        else if (task.getMode().equals("extended")) {
             mode = JapaneseTokenizer.Mode.EXTENDED;
         }
@@ -93,17 +96,20 @@ public class NeologdPageOutput implements PageOutput
     }
     @Override
-    public void finish() {
+    public void finish()
+    {
         builder.finish();
     }
     @Override
-    public void close() {
+    public void close()
+    {
         builder.close();
     }
     @Override
-    public void add(Page page) {
+    public void add(Page page)
+    {
         reader.setPage(page);
         while (reader.nextRecord()) {
             setValue(builder);
@@ -114,7 +120,8 @@ public class NeologdPageOutput implements PageOutput
     /**
      * @param builder
      */
-    private void setValue(PageBuilder builder) {
+    private void setValue(PageBuilder builder)
+    {
         if (task.getKeepInput()) {
             for (Column inputColumn : inputSchema.getColumns()) {
                 if (reader.isNull(inputColumn)) {
@@ -123,15 +130,20 @@ public class NeologdPageOutput implements PageOutput
                 }
                 if (Types.STRING.equals(inputColumn.getType())) {
                     builder.setString(inputColumn, reader.getString(inputColumn));
-                } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
+                }
+                else if (Types.BOOLEAN.equals(inputColumn.getType())) {
                     builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
-                } else if (Types.DOUBLE.equals(inputColumn.getType())) {
+                }
+                else if (Types.DOUBLE.equals(inputColumn.getType())) {
                     builder.setDouble(inputColumn, reader.getDouble(inputColumn));
-                } else if (Types.LONG.equals(inputColumn.getType())) {
+                }
+                else if (Types.LONG.equals(inputColumn.getType())) {
                     builder.setLong(inputColumn, reader.getLong(inputColumn));
-                } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
+                }
+                else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
                     builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
-                } else if (Types.JSON.equals(inputColumn.getType())) {
+                }
+                else if (Types.JSON.equals(inputColumn.getType())) {
                     builder.setJson(inputColumn, reader.getJson(inputColumn));
                 }
             }
@@ -149,9 +161,11 @@ public class NeologdPageOutput implements PageOutput
                     String word = null;
                     if ("base_form".equals(method)) {
                         word = token.getBaseForm();
-                    } else if ("reading".equals(method)) {
+                    }
+                    else if ("reading".equals(method)) {
                         word = token.getReading();
-                    } else if ("surface_form".equals(method)) {
+                    }
+                    else if ("surface_form".equals(method)) {
                         word = token.getCharTerm();
                     }
                     if (word != null) {
@@ -161,16 +175,20 @@ public class NeologdPageOutput implements PageOutput
                 if (outputColumn.getType().equals(Types.STRING)) {
                     Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
                     builder.setString(outputColumn, joiner.join(outputs));
-                } else if (outputColumn.getType().equals(Types.JSON)) {
+                }
+                else if (outputColumn.getType().equals(Types.JSON)) {
                     builder.setJson(outputColumn, ValueFactory.newArray(outputs));
                 }
             }
         }
     }
-    private boolean isOkPartsOfSpeech(Token token) {
+    private boolean isOkPartsOfSpeech(Token token)
+    {
         logger.debug("{} => {}", token.getCharTerm(), token.getPartOfSpeech());
-        if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
+        if (!task.getOkPartsOfSpeech().isPresent()) {
+            return true;
+        }
         for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
             if (token.getPartOfSpeech().startsWith(okPartsOfSpeech)) {
                 return true;
@@ -179,9 +197,10 @@ public class NeologdPageOutput implements PageOutput
         return false;
     }
-    private List<Token> tokenize(Reader reader) {
+    private List<Token> tokenize(Reader reader)
+    {
         List<Token> list = Lists.newArrayList();
-        try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader) ) {
+        try (TokenStream tokenStream = japaneseAnalyzer.tokenStream("", reader)) {
             BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
             CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
             PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
@@ -194,10 +213,13 @@ public class NeologdPageOutput implements PageOutput
                 token.setBaseForm(baseAttr.getBaseForm());
                 token.setReading(readAttr.getReading());
                 token.setPartOfSpeech(posAttr.getPartOfSpeech());
-                if (!isOkPartsOfSpeech(token)) { continue; }
+                if (!isOkPartsOfSpeech(token)) {
+                    continue;
+                }
                 list.add(token);
             }
-        } catch (Exception e) {
+        }
+        catch (Exception e) {
             logger.error("neologd error", e);
         }
         return list;

data/src/main/java/org/embulk/filter/kuromoji/Token.java CHANGED

@@ -8,34 +8,44 @@ public class Token
     private String reading;
     private String inflection;
-    public String getCharTerm() {
+    public String getCharTerm()
+    {
         return charTerm;
     }
-    public String getBaseForm() {
+    public String getBaseForm()
+    {
         return baseForm;
     }
-    public String getPartOfSpeech() {
+    public String getPartOfSpeech()
+    {
         return partOfSpeech;
     }
-    public void setCharTerm(String charTerm) {
+    public void setCharTerm(String charTerm)
+    {
         this.charTerm = charTerm;
     }
-    public void setBaseForm(String baseForm) {
+    public void setBaseForm(String baseForm)
+    {
         this.baseForm = baseForm;
     }
-    public void setPartOfSpeech(String partOfSpeech) {
+    public void setPartOfSpeech(String partOfSpeech)
+    {
         this.partOfSpeech = partOfSpeech;
     }
-    public void setReading(String reading) {
+    public void setReading(String reading)
+    {
         this.reading = reading;
     }
-    public String getReading() {
+    public String getReading()
+    {
         return reading;
     }
-    public String getInflection() {
+    public String getInflection()
+    {
         return inflection;
     }
-    public void setInflection(String inflection) {
+    public void setInflection(String inflection)
+    {
         this.inflection = inflection;
     }
 }

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-filter-kuromoji
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - toyama0919
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-07-12 00:00:00.000000000 Z
+date: 2016-12-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -61,12 +61,12 @@ files:
 - src/main/java/org/embulk/filter/kuromoji/NeologdPageOutput.java
 - src/main/java/org/embulk/filter/kuromoji/Token.java
 - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
-- classpath/embulk-filter-kuromoji-0.4.0.jar
+- classpath/embulk-filter-kuromoji-0.5.0.jar
 - classpath/kuromoji-core-0.9.0.jar
 - classpath/kuromoji-ipadic-0.9.0.jar
-- classpath/lucene-analyzers-common-5.4.1.jar
-- classpath/lucene-analyzers-kuromoji-ipadic-neologd-5.4.1-20160218.jar
-- classpath/lucene-core-5.4.1.jar
+- classpath/lucene-analyzers-common-6.2.1.jar
+- classpath/lucene-analyzers-kuromoji-ipadic-neologd-6.2.1-20161201.jar
+- classpath/lucene-core-6.2.1.jar
 homepage: https://github.com/toyama0919/embulk-filter-kuromoji
 licenses:
 - MIT
@@ -92,4 +92,3 @@ signing_key:
 specification_version: 4
 summary: Kuromoji filter plugin for Embulk. Neologd support.
 test_files: []
-has_rdoc: