ve 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ package ve;
2
+
3
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
4
+ * Released under MIT license (see LICENSE.txt at root of repository).
5
+ *
6
+ * Based on ve/lib/part_of_speech.rb.
7
+ **/
8
/**
 * The part-of-speech categories that Ve can assign to a {@code Word}.
 * Based on ve/lib/part_of_speech.rb; used e.g. as {@code Pos.Noun}.
 */
public enum Pos {
    Noun,
    ProperNoun,
    Pronoun,
    Adjective,
    Adverb,
    Determiner,
    Preposition,
    Postposition,
    Verb,
    Suffix,
    Prefix,
    Conjunction,
    Interjection,
    Number,
    Unknown,
    Symbol,
    Other,
    TBD // placeholder for categories not yet decided
}
@@ -0,0 +1,104 @@
1
+ package ve;
2
+
3
+ import org.atilika.kuromoji.Token;
4
+
5
+ import java.util.ArrayList;
6
+ import java.util.List;
7
+
8
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
9
+ * Released under MIT license (see LICENSE.txt at root of repository).
10
+ *
11
+ * A Word is composed of one or more Tokens, as stored in an internal List.
12
+ * It also has various fields like 'reading' and 'transcription', which may
13
+ * build up as extra Tokens are added to the list.
14
+ * Words are identified and built up by this project's Parse.words() method.
15
+ **/
16
+ public class Word {
17
+ // These five seem underdeveloped and underpopulated:
18
+ private String reading;
19
+ private String transcription;
20
+ private Grammar grammar;
21
+ // private String reading_script;
22
+ // private String transcription_script;
23
+ private String lemma; // "聞く"
24
+ private Pos part_of_speech; // eg. Pos.Noun
25
+ private List<Token> tokens = new ArrayList<>(); // those which were eaten up by this one word: {聞か, せ, られ}
26
+ private String word; // "聞かせられ"
27
+
28
+ /**
29
+ * Incoming variables are named in the style of Sen; fields are named in the style of Ve.
30
+ * @param read - call token.getReading().
31
+ * @param pronunciation - call token.getPronunciation().
32
+ * @param grammar - this is an underdeveloped enum-like variable originating from Ve.
33
+ * @param basic - call token.getBasicString().
34
+ * @param part_of_speech - this is another underdeveloped enum-like variable originating from Ve.
35
+ * @param nodeStr - call token.getNodeStr().
36
+ * @param token - pass in a Token composing part of the Word. Currently expects the Token to come from Sen, but could
37
+ * be simply adapted to come from Kuromoji.
38
+ */
39
+ public Word(String read,
40
+ String pronunciation,
41
+ Grammar grammar,
42
+ // String reading_script,
43
+ // String transcription_script,
44
+ String basic,
45
+ Pos part_of_speech,
46
+ String nodeStr,
47
+ Token token) {
48
+ this.reading = read;
49
+ this.transcription = pronunciation;
50
+ this.grammar = grammar;
51
+ // this.reading_script = reading_script;
52
+ // this.transcription_script = transcription_script;
53
+ this.lemma = basic;
54
+ this.part_of_speech = part_of_speech;
55
+ this.word = nodeStr;
56
+ tokens.add(token);
57
+ }
58
+
59
+ public void setPart_of_speech(Pos part_of_speech) {
60
+ this.part_of_speech = part_of_speech;
61
+ }
62
+
63
+ public String getLemma() {
64
+ return lemma;
65
+ }
66
+
67
+ public Pos getPart_of_speech() {
68
+ return part_of_speech;
69
+ }
70
+
71
+ public List<Token> getTokens() {
72
+ return tokens;
73
+ }
74
+
75
+ public String getWord() {
76
+ return word;
77
+ }
78
+
79
+ public void appendToWord(String suffix) {
80
+ if(word == null) word = "_".concat(suffix); // likely won't experience a null word, actually.
81
+ else word = word.concat(suffix);
82
+ }
83
+
84
+ public void appendToReading(String suffix) {
85
+ if(reading == null) reading = "_".concat(suffix);
86
+ else reading = reading.concat(suffix);
87
+ }
88
+
89
+ public void appendToTranscription(String suffix) {
90
+ if(transcription == null) transcription = "_".concat(suffix);
91
+ else transcription = transcription.concat(suffix);
92
+ }
93
+
94
+ // Not sure when this would change.
95
+ public void appendToLemma(String suffix) {
96
+ if(lemma == null) lemma = "_".concat(suffix);
97
+ else lemma = lemma.concat(suffix);
98
+ }
99
+
100
+ @Override
101
+ public String toString() {
102
+ return word;
103
+ }
104
+ }
@@ -0,0 +1,41 @@
1
+ package ve;
2
+
3
+ import org.atilika.kuromoji.Token;
4
+ import org.atilika.kuromoji.Tokenizer;
5
+ import org.junit.Test;
6
+
7
+ import java.util.List;
8
+
9
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
10
+ * Released under MIT license (see LICENSE.txt at root of repository).
11
+ *
12
+ * This test is purely to show the console output; it is unconditional.
13
+ **/
14
+ public class VeTest {
15
+
16
+ /** More hardcore test sentence at: https://hondou.homedns.org/pukiwiki/index.php?cmd=read&page=Java%20SEN%20%A4%C7%B7%C1%C2%D6%C1%C7%B2%F2%C0%CF
17
+ */
18
+ @Test
19
+ public void coreUsage() {
20
+ String kanji = "お金がなければいけないです。";
21
+ List<Token> tokensList = Tokenizer.builder().build().tokenize(kanji);
22
+ Token[] tokensArray = tokensList.toArray(new Token[tokensList.size()]);
23
+
24
+ Parse parser = new Parse(tokensArray);
25
+ List<Word> words = parser.words();
26
+ System.out.println(words);
27
+
28
+ /* Prints out:
29
+ [お金, が, なければいけない, です, 。]
30
+ */
31
+
32
+ /* Note: I have found that, depending on the MeCab dictionary/model, POS-tagging of tokens may vary.
33
+ ie: for a particular sentence, when tokenizing using net.java.sen:
34
+ なけれ is labelled as a DOUSHI-JITATSU-*-*.
35
+ However, when tokenizing using org.atilika.kuromoji:
36
+ なけれ is labelled as a KEIYOUSHI-JITATSU-*-*.
37
+ So your mileage may vary (very slightly) if comparing to other tokenizer results..!
38
+ Not the Ve algorithm's fault, fortunately.
39
+ */
40
+ }
41
+ }
@@ -2,7 +2,7 @@ class Ve
2
2
  class PartOfSpeech
3
3
 
4
4
  def self.name
5
- self.to_s.split('::').last.downcase
5
+ self.to_s.split('::').last.gsub(/(?<=[A-Za-z])(?=[A-Z])/, ' ').downcase # RegEx adds spaces before uppercase letters. Ex: Ve::PartOfSpeech::ProperNoun.name => "proper noun"
6
6
  end
7
7
 
8
8
  class Noun < PartOfSpeech; end
@@ -10,39 +10,40 @@ class Ve
10
10
  class FreelingEn < Ve::Provider
11
11
  # FIX: This class isn't tested
12
12
  BIT_STOP = 'VeEnd'
13
-
13
+
14
14
  # TODO: Automatically set FREELINGSHARE if it's not set?
15
15
  def initialize(config = {})
16
16
  @config = {:app => 'analyzer',
17
17
  :path => '',
18
18
  :flags => ''}.merge(config)
19
-
19
+
20
20
  @config[:app] = `which #{@config[:app]}`.strip!
21
21
  local = @config[:app] =~ /local/ ? '/local' : ''
22
- @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"
23
-
22
+ share_dir = "/usr#{local}/share"
23
+ @config[:freeling_dir_name] = Dir.exist?("#{share_dir}/FreeLing") ? 'FreeLing' : 'freeling'
24
+ @config[:flags] = "-f #{share_dir}/#{@config[:freeling_dir_name]}/config/en.cfg --flush --nonumb --nodate"
25
+
24
26
  start!
25
27
  end
26
-
28
+
27
29
  # Interface methods
28
-
30
+
29
31
  def works?
30
32
  p = parse('Wrote')
31
- ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
33
+ "Wrote write VBD 1" == p.tokens.collect { |t| t[:raw] }[0]
32
34
  end
33
-
35
+
34
36
  # Talks to the app and returns a parse object
35
37
  def parse(text, options = {})
36
38
  start! if @stdin.nil?
37
39
  # Fix Unicode chars
38
40
  # TODO: These need to be converted back to the original char in the :literal attribute
39
41
  text = text.gsub('’', "'")
40
-
42
+
41
43
  @stdin.puts "#{text}\n#{BIT_STOP}\n"
42
44
  output = []
43
-
45
+
44
46
  while line = @stdout.readline
45
- puts line
46
47
  if line =~ /#{BIT_STOP}/x
47
48
  @stdout.readline
48
49
  break
@@ -56,17 +57,17 @@ class Ve
56
57
  end
57
58
 
58
59
  private
59
-
60
+
60
61
  def start!
61
62
  @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
62
-
63
+
63
64
  # TODO: Also filter out non-iso-latin-1 characters
64
65
  @stdin.set_encoding('UTF-8', 'ISO-8859-1')
65
66
  @stdout.set_encoding('ISO-8859-1', 'UTF-8')
66
67
  rescue Errno::ENOENT
67
68
  # The parser couldn't be started. Probably not installed on this system
68
69
  end
69
-
70
+
70
71
  end
71
72
  end
72
73
  end
@@ -74,14 +75,14 @@ end
74
75
  class Ve
75
76
  class Parse
76
77
  class FreelingEn < Ve::Parse
77
-
78
+
78
79
  attr_reader :tokens, :text
79
-
80
+
80
81
  def initialize(text, output)
81
82
  @tokens = []
82
83
  @text = text
83
84
  position = 0
84
-
85
+
85
86
  output.each_with_index do |line, index|
86
87
  line.rstrip!
87
88
  token = {:raw => line}
@@ -98,7 +99,7 @@ class Ve
98
99
  @tokens << unparsed_token
99
100
  end
100
101
  end
101
-
102
+
102
103
  # Sentence splits are just empty lines in Freeling
103
104
  if line.length == 0
104
105
  token[:type] = :sentence_split
@@ -106,7 +107,7 @@ class Ve
106
107
  @tokens << token
107
108
  next
108
109
  end
109
-
110
+
110
111
  # The parsed token
111
112
  info = line.split(/\s+/)
112
113
  token[:type] = :parsed
@@ -116,7 +117,7 @@ class Ve
116
117
 
117
118
  token[:literal].gsub!('_', ' ')
118
119
  token[:lemma].gsub!('_', ' ')
119
-
120
+
120
121
  # Anything unparsed preceding this token.
121
122
  # We need to do this complicated dance with _ since Freeling replaces spaces with it.
122
123
  # And so we need to be able to find the token with both spaces and _ in it since
@@ -137,7 +138,7 @@ class Ve
137
138
  @tokens << token
138
139
  end
139
140
  end
140
-
141
+
141
142
  INTERNAL_INFO_FOR_PARSED_POS = {
142
143
  'CC' => [Ve::PartOfSpeech::Conjunction, nil],
143
144
  'CD' => [Ve::PartOfSpeech::Number, nil],
@@ -176,10 +177,10 @@ class Ve
176
177
  'WRB' => [Ve::PartOfSpeech::Adverb, nil],
177
178
  'Z' => [Ve::PartOfSpeech::Determiner, nil]
178
179
  }
179
-
180
+
180
181
  def words
181
182
  words = []
182
-
183
+
183
184
  @tokens.find_all { |t| t[:type] == :parsed }.each do |token|
184
185
  if token[:pos] == 'POS'
185
186
  # Possessive ending, add to previous token
@@ -199,14 +200,14 @@ class Ve
199
200
  words << word
200
201
  end
201
202
  end
202
-
203
+
203
204
  words
204
205
  end
205
-
206
+
206
207
  def sentences
207
208
  sentences = []
208
209
  current = ''
209
-
210
+
210
211
  @tokens.each do |token|
211
212
  if token[:type] == :sentence_split
212
213
  sentences << current
@@ -215,14 +216,14 @@ class Ve
215
216
  current << token[:literal]
216
217
  end
217
218
  end
218
-
219
+
219
220
  # In case there is no :sentence_split at the end
220
221
  sentences << current if current.length > 0
221
222
 
222
223
  sentences.collect { |s| s.strip! }
223
224
  sentences
224
225
  end
225
-
226
+
226
227
  end
227
228
  end
228
229
  end
@@ -25,7 +25,7 @@ class Ve
25
25
 
26
26
  H_SYLLABIC_N = 'ん'
27
27
  H_SMALL_TSU = 'っ'
28
-
28
+
29
29
  HIRA_TO_LATN = {
30
30
  "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e", "お"=>"o",
31
31
  "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
@@ -42,17 +42,17 @@ class Ve
42
42
  "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
43
43
  "ら"=>"ra", "り"=>"ri", "る"=>"ru", "れ"=>"re", "ろ"=>"ro",
44
44
  "わ"=>"wa", "うぃ"=>"whi", "うぇ"=>"whe", "を"=>"wo",
45
- "ゑ"=>"wye", "ゐ"=>"wyi", "ー"=>"-", "ん"=>"n",
45
+ "ゑ"=>"we", "ゐ"=>"wi", "ー"=>"-", "ん"=>"n",
46
46
 
47
47
  "きゃ"=>"kya", "きゅ"=>"kyu", "きょ"=>"kyo", "きぇ"=>"kye", "きぃ"=>"kyi",
48
48
  "ぎゃ"=>"gya", "ぎゅ"=>"gyu", "ぎょ"=>"gyo", "ぎぇ"=>"gye", "ぎぃ"=>"gyi",
49
49
  "くぁ"=>"kwa", "くぃ"=>"kwi", "くぅ"=>"kwu", "くぇ"=>"kwe", "くぉ"=>"kwo",
50
50
  "ぐぁ"=>"qwa", "ぐぃ"=>"gwi", "ぐぅ"=>"gwu", "ぐぇ"=>"gwe", "ぐぉ"=>"gwo",
51
51
  "しゃ"=>"sha", "しぃ"=>"syi", "しゅ"=>"shu", "しぇ"=>"she", "しょ"=>"sho",
52
- "じゃ"=>"jya", "じゅ"=>"zyu", "じぇ"=>"zye", "じょ"=>"zyo", "じぃ"=>"zyi",
52
+ "じゃ"=>"ja", "じゅ"=>"ju", "じぇ"=>"jye", "じょ"=>"jo", "じぃ"=>"jyi",
53
53
  "すぁ"=>"swa", "すぃ"=>"swi", "すぅ"=>"swu", "すぇ"=>"swe", "すぉ"=>"swo",
54
- "ちゃ"=>"tya", "ちゅ"=>"tyu", "ちぇ"=>"tye", "ちょ"=>"tyo", "ちぃ"=>"tyi",
55
- "ぢゃ"=>"dya", "ぢぃ"=>"dyi", "ぢゅ"=>"dyu", "ぢぇ"=>"dye", "ぢょ"=>"dyo",
54
+ "ちゃ"=>"cha", "ちゅ"=>"chu", "ちぇ"=>"tye", "ちょ"=>"cho", "ちぃ"=>"tyi",
55
+ "ぢゃ"=>"ja", "ぢぃ"=>"dyi", "ぢゅ"=>"ju", "ぢぇ"=>"dye", "ぢょ"=>"jo",
56
56
  "つぁ"=>"tsa", "つぃ"=>"tsi", "つぇ"=>"tse", "つぉ"=>"tso", "てゃ"=>"tha",
57
57
  "てぃ"=>"thi", "てゅ"=>"thu", "てぇ"=>"the", "てょ"=>"tho", "とぁ"=>"twa",
58
58
  "とぃ"=>"twi", "とぅ"=>"twu", "とぇ"=>"twe", "とぉ"=>"two", "でゃ"=>"dha",
@@ -72,7 +72,7 @@ class Ve
72
72
  "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
73
73
  "ゕ"=>"xka", "ゖ"=>"xke", "ゎ"=>"xwa"
74
74
  }
75
-
75
+
76
76
  LATN_TO_HIRA = {
77
77
  'a' => 'あ', 'i' => 'い', 'u' => 'う', 'e' => 'え', 'o' => 'お',
78
78
  'ka' => 'か', 'ki' => 'き', 'ku' => 'く', 'ke' => 'け', 'ko' => 'こ',
@@ -98,7 +98,7 @@ class Ve
98
98
  'gya' => 'ぎゃ', 'gyu' => 'ぎゅ', 'gyo' => 'ぎょ', 'gye' => 'ぎぇ', 'gyi' => 'ぎぃ',
99
99
  'kwa' => 'くぁ', 'kwi' => 'くぃ', 'kwu' => 'くぅ', 'kwe' => 'くぇ', 'kwo' => 'くぉ',
100
100
  'gwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
101
- 'qwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
101
+ 'qwa' => 'ぐぁ', 'qwi' => 'ぐぃ', 'qwu' => 'ぐぅ', 'qwe' => 'ぐぇ', 'qwo' => 'ぐぉ',
102
102
 
103
103
  'sya' => 'しゃ', 'syi' => 'しぃ', 'syu' => 'しゅ', 'sye' => 'しぇ', 'syo' => 'しょ',
104
104
  'sha' => 'しゃ', 'shu' => 'しゅ', 'she' => 'しぇ', 'sho' => 'しょ',
@@ -196,7 +196,7 @@ class Ve
196
196
 
197
197
  return romaji
198
198
  end
199
-
199
+
200
200
  def transliterate_from_latn_to_hrkt
201
201
  romaji = @text.dup
202
202
  kana = ''
@@ -220,8 +220,8 @@ class Ve
220
220
  elsif LATN_TO_HIRA[for_conversion]
221
221
  # Generic cases
222
222
  mora = LATN_TO_HIRA[for_conversion]
223
- elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwc])\1/))
224
- # tch and double-consonants for small tsu
223
+ elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwchf])\1/))
224
+ # tch and double-consonants for small tsu
225
225
  mora = H_SMALL_TSU
226
226
  for_removal = 1
227
227
  end
@@ -237,7 +237,7 @@ class Ve
237
237
  else
238
238
  kana << mora
239
239
  end
240
-
240
+
241
241
  romaji[0, for_removal] = ''
242
242
  break
243
243
  elsif length == 1
@@ -250,7 +250,7 @@ class Ve
250
250
 
251
251
  return kana
252
252
  end
253
-
253
+
254
254
  def transliterate_from_kana_to_hira
255
255
  transpose_codepoints_in_range(@text, -96, 12449..12534)
256
256
  end
@@ -268,7 +268,7 @@ class Ve
268
268
  res = transpose_codepoints_in_range(@text, 65248, 33..126)
269
269
  transpose_codepoints_in_range(res, 12256, 32..32)
270
270
  end
271
-
271
+
272
272
  private
273
273
 
274
274
  def transpose_codepoints_in_range(text, distance, range)
@@ -284,7 +284,7 @@ class Ve
284
284
 
285
285
  return result
286
286
  end
287
-
287
+
288
288
  end
289
289
  end
290
290
  end