RubyGems - ve - Versions diffs - 0.0.2 → 0.0.3 - Mend

ve 0.0.2 → 0.0.3

Files changed (18)

checksums.yaml +7 -0
data/.gitignore +1 -0
data/.travis.yml +9 -0
data/Gemfile +5 -0
data/Gemfile.lock +6 -0
data/lib/providers/freeling_en.rb +4 -2
data/lib/providers/mecab_ipadic.rb +52 -32
data/lib/ve.rb +15 -4
data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} +37 -48
data/tests/freeling_en_provider_test.rb +38 -0
data/tests/japanese_transliterators_test.rb +1 -1
data/tests/mecab_ipadic_parse_test.rb +772 -0
data/tests/mecab_ipadic_provider_test.rb +21 -0
data/tests/test_helper.rb +5 -4
data/tests/ve_test.rb +5 -1
data/ve.gemspec +1 -1
metadata +27 -35
data/tests/mecab_ipadic_test.rb +0 -452

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
+  data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
+SHA512:
+  metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
+  data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4

data/.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 .DS_Store
 .*.swp
 *.gem
+.rvmrc

data/.travis.yml ADDED Viewed

@@ -0,0 +1,9 @@
+language: ruby
+rvm:
+  - 1.9.2
+  - 1.9.3
+  - jruby-19mode
+  - rbx-19mode
+  - ruby-head
+  - jruby-head
+  - ree

data/Gemfile CHANGED Viewed

@@ -6,3 +6,8 @@ group :server do
   gem "sinatra"
   gem "rack-cors"
 end
+group :test do
+  gem "rake"
+  gem "mocha", :require => false
+end

data/Gemfile.lock CHANGED Viewed

@@ -2,11 +2,15 @@ GEM
   remote: http://rubygems.org/
   specs:
     json (1.6.1)
+    metaclass (0.0.1)
+    mocha (0.11.4)
+      metaclass (~> 0.0.1)
     rack (1.3.5)
     rack-cors (0.2.4)
       rack
     rack-protection (1.1.4)
       rack
+    rake (0.8.7)
     sinatra (1.3.1)
       rack (~> 1.3, >= 1.3.4)
       rack-protection (~> 1.1, >= 1.1.2)
@@ -18,5 +22,7 @@ PLATFORMS
 DEPENDENCIES
   json
+  mocha
   rack-cors
+  rake
   sinatra

data/lib/providers/freeling_en.rb CHANGED Viewed

@@ -8,7 +8,7 @@ require 'open3'
 class Ve
   class Provider
     class FreelingEn < Ve::Provider
+      # FIX: This class isn't tested
       BIT_STOP = 'VeEnd'
       # TODO: Automatically set FREELINGSHARE if it's not set?
@@ -27,7 +27,8 @@ class Ve
       # Interface methods
       def works?
-        (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
+        p = parse('Wrote')
+        ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
       end
       # Talks to the app and returns a parse object
@@ -41,6 +42,7 @@ class Ve
         output = []
         while line = @stdout.readline
+          puts line
           if line =~ /#{BIT_STOP}/x
             @stdout.readline
             break

data/lib/providers/mecab_ipadic.rb CHANGED Viewed

@@ -7,31 +7,31 @@ class Ve
     class MecabIpadic < Ve::Provider
       BIT_STOP = 'VeEnd'
       def initialize(config = {})
         # TODO: Make config handling better
         @config = {:app => 'mecab',
                    :path => '',
                    :flags => ''}.merge(config)
-        @config[:app] = `which #{@config[:app]}`
+        @config[:app] = `which #{@config[:app]}`.chomp
         start!
       end
       def works?
         (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
           "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
           "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
       end
       # Talks to the app and returns a parse object
       def parse(text, options = {})
         start! if @stdin.nil? # Restart if the provider crashed
         @stdin.puts "#{text} #{BIT_STOP}"
         output = []
         while line = @stdout.readline.force_encoding('UTF-8')
           if line =~ /#{BIT_STOP}/x
             output << @stdout.readline # Catch the EOS
@@ -39,25 +39,25 @@ class Ve
           end
           output << line
         end
         Ve::Parse::MecabIpadic.new(text, output)
-      rescue
+      rescue => e
         # TODO: No good to catch all errors like this
         # I need a backtrace when something unexpected fails
         Ve::Parse::MecabIpadic.new(text, [])
       end
       private
       # TODO: Use Process.spawn/kill for process control?
       def start!
-        @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
+        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
         @stdin.set_encoding('UTF-8')
         @stdout.set_encoding('UTF-8')
-      rescue Errno::ENOENT
+      rescue Errno::ENOENT => e
         # The parser couldn't be started. Probably not installed on this system
       end
     end
   end
 end
@@ -65,15 +65,15 @@ end
 class Ve
   class Parse
     class MecabIpadic < Ve::Parse
       PARSER = %r{^ (.+?) \t (.+) }x
       attr_reader :tokens, :text
       def initialize(text, output)
         @tokens = []
         @text = text
         position = 0
         output.each_with_index do |line, index|
           line.rstrip!
           token = {:raw => line}
@@ -87,7 +87,7 @@ class Ve
               @tokens << unparsed_token
             end
           end
           if line =~ %r{^ EOS $}x
             token[:type] = :sentence_split
             token[:literal] = ''
@@ -99,7 +99,7 @@ class Ve
             [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
               token[attr] = info[i]
             end
             # Anything unparsed preceding this token
             unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
             if unparsed_md[1].length > 0
@@ -108,7 +108,7 @@ class Ve
               @tokens << unparsed_token
               position += unparsed_token[:literal].length
             end
             token[:characters] = (position..(position+token[:literal].length-1))
             position += token[:literal].length
           else
@@ -118,7 +118,7 @@ class Ve
           @tokens << token
         end
       end
       # PoS
       MEISHI = '名詞'
       KOYUUMEISHI = '固有名詞'
@@ -159,6 +159,11 @@ class Ve
       TOKUSHU_DESU = '特殊・デス'
       TOKUSHU_DA = '特殊・ダ'
       TOKUSHU_MASU = '特殊・マス'
+      TOKUSHU_NU = '特殊・ヌ'
+      FUHENKAGATA = '不変化型'
+      JINMEI = '人名'
+      MEIREI_I = '命令ｉ'
+      KAKARIJOSHI = '係助詞'
       # Etc
       NA = 'な'
@@ -166,11 +171,14 @@ class Ve
       TE = 'て'
       DE = 'で'
       BA = 'ば'
+      NN = 'ん'
+      SA = 'さ'
       def words
         words = []
         tokens = @tokens.find_all { |t| t[:type] == :parsed }
         tokens = tokens.to_enum
+        previous = nil
         # This is becoming very big
         begin
@@ -181,6 +189,7 @@ class Ve
             eat_lemma = true
             attach_to_previous = false
             also_attach_to_lemma = false
+            update_pos = false
             case token[:pos]
             when MEISHI
@@ -208,7 +217,7 @@ class Ve
                     eat_next = true
                   elsif following[:pos] == JOSHI && following[:literal] == NI
                     pos = Ve::PartOfSpeech::Adverb
-                    eat_next = true
+                    eat_next = false
                   end
                 end
               when HIJIRITSU, TOKUSHU
@@ -246,8 +255,13 @@ class Ve
                   also_attach_to_lemma = true
                 end
               when SETSUBI
-                # TODO: elaborate a bit?
-                pos = Ve::PartOfSpeech::Suffix
+                if token[:pos3] == TOKUSHU && token[:lemma] == SA
+                  attach_to_previous = true
+                  update_pos = true
+                  pos = Ve::PartOfSpeech::Noun
+                else
+                  pos = Ve::PartOfSpeech::Suffix
+                end
               when SETSUZOKUSHITEKI
                 pos = Ve::PartOfSpeech::Conjunction
               when DOUSHIHIJIRITSUTEKI
@@ -260,7 +274,10 @@ class Ve
             when JODOUSHI
               pos = Ve::PartOfSpeech::Postposition
-              if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
+              if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
+                 [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
+                attach_to_previous = true
+              elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
                 attach_to_previous = true
               elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
                 pos = Ve::PartOfSpeech::Verb
@@ -269,8 +286,8 @@ class Ve
               pos = Ve::PartOfSpeech::Verb
               if token[:pos2] == SETSUBI
                 attach_to_previous = true
-              elsif token[:pos2] == HIJIRITSU
-                grammar = :auxillary
+              elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
+                attach_to_previous = true
               end
             when KEIYOUSHI
               pos = Ve::PartOfSpeech::Adjective
@@ -301,6 +318,7 @@ class Ve
               words[-1].extra[:reading] << (token[:reading] || '')
               words[-1].extra[:transcription] << (token[:hatsuon] || '')
               words[-1].lemma << token[:lemma] if also_attach_to_lemma
+              words[-1].part_of_speech = pos if update_pos
             else
               pos = Ve::PartOfSpeech::TBD if pos.nil?
               word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
@@ -323,18 +341,20 @@ class Ve
               words << word
             end
+            previous = token
           end
         rescue StopIteration
         end
         return words
       end
       def sentences
         # TODO: Sentence objects that keep track of the sentence's tokens
         sentences = []
         current = ''
         @tokens.each do |token|
           if token[:type] == :sentence_split
             sentences << current
@@ -347,13 +367,13 @@ class Ve
             current << token[:literal]
           end
         end
         # In case there is no :sentence_split at the end
         sentences << current if current.length > 0
         sentences
       end
     end
   end
 end

data/lib/ve.rb CHANGED Viewed

@@ -10,23 +10,34 @@ require 'pp'
 class Ve
   class Manager
+    @@config_for = {}
+    def self.set_default_config_for(klass, config = {})
+      @@config_for[klass] = config
+    end
     def self.provider_for(language, function)
-      @@provider_for[language.to_sym][function.to_sym]
+      provider = @@provider_for[language.to_sym][function.to_sym]
+      if provider.is_a?(Class)
+        config = @@config_for[provider] || {}
+        provider = @@provider_for[language.to_sym][function.to_sym].new(config)
+        @@provider_for[language.to_sym][function.to_sym] = provider
+      end
+      provider
     end
     # TODO: Make a difference between what features are available locally
     # and what requires contacting external Ves
     def self.register(klass, language)
       @@provider_for ||= {}
-      provider = klass.new
       # This won't work if people start monkey patching the providers with public methods that aren't abilities
       # It's also not pretty, but kinda nifty
-      provider_name = provider.class.to_s.split('::').last
+      provider_name = klass.to_s.split('::').last
       parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
       abilities = parse_class.public_instance_methods - Object.public_instance_methods
       abilities.each do |a|
         @@provider_for[language.to_sym] ||= {}
-        @@provider_for[language.to_sym][a] = provider
+        @@provider_for[language.to_sym][a] = klass
       end
     end
   end

data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} RENAMED Viewed

@@ -2,31 +2,7 @@
 require_relative 'test_helper'
-class FreelingEnTest < Test::Unit::TestCase
-  def test_should_be_able_to_start
-    freeling = Ve::Provider::FreelingEn.new
-    assert freeling.works?
-  end
-  def test_doesnt_die_on_japanese
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('これは日本語です')
-    assert_equal Ve::Parse::FreelingEn, parse.class
-  end
-  # TODO: UTF-8 handling
-  def test_can_handle_utf8
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('I’m')
-    assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
-  end
-  def test_can_parse
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('')
-    assert_equal Ve::Parse::FreelingEn, parse.class
-  end
+class FreelingEnParseTest < MiniTest::Unit::TestCase
   def test_all_literals_should_equal_the_input_text
     text = <<-EOS
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
     Z
     EOS
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse(text)
+    raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
   end
   def test_creates_tokens_from_data_that_is_ignored_in_parsing
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('A   B  ')
+    text = 'A   B  '
+    raw = ['A a DT 0.333333', 'B b NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
     assert_equal ['A', '   ', 'B', '  ', ''], parse.tokens.collect { |t| t[:literal] }
   end
   def test_can_give_sentences
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('This is a sentence. And this was another one')
+    text = 'This is a sentence. And this was another one'
+    raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
   end
   def test_can_give_words
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('This was a sentence.')
+    text = 'This was a sentence.'
+    raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
     assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
   end
+  def test_words_can_handle_contractions
+    # TODO
+    skip
+    text = "I'm eating."
+    raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
+    assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
+  end
   def test_possessive_endings_must_be_reattached
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("This is Jane's sentence.")
+    text = "This is Jane's sentence."
+    raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
     assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
     assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
     assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
-    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:gramamr => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
+    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
     assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
   end
   def test_date_parsing
     # Should be turned off. At least for now
-    freeling = Ve::Provider::FreelingEn.new
-    assert_parses_into_words(freeling,
+    assert_parses_into_words(Ve::Parse::FreelingEn,
                              {:words => ['January'],
                               :lemmas => ['january'],
                               :pos => [Ve::PartOfSpeech::Noun],
                               :extra => [{:grammar => nil}],
                               :tokens => [0..0]},
-                             'January')
+                             'January', ['January january NN 1'])
   end
   def test_symbol_parsing
-    freeling = Ve::Provider::FreelingEn.new
-    assert_parses_into_words(freeling,
+    assert_parses_into_words(Ve::Parse::FreelingEn,
                              {:words => ['.', ',', '$'],
                               :lemmas => ['.', ',', '$'],
                               :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
                               :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
                               :tokens => [0..0, 1..1, 2..2]},
-                             '.,$')
+                             '.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
   end
   def test_can_handle_underscores_properly
     # Should restore them
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("In New York")
+    text = 'In New York'
+    raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
     # Should keep them
     # TODO
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("In New_York")
+    skip
+    text = 'In New_York'
+    raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
   end
 end