RubyGems - ve - Versions diffs - 0.0.2 → 0.0.3 - Mend

ve 0.0.2 → 0.0.3

This diff shows the changes between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +7 -0
data/.gitignore +1 -0
data/.travis.yml +9 -0
data/Gemfile +5 -0
data/Gemfile.lock +6 -0
data/lib/providers/freeling_en.rb +4 -2
data/lib/providers/mecab_ipadic.rb +52 -32
data/lib/ve.rb +15 -4
data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} +37 -48
data/tests/freeling_en_provider_test.rb +38 -0
data/tests/japanese_transliterators_test.rb +1 -1
data/tests/mecab_ipadic_parse_test.rb +772 -0
data/tests/mecab_ipadic_provider_test.rb +21 -0
data/tests/test_helper.rb +5 -4
data/tests/ve_test.rb +5 -1
data/ve.gemspec +1 -1
metadata +27 -35
data/tests/mecab_ipadic_test.rb +0 -452

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7667f10a89f699b284d7a412bab815d46e9bf26d
+  data.tar.gz: 7700e9a46ee0321b746ce23c806df91fb54b7252
+SHA512:
+  metadata.gz: fa87fa761966cc70ec3edf7dad6b4ef36404a8adc14314f88aa7bd91b34784a98de198d699f6fe8650ace92fb45b9be7c5ac6e23d35b5f1fcf9a138d75d647b0
+  data.tar.gz: 78e90e9c7af44b26bebd04bd4f44e002b392438bfe7422950f473d01e99840e5e26f6ca0887a17f76f2d7211f49d44fee2ce7e86ed832824f0d01d5109a6cea4

data/.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 .DS_Store
 .*.swp
 *.gem
+.rvmrc

data/.travis.yml ADDED Viewed

@@ -0,0 +1,9 @@
+language: ruby
+rvm:
+  - 1.9.2
+  - 1.9.3
+  - jruby-19mode
+  - rbx-19mode
+  - ruby-head
+  - jruby-head
+  - ree

data/Gemfile CHANGED Viewed

@@ -6,3 +6,8 @@ group :server do
   gem "sinatra"
   gem "rack-cors"
 end
+group :test do
+  gem "rake"
+  gem "mocha", :require => false
+end

data/Gemfile.lock CHANGED Viewed

@@ -2,11 +2,15 @@ GEM
   remote: http://rubygems.org/
   specs:
     json (1.6.1)
+    metaclass (0.0.1)
+    mocha (0.11.4)
+      metaclass (~> 0.0.1)
     rack (1.3.5)
     rack-cors (0.2.4)
       rack
     rack-protection (1.1.4)
       rack
+    rake (0.8.7)
     sinatra (1.3.1)
       rack (~> 1.3, >= 1.3.4)
       rack-protection (~> 1.1, >= 1.1.2)
@@ -18,5 +22,7 @@ PLATFORMS
 DEPENDENCIES
   json
+  mocha
   rack-cors
+  rake
   sinatra

data/lib/providers/freeling_en.rb CHANGED Viewed

@@ -8,7 +8,7 @@ require 'open3'
 class Ve
   class Provider
     class FreelingEn < Ve::Provider
+      # FIX: This class isn't tested
       BIT_STOP = 'VeEnd'
       # TODO: Automatically set FREELINGSHARE if it's not set?
@@ -27,7 +27,8 @@ class Ve
       # Interface methods
       def works?
-        (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
+        p = parse('Wrote')
+        ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
       end
       # Talks to the app and returns a parse object
@@ -41,6 +42,7 @@ class Ve
         output = []
         while line = @stdout.readline
+          puts line
           if line =~ /#{BIT_STOP}/x
             @stdout.readline
             break

data/lib/providers/mecab_ipadic.rb CHANGED Viewed

@@ -7,31 +7,31 @@ class Ve
     class MecabIpadic < Ve::Provider
       BIT_STOP = 'VeEnd'
       def initialize(config = {})
         # TODO: Make config handling better
         @config = {:app => 'mecab',
                    :path => '',
                    :flags => ''}.merge(config)
-        @config[:app] = `which #{@config[:app]}`
+        @config[:app] = `which #{@config[:app]}`.chomp
         start!
       end
       def works?
         (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
           "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
           "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
       end
       # Talks to the app and returns a parse object
       def parse(text, options = {})
         start! if @stdin.nil? # Restart if the provider crashed
         @stdin.puts "#{text} #{BIT_STOP}"
         output = []
         while line = @stdout.readline.force_encoding('UTF-8')
           if line =~ /#{BIT_STOP}/x
             output << @stdout.readline # Catch the EOS
@@ -39,25 +39,25 @@ class Ve
           end
           output << line
         end
         Ve::Parse::MecabIpadic.new(text, output)
-      rescue
+      rescue => e
         # TODO: No good to catch all errors like this
         # I need a backtrace when something unexpected fails
         Ve::Parse::MecabIpadic.new(text, [])
       end
       private
       # TODO: Use Process.spawn/kill for process control?
       def start!
-        @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
+        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
         @stdin.set_encoding('UTF-8')
         @stdout.set_encoding('UTF-8')
-      rescue Errno::ENOENT
+      rescue Errno::ENOENT => e
         # The parser couldn't be started. Probably not installed on this system
       end
     end
   end
 end
@@ -65,15 +65,15 @@ end
 class Ve
   class Parse
     class MecabIpadic < Ve::Parse
       PARSER = %r{^ (.+?) \t (.+) }x
       attr_reader :tokens, :text
       def initialize(text, output)
         @tokens = []
         @text = text
         position = 0
         output.each_with_index do |line, index|
           line.rstrip!
           token = {:raw => line}
@@ -87,7 +87,7 @@ class Ve
               @tokens << unparsed_token
             end
           end
           if line =~ %r{^ EOS $}x
             token[:type] = :sentence_split
             token[:literal] = ''
@@ -99,7 +99,7 @@ class Ve
             [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
               token[attr] = info[i]
             end
             # Anything unparsed preceding this token
             unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
             if unparsed_md[1].length > 0
@@ -108,7 +108,7 @@ class Ve
               @tokens << unparsed_token
               position += unparsed_token[:literal].length
             end
             token[:characters] = (position..(position+token[:literal].length-1))
             position += token[:literal].length
           else
@@ -118,7 +118,7 @@ class Ve
           @tokens << token
         end
       end
       # PoS
       MEISHI = '名詞'
       KOYUUMEISHI = '固有名詞'
@@ -159,6 +159,11 @@ class Ve
       TOKUSHU_DESU = '特殊・デス'
       TOKUSHU_DA = '特殊・ダ'
       TOKUSHU_MASU = '特殊・マス'
+      TOKUSHU_NU = '特殊・ヌ'
+      FUHENKAGATA = '不変化型'
+      JINMEI = '人名'
+      MEIREI_I = '命令ｉ'
+      KAKARIJOSHI = '係助詞'
       # Etc
       NA = 'な'
@@ -166,11 +171,14 @@ class Ve
       TE = 'て'
       DE = 'で'
       BA = 'ば'
+      NN = 'ん'
+      SA = 'さ'
       def words
         words = []
         tokens = @tokens.find_all { |t| t[:type] == :parsed }
         tokens = tokens.to_enum
+        previous = nil
         # This is becoming very big
         begin
@@ -181,6 +189,7 @@ class Ve
             eat_lemma = true
             attach_to_previous = false
             also_attach_to_lemma = false
+            update_pos = false
             case token[:pos]
             when MEISHI
@@ -208,7 +217,7 @@ class Ve
                     eat_next = true
                   elsif following[:pos] == JOSHI && following[:literal] == NI
                     pos = Ve::PartOfSpeech::Adverb
-                    eat_next = true
+                    eat_next = false
                   end
                 end
               when HIJIRITSU, TOKUSHU
@@ -246,8 +255,13 @@ class Ve
                   also_attach_to_lemma = true
                 end
               when SETSUBI
-                # TODO: elaborate a bit?
-                pos = Ve::PartOfSpeech::Suffix
+                if token[:pos3] == TOKUSHU && token[:lemma] == SA
+                  attach_to_previous = true
+                  update_pos = true
+                  pos = Ve::PartOfSpeech::Noun
+                else
+                  pos = Ve::PartOfSpeech::Suffix
+                end
               when SETSUZOKUSHITEKI
                 pos = Ve::PartOfSpeech::Conjunction
               when DOUSHIHIJIRITSUTEKI
@@ -260,7 +274,10 @@ class Ve
             when JODOUSHI
               pos = Ve::PartOfSpeech::Postposition
-              if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
+              if (previous.nil? || (!previous.nil? && previous[:pos2] != KAKARIJOSHI)) &&
+                 [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
+                attach_to_previous = true
+              elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
                 attach_to_previous = true
               elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
                 pos = Ve::PartOfSpeech::Verb
@@ -269,8 +286,8 @@ class Ve
               pos = Ve::PartOfSpeech::Verb
               if token[:pos2] == SETSUBI
                 attach_to_previous = true
-              elsif token[:pos2] == HIJIRITSU
-                grammar = :auxillary
+              elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
+                attach_to_previous = true
               end
             when KEIYOUSHI
               pos = Ve::PartOfSpeech::Adjective
@@ -301,6 +318,7 @@ class Ve
               words[-1].extra[:reading] << (token[:reading] || '')
               words[-1].extra[:transcription] << (token[:hatsuon] || '')
               words[-1].lemma << token[:lemma] if also_attach_to_lemma
+              words[-1].part_of_speech = pos if update_pos
             else
               pos = Ve::PartOfSpeech::TBD if pos.nil?
               word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
@@ -323,18 +341,20 @@ class Ve
               words << word
             end
+            previous = token
           end
         rescue StopIteration
         end
         return words
       end
       def sentences
         # TODO: Sentence objects that keep track of the sentence's tokens
         sentences = []
         current = ''
         @tokens.each do |token|
           if token[:type] == :sentence_split
             sentences << current
@@ -347,13 +367,13 @@ class Ve
             current << token[:literal]
           end
         end
         # In case there is no :sentence_split at the end
         sentences << current if current.length > 0
         sentences
       end
     end
   end
 end

data/lib/ve.rb CHANGED Viewed

@@ -10,23 +10,34 @@ require 'pp'
 class Ve
   class Manager
+    @@config_for = {}
+    def self.set_default_config_for(klass, config = {})
+      @@config_for[klass] = config
+    end
     def self.provider_for(language, function)
-      @@provider_for[language.to_sym][function.to_sym]
+      provider = @@provider_for[language.to_sym][function.to_sym]
+      if provider.is_a?(Class)
+        config = @@config_for[provider] || {}
+        provider = @@provider_for[language.to_sym][function.to_sym].new(config)
+        @@provider_for[language.to_sym][function.to_sym] = provider
+      end
+      provider
     end
     # TODO: Make a difference between what features are available locally
     # and what requires contacting external Ves
     def self.register(klass, language)
       @@provider_for ||= {}
-      provider = klass.new
       # This won't work if people start monkey patching the providers with public methods that arent abilities
       # It's also not pretty, but kinda nifty
-      provider_name = provider.class.to_s.split('::').last
+      provider_name = klass.to_s.split('::').last
       parse_class = Kernel.class_eval("Ve::Parse::#{provider_name}")
       abilities = parse_class.public_instance_methods - Object.public_instance_methods
       abilities.each do |a|
         @@provider_for[language.to_sym] ||= {}
-        @@provider_for[language.to_sym][a] = provider
+        @@provider_for[language.to_sym][a] = klass
       end
     end
   end

data/tests/{freeling_en_test.rb → freeling_en_parse_test.rb} RENAMED Viewed

@@ -2,31 +2,7 @@
 require_relative 'test_helper'
-class FreelingEnTest < Test::Unit::TestCase
-  def test_should_be_able_to_start
-    freeling = Ve::Provider::FreelingEn.new
-    assert freeling.works?
-  end
-  def test_doesnt_die_on_japanese
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('これは日本語です')
-    assert_equal Ve::Parse::FreelingEn, parse.class
-  end
-  # TODO: UTF-8 handling
-  def test_can_handle_utf8
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('I’m')
-    assert_equal ['I\'m'], parse.tokens.collect { |t| t[:literal] }
-  end
-  def test_can_parse
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('')
-    assert_equal Ve::Parse::FreelingEn, parse.class
-  end
+class FreelingEnParseTest < MiniTest::Unit::TestCase
   def test_all_literals_should_equal_the_input_text
     text = <<-EOS
@@ -35,27 +11,30 @@ class FreelingEnTest < Test::Unit::TestCase
     Z
     EOS
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse(text)
+    raw = ["There there EX 0.857656", "once once RB 0.809237", "was be VBD 1", "a a DT 0.333333", "man man NN 0.980535", "from from IN 1", "X x NNP 1", "", "Who who WP 1", "took take VBD 1", "it it PRP 1", "upon upon IN 0.915152", "himself himself PRP 1", "to to TO 0.999909", "Y y NNP 1", "", "Z z NNP 1", ""]
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal text, parse.tokens.collect { |t| t[:literal] }.join
   end
   def test_creates_tokens_from_data_that_is_ignored_in_parsing
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('A   B  ')
+    text = 'A   B  '
+    raw = ['A a DT 0.333333', 'B b NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal [:parsed, :unparsed, :parsed, :unparsed, :sentence_split], parse.tokens.collect { |t| t[:type] }
     assert_equal ['A', '   ', 'B', '  ', ''], parse.tokens.collect { |t| t[:literal] }
   end
   def test_can_give_sentences
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('This is a sentence. And this was another one')
+    text = 'This is a sentence. And this was another one'
+    raw = ['This this PRP 0.0001755', 'is be VBZ 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '', 'And and CC 1', 'this this PRP 0.0001755', 'was be VBD 1', 'another another DT 0.999067', 'one one NN 0.25', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     assert_equal ['This is a sentence.', 'And this was another one'], parse.sentences
   end
   def test_can_give_words
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse('This was a sentence.')
+    text = 'This was a sentence.'
+    raw = ['This this PRP 0.0001755', 'was be VBD 1', 'a a DT 0.333333', 'sentence sentence NN 0.966667', '. . Fp 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -67,48 +46,55 @@ class FreelingEnTest < Test::Unit::TestCase
     assert_equal [[tokens[0]], [tokens[2]], [tokens[4]], [tokens[6]], [tokens[7]]], words.collect(&:tokens)
   end
+  def test_words_can_handle_contractions
+    # TODO
+    skip
+    text = "I'm eating."
+    raw = ['I i PRP 1', "'m 'm VBP 0.997563", 'eating eat VBG 1', '. . Fp 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
+    assert_equal ["I'm", "eating", "."], parse.tokens.collect { |t| t[:literal] }
+  end
   def test_possessive_endings_must_be_reattached
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("This is Jane's sentence.")
+    text = "This is Jane's sentence."
+    raw = ["This this PRP 0.0001755", "is be VBZ 1", "Jane jane NNP 1", "'s 's POS 0.751711", "sentence sentence NN 0.966667", ". . Fp 1", ""]
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
     assert_equal ['This', 'is', "Jane's", 'sentence', '.'], words.collect(&:word)
     assert_equal ['this', 'be', "jane", 'sentence', '.'], words.collect(&:lemma)
     assert_equal [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Verb, Ve::PartOfSpeech::ProperNoun, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol], words.collect(&:part_of_speech)
-    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:gramamr => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
+    assert_equal [{:grammar => :personal}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}, {:grammar => nil}], words.collect(&:extra)
     assert_equal [[tokens[0]], [tokens[2]], tokens[4..5], [tokens[7]], [tokens[8]]], words.collect(&:tokens)
   end
   def test_date_parsing
     # Should be turned off. At least for now
-    freeling = Ve::Provider::FreelingEn.new
-    assert_parses_into_words(freeling,
+    assert_parses_into_words(Ve::Parse::FreelingEn,
                              {:words => ['January'],
                               :lemmas => ['january'],
                               :pos => [Ve::PartOfSpeech::Noun],
                               :extra => [{:grammar => nil}],
                               :tokens => [0..0]},
-                             'January')
+                             'January', ['January january NN 1'])
   end
   def test_symbol_parsing
-    freeling = Ve::Provider::FreelingEn.new
-    assert_parses_into_words(freeling,
+    assert_parses_into_words(Ve::Parse::FreelingEn,
                              {:words => ['.', ',', '$'],
                               :lemmas => ['.', ',', '$'],
                               :pos => [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::Symbol],
                               :extra => [{:grammar => nil}, {:grammar => nil}, {:grammar => nil}],
                               :tokens => [0..0, 1..1, 2..2]},
-                             '.,$')
+                             '.,$', ['. . Fp 1', ', , Fc 1', '$ $ Fp', ''])
   end
   def test_can_handle_underscores_properly
     # Should restore them
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("In New York")
+    text = 'In New York'
+    raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -120,8 +106,10 @@ class FreelingEnTest < Test::Unit::TestCase
     # Should keep them
     # TODO
-    freeling = Ve::Provider::FreelingEn.new
-    parse = freeling.parse("In New_York")
+    skip
+    text = 'In New_York'
+    raw = ['In in IN 0.986184', 'New_York new_york NNP 1', '']
+    parse = Ve::Parse::FreelingEn.new(text, raw)
     words = parse.words
     tokens = parse.tokens
@@ -133,3 +121,4 @@ class FreelingEnTest < Test::Unit::TestCase
   end
 end