RubyGems - ve - Versions diffs - 0.0.2 - Mend

ve 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

data/.gitignore +4 -0
data/Gemfile +8 -0
data/Gemfile.lock +22 -0
data/Rakefile +9 -0
data/Readme.md +60 -0
data/js/test.html +32 -0
data/js/ve.js +57 -0
data/lib/language.rb +2 -0
data/lib/languages/english.rb +6 -0
data/lib/languages/japanese.rb +9 -0
data/lib/misc.rb +10 -0
data/lib/part_of_speech.rb +30 -0
data/lib/provider.rb +29 -0
data/lib/providers/fallbacks.rb +0 -0
data/lib/providers/freeling_en.rb +229 -0
data/lib/providers/japanese_transliterators.rb +293 -0
data/lib/providers/mecab_ipadic.rb +362 -0
data/lib/ve.rb +111 -0
data/lib/word.rb +43 -0
data/sinatra/server.rb +46 -0
data/tests/freeling_en_test.rb +135 -0
data/tests/japanese_transliterators_test.rb +79 -0
data/tests/mecab_ipadic_test.rb +452 -0
data/tests/test_helper.rb +26 -0
data/tests/ve_test.rb +20 -0
data/ve.gemspec +20 -0
metadata +80 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+.DS_Store
+.*.swp
+*.gem

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+source "http://rubygems.org"
+gem "json"
+group :server do
+  gem "sinatra"
+  gem "rack-cors"
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,22 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    json (1.6.1)
+    rack (1.3.5)
+    rack-cors (0.2.4)
+      rack
+    rack-protection (1.1.4)
+      rack
+    sinatra (1.3.1)
+      rack (~> 1.3, >= 1.3.4)
+      rack-protection (~> 1.1, >= 1.1.2)
+      tilt (~> 1.3, >= 1.3.3)
+    tilt (1.3.3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  json
+  rack-cors
+  sinatra

data/Rakefile ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env rake
+require 'rake/testtask'
+Rake::TestTask.new do |t|
+  t.pattern = "tests/*_test.rb"
+end
+task :default => :test

data/Readme.md ADDED Viewed

@@ -0,0 +1,60 @@
+Ve
+==
+A linguistic framework for anyone. No degree required.
+Read all about it on [kimtaro.github.com/ve](http://kimtaro.github.com/ve).
+Ruby
+----
+    require 've'
+    words = Ve.in(:en).words('I like melons.')
+    # => [#<Ve::Word:0x8ee00cc @word="I", @lemma="i", @part_of_speech=Ve::PartOfSpeech::Pronoun, @tokens=[{:raw=>"I i PRP 1", :type=>:parsed, :literal=>"I", :lemma=>"i", :pos=>"PRP", :accuracy=>"1", :characters=>0..0}], @extra={:grammar=>:personal}, @info={}>, #<Ve::Word:0x8edff28 @word="like", @lemma="like", @part_of_speech=Ve::PartOfSpeech::Preposition, @tokens=[{:raw=>"like like IN 0.815649", :type=>:parsed, :literal=>"like", :lemma=>"like", :pos=>"IN", :accuracy=>"0.815649", :characters=>2..5}], @extra={:grammar=>nil}, @info={}>, #<Ve::Word:0x8edfe24 @word="melons", @lemma="melon", @part_of_speech=Ve::PartOfSpeech::Noun, @tokens=[{:raw=>"melons melon NNS 1", :type=>:parsed, :literal=>"melons", :lemma=>"melon", :pos=>"NNS", :accuracy=>"1", :characters=>7..12}], @extra={:grammar=>:plural}, @info={}>, #<Ve::Word:0x8edfcbc @word=".", @lemma=".", @part_of_speech=Ve::PartOfSpeech::Symbol, @tokens=[{:raw=>". . Fp 1", :type=>:parsed, :literal=>".", :lemma=>".", :pos=>"Fp", :accuracy=>"1", :characters=>13..13}], @extra={:grammar=>nil}, @info={}>]
+    words.collect(&:lemma) # => ["i", "like", "melon", "."]
+    words.collect(&:part_of_speech) # => [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol]
+Javascript
+----------
+    <script type="text/javascript" charset="utf-8" src="ve.js"></script>
+    <script type="text/javascript" charset="utf-8">
+      new Ve('ja').words('ビールがおいしかった', function(words) {
+        // [{"_class":"Word","word":"ビール","lemma":"ビール","part_of_speech":"noun","tokens":[{"raw":"ビール\t名詞,一般,*,*,*,*,ビール,ビール,ビール","type":"parsed","literal":"ビール","pos":"名詞","pos2":"一般","pos3":"*","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"ビール","reading":"ビール","hatsuon":"ビール","characters":"0..2"}],"extra":{"reading":"ビール","transcription":"ビール","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"が","lemma":"が","part_of_speech":"postposition","tokens":[{"raw":"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ","type":"parsed","literal":"が","pos":"助詞","pos2":"格助詞","pos3":"一般","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"が","reading":"ガ","hatsuon":"ガ","characters":"3..3"}],"extra":{"reading":"ガ","transcription":"ガ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"おいしい","lemma":"おいしい","part_of_speech":"adjective","tokens":[{"raw":"おいしい\t形容詞,自立,*,*,形容詞・イ段,基本形,おいしい,オイシイ,オイシイ","type":"parsed","literal":"おいしい","pos":"形容詞","pos2":"自立","pos3":"*","pos4":"*","inflection_type":"形容詞・イ段","inflection_form":"基本形","lemma":"おいしい","reading":"オイシイ","hatsuon":"オイシイ","characters":"4..7"}],"extra":{"reading":"オイシイ","transcription":"オイシイ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}}]
+        for ( i in words ) {
+          var word = words[i];
+          console.log(word.lemma + "/" + word.part_of_speech)
+        }
+        // ビール/noun
+        // が/postposition
+        // おいしい/adjective
+      });
+    </script>
+Structure
+---------
+- **Ve::LocalInterface** - Main interface that gives access to functionality in providers that exist locally
+- **Ve::XInterface** - Allows for different ways of accessing Ve providers: locally, through an HTTP API, through a binary protocol, and so on
+- **Ve::Manager** - Keeps track of providers and what they can do
+- **Ve::Provider::X** - Talks to the underlying parser
+- **Ve::Parse::X** - Takes the output from the Provider and turns it into functions the end user can use
+Todo
+----
+- Expose more through the sinatra server
+- Alias lemma to base, so people don't need to know what lemmas are
+- Break out into separate projects for each component. Ve-ruby, Ve-js.
+- Better UTF-8 handling for Freeling
+- See all the TODOs in the code
+License
+-------
+(c) Kim Ahlström 2011
+This is under the MIT license.

data/js/test.html ADDED Viewed

@@ -0,0 +1,32 @@
+<!DOCTYPE html>
+<!-- Manual browser test page for ve.js: each check appends a pass/fail line to #report. -->
+<html>
+  <head>
+    <!-- http-equiv is an attribute of its own; written as name="http-equiv" the charset was never declared. -->
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+    <style type="text/css">
+      .fail { color: red; }
+      .pass { color: green; }
+    </style>
+    <script src="ve.js" type="text/javascript" charset="utf-8"></script>
+    <script type="text/javascript" charset="utf-8">
+      // Appends "<name>: pass" or "<name>: fail" to the report, depending on whether `test` is truthy.
+      function assert(test, name) {
+        var report = document.getElementById('report');
+        var result = test ? 'pass' : 'fail';
+        report.innerHTML = report.innerHTML + '<p class="' + result + '">' + name + ': ' + result + '</p>'
+      }
+      new Ve('en').words('I ate hamburgers.', function(words){
+        assert((4 == words.length && 'eat' == words[1].lemma), 'English');
+      });
+      new Ve('ja').words('ビールを飲んだ', function(words){
+        // TODO: Shouldn't have to encode it here ...
+        var word = "\u98f2\u3080"; // 飲む
+        assert((3 == words.length && word == words[2].lemma), 'Japanese');
+      });
+    </script>
+  </head>
+  <body>
+    <div id="report"></div>
+  </body>
+</html>

data/js/ve.js ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ *  ve.js
+ *
+ *  Communicates with a Sinatra-server to facilitate linguistic
+ *  parsing tech in JS.
+ *
+ *  @Author: Kim Ahlstrom
+ *  @Author: Ryan McGrath <ryan@venodesigns.net>
+ *  @Requires: Nothing
+ */
+;(function(w, d, undefined) {
+  // Incremented for every request so that two calls made within the same
+  // millisecond never share (and overwrite) one callback name.
+  var requestCounter = 0;
+  /**
+   *  Creates a client for one language.
+   *  `language` is placed in the request path, e.g. 'en' or 'ja'.
+   */
+  var Ve = w.Ve = function Ve(language) {
+    this.language = language;
+    this.url = 'http://localhost:4567/';
+    return this;
+  };
+  Ve.prototype = {
+    /**
+     *  Requests the words of `text` and passes the JSONP response to
+     *  `callbackfn`. Returns the Ve instance so calls can be chained.
+     */
+    words: function(text, callbackfn) {
+      // Percent-encode the text so non-ASCII input and characters such as
+      // '&', '#' or '+' arrive intact as a single query parameter.
+      jsonp(this.url + this.language + '/words?text=' + encodeURIComponent(text), callbackfn);
+      return this;
+    }
+  };
+  /**
+   *  Loads `src` through an injected script tag. A `callback` query parameter
+   *  naming a temporary global function is appended to `src`, and that global
+   *  points at `callbackfn` until the script has loaded.
+   */
+  var jsonp = function jsonp(src, callbackfn) {
+    var newScript = d.createElement("script"),
+      callback = 've_callback_' + (+new Date()) + '_' + (requestCounter++),
+      // documentElement.firstChild can be a comment or text node, so look
+      // the head element up explicitly and fall back to the root element.
+      parent = d.getElementsByTagName("head")[0] || d.documentElement,
+      // Removes the injected tag and releases the temporary global.
+      cleanup = function() {
+        if(newScript.parentNode) {
+          newScript.parentNode.removeChild(newScript);
+        }
+        w[callback] = null;
+      };
+    newScript.type = "text/javascript";
+    newScript.setAttribute("async", "true");
+    newScript.setAttribute("src", src + '&callback=' + callback);
+    w[callback] = callbackfn;
+    /**
+     *  Automagically handle cleanup of injected script tags, so we don't litter someone's DOM
+     *  with our stuff. Browsers exposing readyState get the readystatechange path,
+     *  all others the load event.
+     */
+    if(newScript.readyState) {
+      newScript.onreadystatechange = function() {
+        if(/loaded|complete/.test(newScript.readyState)) {
+          newScript.onreadystatechange = null;
+          cleanup();
+        }
+      }
+    } else {
+      newScript.addEventListener("load", cleanup, false);
+    }
+    parent.appendChild(newScript);
+  }
+  // No third argument: the `undefined` parameter must really be undefined,
+  // not the string 'undefined'.
+})(window, document);

data/lib/language.rb ADDED Viewed

@@ -0,0 +1,2 @@
+class Ve::Language
+end

data/lib/languages/english.rb ADDED Viewed

@@ -0,0 +1,6 @@
+class Ve
+  class Language
+    class English
+    end
+  end
+end

data/lib/languages/japanese.rb ADDED Viewed

@@ -0,0 +1,9 @@
+class Ve
+  class Language
+    class Japanese
+      #interface_for :ja
+    end
+  end
+end

data/lib/misc.rb ADDED Viewed

@@ -0,0 +1,10 @@
+class Enumerator
+  def more?
+    begin
+      self.peek
+      true
+    rescue
+      false
+    end
+  end
+end

data/lib/part_of_speech.rb ADDED Viewed

@@ -0,0 +1,30 @@
+class Ve
+  class PartOfSpeech
+    def self.name
+      self.to_s.split('::').last.downcase
+    end
+    class Noun < PartOfSpeech; end
+    class ProperNoun < PartOfSpeech; end
+    class Pronoun < PartOfSpeech; end
+    class Adjective < PartOfSpeech; end
+    class Adverb < PartOfSpeech; end
+    class Determiner < PartOfSpeech; end
+    class Preposition < PartOfSpeech; end
+    class Postposition < PartOfSpeech; end
+    class Verb < PartOfSpeech; end
+    class Suffix < PartOfSpeech; end
+    class Prefix < PartOfSpeech; end
+    class Conjunction < PartOfSpeech; end
+    class Interjection < PartOfSpeech; end
+    class Number < PartOfSpeech; end
+    class Unknown < PartOfSpeech; end
+    class Symbol < PartOfSpeech; end
+    class Determiner < PartOfSpeech; end
+    class Other < PartOfSpeech; end
+    class TBD < PartOfSpeech; end # Placeholder for provider PoS that haven't had a Ve PoS assigned yet
+  end
+end

data/lib/provider.rb ADDED Viewed

@@ -0,0 +1,29 @@
+class Ve
+  class Provider
+    # Interface, to be implemented by providers
+    def provides
+    end
+    def start!
+    end
+    def works?
+    end
+    def parse
+    end
+  end
+end
+class Ve
+  class Parse
+    # TODO
+    def as_json
+    end
+  end
+end

data/lib/providers/fallbacks.rb ADDED Viewed

File without changes

data/lib/providers/freeling_en.rb ADDED Viewed

@@ -0,0 +1,229 @@
+# Encoding: UTF-8
+# TODO: Retain capitalization in lemmas?
+# TODO: Memoize
+require 'open3'
+class Ve
+  class Provider
+    class FreelingEn < Ve::Provider
+      BIT_STOP = 'VeEnd'
+      # TODO: Automatically set FREELINGSHARE if it's not set?
+      def initialize(config = {})
+        @config = {:app => 'analyzer',
+                   :path => '',
+                   :flags => ''}.merge(config)
+        @config[:app] = `which #{@config[:app]}`.strip!
+        local = @config[:app] =~ /local/ ? '/local' : ''
+        @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"
+        start!
+      end
+      # Interface methods
+      def works?
+        (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
+      end
+      # Talks to the app and returns a parse object
+      def parse(text, options = {})
+        start! if @stdin.nil?
+        # Fix Unicode chars
+        # TODO: These need to be converted back to the original char in the :literal attribute
+        text = text.gsub('’', "'")
+        @stdin.puts "#{text}\n#{BIT_STOP}\n"
+        output = []
+        while line = @stdout.readline
+          if line =~ /#{BIT_STOP}/x
+            @stdout.readline
+            break
+          end
+          output << line
+        end
+        Ve::Parse::FreelingEn.new(text, output)
+      rescue
+        Ve::Parse::FreelingEn.new(text, [])
+      end
+      private
+      def start!
+        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
+        # TODO: Also filter out non-iso-latin-1 characters
+        @stdin.set_encoding('UTF-8', 'ISO-8859-1')
+        @stdout.set_encoding('ISO-8859-1', 'UTF-8')
+      rescue Errno::ENOENT
+        # The parser couldn't be started. Probably not installed on this system
+      end
+    end
+  end
+end
+class Ve
+  class Parse
+    class FreelingEn < Ve::Parse
+      attr_reader :tokens, :text
+      def initialize(text, output)
+        @tokens = []
+        @text = text
+        position = 0
+        output.each_with_index do |line, index|
+          line.rstrip!
+          token = {:raw => line}
+          # Anything unparsed at the end of the text
+          # This must happen before sentence splits are detected to avoid funny ordering
+          if output.length > 1 && output.length == index + 1
+            unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
+            if unparsed_md[1].length > 0
+              unparsed_token = {:type => :unparsed,
+                                :literal => unparsed_md[1],
+                                :raw => ''}
+              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+              @tokens << unparsed_token
+            end
+          end
+          # Sentence splits are just empty lines in Freeling
+          if line.length == 0
+            token[:type] = :sentence_split
+            token[:literal] = ''
+            @tokens << token
+            next
+          end
+          # The parsed token
+          info = line.split(/\s+/)
+          token[:type] = :parsed
+          [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
+            token[attr] = info[i]
+          end
+          token[:literal].gsub!('_', ' ')
+          token[:lemma].gsub!('_', ' ')
+          # Anything unparsed preceding this token.
+          # We need to do this complicated dance with _ since Freeling replaces spaces with it.
+          # And so we need to be able to find the token with both spaces and _ in it since
+          # we don't know what the original in the text actually is.
+          # Once we have the location in the text we can figure out if it should be with spaces or _.
+          unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
+          unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
+          unparsed_md = unparsed_re.match(text, position)
+          if unparsed_md && unparsed_md[1].length > 0
+            unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
+            unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+            @tokens << unparsed_token
+            position += unparsed_token[:literal].length
+          end
+          token[:characters] = (position..(position+token[:literal].length-1))
+          position += token[:literal].length
+          @tokens << token
+        end
+      end
+      INTERNAL_INFO_FOR_PARSED_POS = {
+        'CC' => [Ve::PartOfSpeech::Conjunction, nil],
+        'CD' => [Ve::PartOfSpeech::Number, nil],
+        'DT' => [Ve::PartOfSpeech::Determiner, nil],
+        'EX' => [Ve::PartOfSpeech::Pronoun, nil],
+        'FW' => [Ve::PartOfSpeech::Unknown, nil],
+        'IN' => [Ve::PartOfSpeech::Preposition, nil],
+        'JJ' => [Ve::PartOfSpeech::Adjective, nil],
+        'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
+        'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
+        'LS' => [Ve::PartOfSpeech::Unknown, nil],
+        'MD' => [Ve::PartOfSpeech::Verb, :modal],
+        'NN' => [Ve::PartOfSpeech::Noun, nil],
+        'NNS' => [Ve::PartOfSpeech::Noun, :plural],
+        'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
+        'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
+        'PDT' => [Ve::PartOfSpeech::Determiner, nil],
+        'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
+        'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
+        'RB' => [Ve::PartOfSpeech::Adverb, nil],
+        'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
+        'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
+        'RP' => [Ve::PartOfSpeech::Postposition, nil],
+        'SYM' => [Ve::PartOfSpeech::Symbol, nil],
+        'TO' => [Ve::PartOfSpeech::Preposition, nil],
+        'UH' => [Ve::PartOfSpeech::Interjection, nil],
+        'VB' => [Ve::PartOfSpeech::Verb, nil],
+        'VBD' => [Ve::PartOfSpeech::Verb, :past],
+        'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
+        'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
+        'VBP' => [Ve::PartOfSpeech::Verb, nil],
+        'VBZ' => [Ve::PartOfSpeech::Verb, nil],
+        'WDT' => [Ve::PartOfSpeech::Determiner, nil],
+        'WP' => [Ve::PartOfSpeech::Pronoun, nil],
+        'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
+        'WRB' => [Ve::PartOfSpeech::Adverb, nil],
+        'Z' => [Ve::PartOfSpeech::Determiner, nil]
+      }
+      def words
+        words = []
+        @tokens.find_all { |t| t[:type] == :parsed }.each do |token|
+          if token[:pos] == 'POS'
+            # Possessive ending, add to previous token
+            words[-1].word << token[:literal]
+            words[-1].tokens << token
+            next
+          else
+            # All other tokens
+            pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
+            if pos.nil? && token[:pos] =~ /^F\w+$/
+              pos = Ve::PartOfSpeech::Symbol
+            end
+            pos = Ve::PartOfSpeech::TBD if pos.nil?
+            word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
+            words << word
+          end
+        end
+        words
+      end
+      def sentences
+        sentences = []
+        current = ''
+        @tokens.each do |token|
+          if token[:type] == :sentence_split
+            sentences << current
+            current = ''
+          else
+            current << token[:literal]
+          end
+        end
+        # In case there is no :sentence_split at the end
+        sentences << current if current.length > 0
+        sentences.collect { |s| s.strip! }
+        sentences
+      end
+    end
+  end
+end
+Ve::Manager.register(Ve::Provider::FreelingEn, :en)