RubyGems - llt-morphologizer - Versions diffs - 0.0.1 - Mend

llt-morphologizer 0.0.1

Files changed (21) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/.rspec +2 -0
data/.travis.yml +8 -0
data/Gemfile +27 -0
data/LICENSE.txt +22 -0
data/README.md +35 -0
data/Rakefile +6 -0
data/lib/llt/morphologizer.rb +378 -0
data/lib/llt/morphologizer/lookup_statement.rb +66 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder.rb +130 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/conjugable.rb +221 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/contracted_forms.rb +38 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/declinable.rb +214 -0
data/lib/llt/morphologizer/version.rb +5 -0
data/llt-morphologizer.gemspec +34 -0
data/spec/lib/llt/morphologizer/lookup_statement_spec.rb +29 -0
data/spec/lib/llt/morphologizer/stem_lookup_statement_builder_spec.rb +39 -0
data/spec/lib/llt/morphologizer_spec.rb +524 -0
data/spec/spec_helper.rb +27 -0
metadata +235 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 07f6f8feba062fb30ccc3806c038ba3300c22916
+  data.tar.gz: bbcc6c448dfa30429312b522501cb2fa59ff742f
+SHA512:
+  metadata.gz: d912133821b73df731b94b33ad2f82c340fcb1fd1200ee079dfe8a885c23a3565fb88f06242fd39140b5655268004c1b2212aac4d4b98362faddec61a4eec0e3
+  data.tar.gz: 75d0914bac36f5300c0e22840035faba2050d149ff9779e0a131609dca44419d913b104fcc8ea4b8770871c985c982d2f1e88d09c5173c0ccedaa979e37228f7

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.rspec ADDED Viewed

@@ -0,0 +1,2 @@
+--format documentation
+--color

data/.travis.yml ADDED Viewed

@@ -0,0 +1,8 @@
+---
+language: ruby
+before_script:
+- export JRUBY_OPTS=--2.0
+rvm:
+- 2.1.0
+- 2.0.0
+- jruby-1.7.8

data/Gemfile ADDED Viewed

@@ -0,0 +1,27 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in llt-morphologizer.gemspec
+gemspec
+gem 'coveralls', require: false
+gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
+gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
+gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
+gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
+gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
+gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
+gem 'llt-logger', git: 'git://github.com/latin-language-toolkit/llt-logger.git'
+# Dependencies of db_handler
+gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
+platform :ruby do
+  gem 'pg'
+end
+platform :jruby do
+  gem 'activerecord-jdbcpostgresql-adapter'
+  gem 'jruby-httpclient'
+end
+gem 'pry'

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 LFDM
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# LLT::Morphologizer
+[![Version](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury)
+[![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium)
+[![Build Status](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis)
+[![Coverage](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls)
+[![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate)
+Morphological parsing of Latin forms
+## Installation
+Add this line to your application's Gemfile:
+    gem 'llt-morphologizer'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install llt-morphologizer
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it ( http://github.com/<my-github-username>/llt-morphologizer/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/lib/llt/morphologizer.rb ADDED Viewed

@@ -0,0 +1,378 @@
+require 'llt/constants'
+require 'llt/core'
+require 'llt/core_extensions/match_data'
+require 'llt/db_handler/prometheus'
+require 'llt/form_builder'
+require 'llt/helpers/constantize'
+require 'llt/helpers/normalizer'
+require 'llt/helpers/pluralize'
+require 'llt/helpers/primitive_cache'
+require 'llt/logger'
+require "llt/morphologizer/version"
+module LLT
+  # Analyzes a token string morphologically.
+  #
+  # Looks up stems in a given db-dictionary and builds LLT::Form objects with the
+  # help of the LLT::FormBuilder.
+  class Morphologizer
+    require 'llt/morphologizer/stem_lookup_statement_builder'
+    include Core::Serviceable
+    include Helpers::Constantize
+    include Helpers::Normalizer
+    include Helpers::Pluralize
+    include Helpers::PrimitiveCache
+    uses_db     { DbHandler::Prometheus.new }
+    uses_logger { Logger.new("Morphologizer", 2, default: :morph) }
+    # @option options [true] :cache enables caching
+    # @option options [DbHandler] :db db-handling object used to obtain stem information
+    # @option options [Logger] :logger object used for logging
+    def initialize(options = {})
+      super
+      enable_cache if options[:cache]
+    end
+    # Takes a string and analyzes it morphologically
+    #
+    # @param [String] word token to be analyzed
+    # @param add_to [#<<] Keyword Argument: can optionally defer the returned
+    #   forms to an object
+    #
+    # @return [Array<LLT::Form>] all valid Latin forms of the given string
+    def morphologize(word, add_to: nil)
+      forms = cached(word) { compute(word) }
+      add_to << forms if add_to.respond_to?(:<<)
+      forms
+    end
+    private
+    def setup(word)
+      @word  = word
+      @forms = []
+      @uniq = false
+      @statements = nil
+    end
+    # Runs the lookup stages for +word+ in a fixed order and returns the
+    # collected forms.
+    #
+    # The stages in the chain below are tried one after another. As soon as
+    # one of them yields a hit that is considered unique, all remaining
+    # stages are skipped. Otherwise the direct and the indirect (stem based)
+    # lookups follow, and an error is logged when no form was found at all.
+    #
+    # @return [Array] the forms collected for the word
+    def compute(word)
+      # the order is important, illustrated with the word cum.
+      # the preposition knows that it can have another form (the subjunction),
+      # while the subjunction says it's uniq.
+      setup(word)
+      settled =
+        numerals ||
+        (prepositions                 && unique_present?)   ||
+        (look_up(:conjunctions)       && unique_present?)   ||
+        (look_up(:subjunctions)       && unique_present?)   ||
+        (clook_up(:personal_pronouns) && unique_pers_pron?) ||
+        (other_pronouns               && unique_pronoun?)   ||
+        (irregular_verbs              && unique_present?)   ||
+        (clook_up(:cardinals)         && unique_cardinal?)
+      unless settled
+        direct_lookup
+        indirect_lookup
+        @logger.error("Missing Word: #{@word}".red) if @forms.empty?
+      end
+      @forms
+    end
+######### Numerals #########
+    def numerals
+      if Helpers::RomanNumerals.roman?(@word)
+        add_form(Form::Cardinal.new(roman: @word))
+      end
+    end
+######### Personal Pronouns && Cardinals #########
+    # Complex Lookup
+    def clook_up(type)
+      if forms = LLT::Constants.const_get(type.upcase)[@word.downcase]
+        new_forms = forms.map do |form|
+          sg_type = type.to_s.chop # cardinals to cardinal
+          args = send("#{sg_type}_args", form)
+          constant_by_type(sg_type, namespace: LLT::Form).new(args)
+        end
+        add_forms(new_forms)
+      end
+    end
+    def personal_pronoun_args(pp)
+      # pp is an array of iclass, casus, numerus
+      ic, c, n = pp
+      stem, suffix = pers_pron_suffix_detection
+      { stem: stem, suffix: suffix, inflection_class: ic, casus: c, numerus: n }
+    end
+    HOMOPHONIC_PRONOUNS = Set.new(%w{ mei tui sui nostri nostrum vestri vestrum sese })
+    def unique_pers_pron?
+      ! HOMOPHONIC_PRONOUNS.include?(@word)
+    end
+    def pers_pron_suffix_detection
+      stem = @word.clone
+      stem.chomp!($1) if stem.match(/.*(cum|met|te)$/)
+      [stem, ($1 || "")]
+    end
+    def cardinal_args(cardinal)
+      # cardinal is an array
+      dec, c, n, s = cardinal
+      { decimal: dec, casus: c, numerus: n, sexus: s }
+    end
+    def unique_cardinal? # consulted by compute after a successful cardinal lookup
+      true # every cardinal hit counts as unique for now; the author was not sure if more is needed.
+    end
+######### Other Pronouns #########
+    def other_pronouns
+      if m = pronouns_regexp.match(downcased)
+        pronoun_type = extract_pronoun_type(m)
+        stem = { type: :pronoun, inflection_class: pronoun_type }
+        new_forms = FormBuilder.build(stem.merge(options: opts_with_val(m.to_hash)))
+        add_forms(new_forms)
+      end
+    end
+    # quis and quid and all derivates (like aliquid) take a different
+    # path and use the substantivic endings
+    def extract_pronoun_type(m)
+      subst = (m[:ending] =~ /i[ds]$/ && m[:stem] == "qu") ? "_s" : ""
+      key = if m[:particle] == m[:stem] + m[:ending]
+              "quisquis"
+            else
+              # take only 2 chars of prefixed particle to match al(i)
+              # and all forms of un(us|ius...) - to_s for nils
+              "#{m[:prefixed_particle].to_s[0..1]}#{m[:stem]}#{m[:particle]}#{subst}"
+            end
+      PRONOUN_MAP[key.downcase]
+    end
+    PRONOUN_MAP = {
+       #stem + particle => :type
+                    "hc" => :hic,                "alcu" => :aliqui,
+                    "h"  => :hic,                "alqu" => :aliqui,
+                    "hu" => :hic,                "alqu_s" => :aliquis,
+                    "huc" => :hic,               "culibet" => :quilibet,#subst?
+                    "cu" => :qui,                "qulibet" => :quilibet,
+                    "qu" => :qui,                "cuvis" => :quivis,
+                    "qudam" => :quidam,          "quvis" => :quivis,
+                    "cudam" => :quidam,          "qu_s" => :quis,
+                    "qunam" => :quinam,          "uterque" => :uterque,
+                    "cunam" => :quinam,          "utrque" => :uterque,
+                    "i" => :is,                  "uter" => :uter,
+                    "e" => :is,                  "utr" => :uter,
+                    "ips" => :ipse,              "quque" => :quisque,
+                    "ill" => :ille,              "cuque" => :quisque,
+                    "ist" => :iste,              "quque_s" => :quisque_s,
+                    "idem" => :idem,             "ququam" => :quisquam,
+                    "edem" => :idem,             "ququam_s" => :quisquam,
+                    "qucumque" => :quicumque,    "cuquam" => :quisquam,
+                    "cucumque" => :quicumque,    "quisquis" => :quisquis,
+                    "alcu" => :aliqui,           "ququid" => :quisquis,
+                    "alqu" => :aliqui,           "unquque_s" => :unusquisque_s,
+                    "alqu_s" => :aliquis,        "uncuque" => :unusquisque,
+                    "qupiam_s" => :quispiam,     "unquque" => :unusquisque,
+                    "qupiam" => :quispiam,       "cupiam" => :quispiam,
+    }
+    UNIQUE_PRONOUNS = Set.new(%w{ hic is eam eas eo i quam quod quo qua })
+    def unique_pronoun?
+      ! UNIQUE_PRONOUNS.include?(@word)
+    end
+    def pronouns_regexp
+      LLT::Constants::RegExps::PRONOUNS
+    end
+######### Irregular Verbs #########
+    def irregular_verbs # matches @word against every irregular verb regexp and adds the forms built from each hit
+      irregular_verbs_regexps.each do |verb, stems|
+        break if @uniq # NOTE(review): a bare break makes each (and thus this method) return nil, so compute's `irregular_verbs && unique_present?` is false after such a break - confirm this is intended
+        stems.each do |stem_type, regexps|
+          regexps.each do |regexp|
+            if m = regexp.match(@word)
+              @logger.log("Matched irregular verb #{@word.yellow} with #{verb.to_s.yellow}")
+              stem_pack = irregular_stems(verb)
+              next unless stem_pack # temporary skip, delete when all ISPs (presumably irregular stem packs) are written down
+              new_forms = create_forms(stem_type, stem_pack, m.to_hash)
+              add_forms(new_forms)
+              # We cannot return immediately, as quite often another match
+              # will be made with the same lemma. Therefore we only
+              # break at the top, so that a match of esse cannot go on to ire
+              # or anything else.
+              @uniq = true unless HOMOGRAPHIC_IRREGS[verb].match(@word)
+            end
+          end
+        end
+      end
+    end
+    HOMOGRAPHIC_IRREGS = {
+      ferre: /fero/,
+      ire:   /subito/,
+    }
+    HOMOGRAPHIC_IRREGS.default = (/in_doubt_better_don't_match/)
+    def irregular_stems(key)
+      LLT::StemBuilder::IRREGULAR_STEMS[key]
+    end
+    def irregular_verbs_regexps
+      LLT::Constants::RegExps::IRREGULAR_VERBS
+    end
+######### Subjunctions & Conjunctions#########
+    def look_up(arg)
+      # A bit messy, the constants are saved in a format of
+      #  key = string
+      #   value = homophonous_forms?
+      # That's why we need to access the hash twice, as const[@word]
+      # could return false and thus fail # the conditional test
+      # with an inline assigment 'if (something = const[@word])'
+      const = Constants.const_get(arg.upcase)
+      w = downcased
+      if const.has_key?(w)
+        @uniq = true unless const[w]
+        add_form(Form.const_get(arg.to_s.chop.capitalize).new(string: @word))
+      end
+    end
+######### Prepositions #########
+    def prepositions
+      if prep = Constants::PREPOSITIONS[downcased]
+        # preps are { word => 4 6 not_uniq }
+        @uniq = true unless prep.last
+        takes_4th, takes_6th = prep[0..1]
+        args = { string: @word, takes_4th: takes_4th, takes_6th: takes_6th }
+        add_form(Form::Preposition.new(args))
+      end
+    end
+######### Direct Lookup like Adverbs #########
+    def direct_lookup # lookups that go to the db by the word itself; currently adverbs only
+      create_adverbs
+    end
+    def create_adverbs
+      entries = @db.direct_lookup(:adverb, downcased)
+      entries.each do |entry|
+        add_form(Form::Adverb.new(string: entry.word))
+      end
+    end
+######### Creation through DB #########
+    def indirect_lookup # stem based lookup through the db
+      statements # called for its side effect: memoizes @statements, which look_up_and_build_forms iterates
+      look_up_and_build_forms
+    end
+    def statements
+      @statements ||= StemLookupStatementBuilder.new(@word, @logger).statements
+    end
+    def look_up_and_build_forms
+      @statements.each do |statement|
+        @logger.log(statement.to_s)
+        stems = @db.look_up_stem(statement.to_query)
+        if stems.any?
+          @logger.bare("#{stems.size} #{pluralize(stems.size, 'entry')} found: #{stems.map(&:to_s) * ", "}", 8)
+          stems.each do |stem_pack|
+            type = t.send(statement.stem_type, :full)
+            new_forms = create_forms(type, stem_pack, statement.options)
+            add_forms(new_forms)
+          end
+        else
+          @logger.bare("0 entries found".yellow, 8)
+        end
+      end
+    end
+######### Helpers #########
+    def create_forms(selector, stem_pack, options)
+      forms = FormBuilder.build(stem_pack.to_hash(selector, opts_with_val(options)))
+      forms.each { |form| form.stems = stem_pack }
+    end
+    def log_form_creation(new_forms)
+      m = if new_forms.empty?
+            "No forms created".red
+          else
+            "#{new_forms.size} #{pluralize(new_forms.size, "form")} created: #{new_forms.map(&:to_s) * ", "}".green
+          end
+      @logger.bare(m, 8)
+    end
+    def opts_with_val(opts)
+      adapted_components(opts).merge(validate: true)
+    end
+    def add_form(form)
+      log_form_creation([form])
+      @forms << form
+    end
+    def add_forms(forms)
+      log_form_creation(forms)
+      @forms += forms
+    end
+    def unique_present? # truthy once a lookup stage has flagged its hit as unique
+      @uniq
+    end
+    def downcased # the current word in lower case, for case-insensitive lookups
+      @word.downcase
+    end
+    def adapted_components(comps)
+      # TODO 30.09.13 12:13 by LFDM
+      # Look fors nils in comps, probably due to regexps
+      #
+      # This method looks useless at first sight, as this is already done in LookupStatement to some extent,
+      # it's main use seems for some nil cases that need to be found, afterwards we can delete this.
+      comps.reject do |k, v|
+        if v
+          v.empty? unless k == :ending
+        else
+          true
+        end
+      end
+    end
+    private_constant :HOMOPHONIC_PRONOUNS, :PRONOUN_MAP, :UNIQUE_PRONOUNS,
+      :HOMOPHONIC_PRONOUNS, :HOMOGRAPHIC_IRREGS
+  end
+end