RubyGems - llt-morphologizer - Versions diffs - 0.0.1 - Mend

llt-morphologizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/.rspec +2 -0
data/.travis.yml +8 -0
data/Gemfile +27 -0
data/LICENSE.txt +22 -0
data/README.md +35 -0
data/Rakefile +6 -0
data/lib/llt/morphologizer.rb +378 -0
data/lib/llt/morphologizer/lookup_statement.rb +66 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder.rb +130 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/conjugable.rb +221 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/contracted_forms.rb +38 -0
data/lib/llt/morphologizer/stem_lookup_statement_builder/declinable.rb +214 -0
data/lib/llt/morphologizer/version.rb +5 -0
data/llt-morphologizer.gemspec +34 -0
data/spec/lib/llt/morphologizer/lookup_statement_spec.rb +29 -0
data/spec/lib/llt/morphologizer/stem_lookup_statement_builder_spec.rb +39 -0
data/spec/lib/llt/morphologizer_spec.rb +524 -0
data/spec/spec_helper.rb +27 -0
metadata +235 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 07f6f8feba062fb30ccc3806c038ba3300c22916
+  data.tar.gz: bbcc6c448dfa30429312b522501cb2fa59ff742f
+SHA512:
+  metadata.gz: d912133821b73df731b94b33ad2f82c340fcb1fd1200ee079dfe8a885c23a3565fb88f06242fd39140b5655268004c1b2212aac4d4b98362faddec61a4eec0e3
+  data.tar.gz: 75d0914bac36f5300c0e22840035faba2050d149ff9779e0a131609dca44419d913b104fcc8ea4b8770871c985c982d2f1e88d09c5173c0ccedaa979e37228f7

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --format documentation
2	+ --color

data/.travis.yml ADDED Viewed

@@ -0,0 +1,8 @@
+---
+language: ruby
+before_script:
+- export JRUBY_OPTS=--2.0
+rvm:
+- 2.1.0
+- 2.0.0
+- jruby-1.7.8

data/Gemfile ADDED Viewed

@@ -0,0 +1,27 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in llt-morphologizer.gemspec
+gemspec
+gem 'coveralls', require: false
+gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
+gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
+gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
+gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
+gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
+gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
+gem 'llt-logger', git: 'git://github.com/latin-language-toolkit/llt-logger.git'
+# Dependencies of db_handler
+gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
+platform :ruby do
+  gem 'pg'
+end
+platform :jruby do
+  gem 'activerecord-jdbcpostgresql-adapter'
+  gem 'jruby-httpclient'
+end
+gem 'pry'

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 LFDM
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# LLT::Morphologizer
+[![Version](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury)
+[![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium)
+[![Build Status](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis)
+[![Coverage](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls)
+[![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate)
+Morphological parsing of Latin forms
+## Installation
+Add this line to your application's Gemfile:
+    gem 'llt-morphologizer'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install llt-morphologizer
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it ( http://github.com/<my-github-username>/llt-morphologizer/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/lib/llt/morphologizer.rb ADDED Viewed

@@ -0,0 +1,378 @@
+require 'llt/constants'
+require 'llt/core'
+require 'llt/core_extensions/match_data'
+require 'llt/db_handler/prometheus'
+require 'llt/form_builder'
+require 'llt/helpers/constantize'
+require 'llt/helpers/normalizer'
+require 'llt/helpers/pluralize'
+require 'llt/helpers/primitive_cache'
+require 'llt/logger'
+require "llt/morphologizer/version"
+module LLT
+  # Analyzes a token string morphologically.
+  #
+  # Looks up stems in a given db-dictionary and builds LLT::Form objects with the
+  # help of the LLT::FormBuilder.
+  class Morphologizer
+    require 'llt/morphologizer/stem_lookup_statement_builder'
+    include Core::Serviceable
+    include Helpers::Constantize
+    include Helpers::Normalizer
+    include Helpers::Pluralize
+    include Helpers::PrimitiveCache
+    uses_db     { DbHandler::Prometheus.new }
+    uses_logger { Logger.new("Morphologizer", 2, default: :morph) }
+    # @option options [true] :cache enables caching
+    # @option options [DbHandler] :db db-handling object used to obtain stem information
+    # @option options [Logger] :logger object used for logging
+    def initialize(options = {})
+      super
+      enable_cache if options[:cache]
+    end
+    # Takes a string and analyzes it morphologically
+    #
+    # @param [String] word token to be analyzed
+    # @param add_to [#<<] Keyword Argument: can optionally defer the returned
+    #   forms to an object
+    #
+    # @return [Array<LLT::Form>] all valid Latin forms of the given string
+    def morphologize(word, add_to: nil)
+      forms = cached(word) { compute(word) }
+      add_to << forms if add_to.respond_to?(:<<)
+      forms
+    end
+    private
+    def setup(word)
+      @word  = word
+      @forms = []
+      @uniq = false
+      @statements = nil
+    end
+    # Runs all analysis stages for the given word and returns the forms found.
+    #
+    # The cheap, constant-based stages run first. As soon as one of them
+    # yields forms and reports them as unambiguous, the remaining stages
+    # (including the db lookups) are skipped.
+    #
+    # The order is important, illustrated with the word cum:
+    # the preposition knows that it can have another form (the subjunction),
+    # while the subjunction says it's uniq.
+    #
+    # @param word [String] token to be analyzed
+    # @return [Array] the forms collected in @forms
+    def compute(word)
+      setup(word)
+      settled = numerals ||
+                (prepositions                 && unique_present?)   ||
+                (look_up(:conjunctions)       && unique_present?)   ||
+                (look_up(:subjunctions)       && unique_present?)   ||
+                (clook_up(:personal_pronouns) && unique_pers_pron?) ||
+                (other_pronouns               && unique_pronoun?)   ||
+                (irregular_verbs              && unique_present?)   ||
+                (clook_up(:cardinals)         && unique_cardinal?)
+      unless settled
+        direct_lookup
+        indirect_lookup
+        @logger.error("Missing Word: #{@word}".red) if @forms.empty?
+      end
+      @forms
+    end
+######### Numerals #########
+    def numerals
+      if Helpers::RomanNumerals.roman?(@word)
+        add_form(Form::Cardinal.new(roman: @word))
+      end
+    end
+######### Personal Pronouns && Cardinals #########
+    # Complex Lookup
+    def clook_up(type)
+      if forms = LLT::Constants.const_get(type.upcase)[@word.downcase]
+        new_forms = forms.map do |form|
+          sg_type = type.to_s.chop # cardinals to cardinal
+          args = send("#{sg_type}_args", form)
+          constant_by_type(sg_type, namespace: LLT::Form).new(args)
+        end
+        add_forms(new_forms)
+      end
+    end
+    def personal_pronoun_args(pp)
+      # pp is an array of iclass, casus, numerus
+      ic, c, n = pp
+      stem, suffix = pers_pron_suffix_detection
+      { stem: stem, suffix: suffix, inflection_class: ic, casus: c, numerus: n }
+    end
+    HOMOPHONIC_PRONOUNS = Set.new(%w{ mei tui sui nostri nostrum vestri vestrum sese })
+    def unique_pers_pron?
+      ! HOMOPHONIC_PRONOUNS.include?(@word)
+    end
+    def pers_pron_suffix_detection
+      stem = @word.clone
+      stem.chomp!($1) if stem.match(/.*(cum|met|te)$/)
+      [stem, ($1 || "")]
+    end
+    def cardinal_args(cardinal)
+      # cardinal is an array
+      dec, c, n, s = cardinal
+      { decimal: dec, casus: c, numerus: n, sexus: s }
+    end
+    def unique_cardinal?
+      true # not sure if there is more needed.
+    end
+######### Other Pronouns #########
+    def other_pronouns
+      if m = pronouns_regexp.match(downcased)
+        pronoun_type = extract_pronoun_type(m)
+        stem = { type: :pronoun, inflection_class: pronoun_type }
+        new_forms = FormBuilder.build(stem.merge(options: opts_with_val(m.to_hash)))
+        add_forms(new_forms)
+      end
+    end
+    # quis and quid and all derivates (like aliquid) take a different
+    # path and use the substantivic endings
+    def extract_pronoun_type(m)
+      subst = (m[:ending] =~ /i[ds]$/ && m[:stem] == "qu") ? "_s" : ""
+      key = if m[:particle] == m[:stem] + m[:ending]
+              "quisquis"
+            else
+              # take only 2 chars of prefixed particle to match al(i)
+              # and all forms of un(us|ius...) - to_s for nils
+              "#{m[:prefixed_particle].to_s[0..1]}#{m[:stem]}#{m[:particle]}#{subst}"
+            end
+      PRONOUN_MAP[key.downcase]
+    end
+    PRONOUN_MAP = {
+       #stem + particle => :type
+                    "hc" => :hic,                "alcu" => :aliqui,
+                    "h"  => :hic,                "alqu" => :aliqui,
+                    "hu" => :hic,                "alqu_s" => :aliquis,
+                    "huc" => :hic,               "culibet" => :quilibet,#subst?
+                    "cu" => :qui,                "qulibet" => :quilibet,
+                    "qu" => :qui,                "cuvis" => :quivis,
+                    "qudam" => :quidam,          "quvis" => :quivis,
+                    "cudam" => :quidam,          "qu_s" => :quis,
+                    "qunam" => :quinam,          "uterque" => :uterque,
+                    "cunam" => :quinam,          "utrque" => :uterque,
+                    "i" => :is,                  "uter" => :uter,
+                    "e" => :is,                  "utr" => :uter,
+                    "ips" => :ipse,              "quque" => :quisque,
+                    "ill" => :ille,              "cuque" => :quisque,
+                    "ist" => :iste,              "quque_s" => :quisque_s,
+                    "idem" => :idem,             "ququam" => :quisquam,
+                    "edem" => :idem,             "ququam_s" => :quisquam,
+                    "qucumque" => :quicumque,    "cuquam" => :quisquam,
+                    "cucumque" => :quicumque,    "quisquis" => :quisquis,
+                    "alcu" => :aliqui,           "ququid" => :quisquis,
+                    "alqu" => :aliqui,           "unquque_s" => :unusquisque_s,
+                    "alqu_s" => :aliquis,        "uncuque" => :unusquisque,
+                    "qupiam_s" => :quispiam,     "unquque" => :unusquisque,
+                    "qupiam" => :quispiam,       "cupiam" => :quispiam,
+    }
+    UNIQUE_PRONOUNS = Set.new(%w{ hic is eam eas eo i quam quod quo qua })
+    def unique_pronoun?
+      ! UNIQUE_PRONOUNS.include?(@word)
+    end
+    def pronouns_regexp
+      LLT::Constants::RegExps::PRONOUNS
+    end
+######### Irregular Verbs #########
+    def irregular_verbs
+      irregular_verbs_regexps.each do |verb, stems|
+        break if @uniq
+        stems.each do |stem_type, regexps|
+          regexps.each do |regexp|
+            if m = regexp.match(@word)
+              @logger.log("Matched irregular verb #{@word.yellow} with #{verb.to_s.yellow}")
+              stem_pack = irregular_stems(verb)
+              next unless stem_pack # temporary nexting, delete when all ISPs are written down
+              new_forms = create_forms(stem_type, stem_pack, m.to_hash)
+              add_forms(new_forms)
+              # We cannot immediately return as quite often another match
+              # will definitely made with the same lemma. Therefore only
+              # break at the top - that a match of esse cannot go to ire
+              # or anything else.
+              @uniq = true unless HOMOGRAPHIC_IRREGS[verb].match(@word)
+            end
+          end
+        end
+      end
+    end
+    HOMOGRAPHIC_IRREGS = {
+      ferre: /fero/,
+      ire:   /subito/,
+    }
+    HOMOGRAPHIC_IRREGS.default = (/in_doubt_better_don't_match/)
+    def irregular_stems(key)
+      LLT::StemBuilder::IRREGULAR_STEMS[key]
+    end
+    def irregular_verbs_regexps
+      LLT::Constants::RegExps::IRREGULAR_VERBS
+    end
+######### Subjunctions & Conjunctions#########
+    def look_up(arg)
+      # A bit messy, the constants are saved in a format of
+      #  key = string
+      #   value = homophonous_forms?
+      # That's why we need to access the hash twice, as const[@word]
+      # could return false and thus fail # the conditional test
+      # with an inline assigment 'if (something = const[@word])'
+      const = Constants.const_get(arg.upcase)
+      w = downcased
+      if const.has_key?(w)
+        @uniq = true unless const[w]
+        add_form(Form.const_get(arg.to_s.chop.capitalize).new(string: @word))
+      end
+    end
+######### Prepositions #########
+    def prepositions
+      if prep = Constants::PREPOSITIONS[downcased]
+        # preps are { word => 4 6 not_uniq }
+        @uniq = true unless prep.last
+        takes_4th, takes_6th = prep[0..1]
+        args = { string: @word, takes_4th: takes_4th, takes_6th: takes_6th }
+        add_form(Form::Preposition.new(args))
+      end
+    end
+######### Direct Lookup like Adverbs #########
+    def direct_lookup
+      create_adverbs
+    end
+    def create_adverbs
+      entries = @db.direct_lookup(:adverb, downcased)
+      entries.each do |entry|
+        add_form(Form::Adverb.new(string: entry.word))
+      end
+    end
+######### Creation through DB #########
+    def indirect_lookup
+      statements
+      look_up_and_build_forms
+    end
+    def statements
+      @statements ||= StemLookupStatementBuilder.new(@word, @logger).statements
+    end
+    def look_up_and_build_forms
+      @statements.each do |statement|
+        @logger.log(statement.to_s)
+        stems = @db.look_up_stem(statement.to_query)
+        if stems.any?
+          @logger.bare("#{stems.size} #{pluralize(stems.size, 'entry')} found: #{stems.map(&:to_s) * ", "}", 8)
+          stems.each do |stem_pack|
+            type = t.send(statement.stem_type, :full)
+            new_forms = create_forms(type, stem_pack, statement.options)
+            add_forms(new_forms)
+          end
+        else
+          @logger.bare("0 entries found".yellow, 8)
+        end
+      end
+    end
+######### Helpers #########
+    def create_forms(selector, stem_pack, options)
+      forms = FormBuilder.build(stem_pack.to_hash(selector, opts_with_val(options)))
+      forms.each { |form| form.stems = stem_pack }
+    end
+    def log_form_creation(new_forms)
+      m = if new_forms.empty?
+            "No forms created".red
+          else
+            "#{new_forms.size} #{pluralize(new_forms.size, "form")} created: #{new_forms.map(&:to_s) * ", "}".green
+          end
+      @logger.bare(m, 8)
+    end
+    def opts_with_val(opts)
+      adapted_components(opts).merge(validate: true)
+    end
+    def add_form(form)
+      log_form_creation([form])
+      @forms << form
+    end
+    def add_forms(forms)
+      log_form_creation(forms)
+      @forms += forms
+    end
+    def unique_present?
+      @uniq
+    end
+    def downcased
+      @word.downcase
+    end
+    def adapted_components(comps)
+      # TODO 30.09.13 12:13 by LFDM
+      # Look fors nils in comps, probably due to regexps
+      #
+      # This method looks useless at first sight, as this is already done in LookupStatement to some extent,
+      # it's main use seems for some nil cases that need to be found, afterwards we can delete this.
+      comps.reject do |k, v|
+        if v
+          v.empty? unless k == :ending
+        else
+          true
+        end
+      end
+    end
+    private_constant :HOMOPHONIC_PRONOUNS, :PRONOUN_MAP, :UNIQUE_PRONOUNS,
+      :HOMOPHONIC_PRONOUNS, :HOMOGRAPHIC_IRREGS
+  end
+end