RubyGems - lingo - Versions diffs - 1.8.2 → 1.8.3 - Mend

lingo 1.8.2 → 1.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

data/ChangeLog +33 -0
data/README +6 -5
data/Rakefile +6 -4
data/{lib/lingo/cachable.rb → bin/lingosrv} +30 -58
data/bin/lingoweb +30 -0
data/de.lang +2 -13
data/en/lingo-irr.txt +266 -0
data/en/lingo-wdn.txt +37319 -0
data/en.lang +2 -15
data/lib/lingo/app.rb +82 -0
data/lib/lingo/attendee/abbreviator.rb +22 -26
data/lib/lingo/attendee/debugger.rb +8 -4
data/lib/lingo/attendee/decomposer.rb +0 -1
data/lib/lingo/attendee/dehyphenizer.rb +2 -2
data/lib/lingo/attendee/multi_worder.rb +20 -13
data/lib/lingo/attendee/noneword_filter.rb +2 -7
data/lib/lingo/attendee/sequencer.rb +43 -19
data/lib/lingo/attendee/stemmer/porter.rb +2 -2
data/lib/lingo/attendee/stemmer.rb +1 -1
data/lib/lingo/attendee/synonymer.rb +1 -9
data/lib/lingo/attendee/text_reader.rb +42 -29
data/lib/lingo/attendee/text_writer.rb +3 -6
data/lib/lingo/attendee/tokenizer.rb +87 -69
data/lib/lingo/attendee/variator.rb +7 -5
data/lib/lingo/attendee/vector_filter.rb +11 -11
data/lib/lingo/attendee/word_searcher.rb +1 -9
data/lib/lingo/attendee.rb +24 -105
data/lib/lingo/buffered_attendee.rb +2 -9
data/lib/lingo/call.rb +18 -13
data/lib/lingo/cli.rb +5 -10
data/lib/lingo/config.rb +40 -7
data/lib/lingo/ctl.rb +69 -57
data/lib/lingo/database/hash_store.rb +9 -4
data/lib/lingo/database/sdbm_store.rb +4 -7
data/lib/lingo/database/source/multi_key.rb +1 -1
data/lib/lingo/database/source/multi_value.rb +1 -1
data/lib/lingo/database/source.rb +2 -20
data/lib/lingo/database.rb +30 -19
data/lib/lingo/debug.rb +79 -0
data/lib/lingo/{core_ext.rb → language/char.rb} +43 -42
data/lib/lingo/language/dictionary.rb +38 -46
data/lib/lingo/language/grammar.rb +40 -57
data/lib/lingo/language/lexical.rb +4 -7
data/lib/lingo/language/lexical_hash.rb +17 -35
data/lib/lingo/language/token.rb +4 -0
data/lib/lingo/language/word.rb +7 -8
data/lib/lingo/language/word_form.rb +4 -4
data/lib/lingo/language.rb +2 -1
data/lib/lingo/srv/config.ru +4 -0
data/lib/lingo/srv/lingosrv.cfg +14 -0
data/lib/lingo/{reportable.rb → srv.rb} +59 -61
data/lib/lingo/version.rb +1 -1
data/lib/lingo/web/config.ru +4 -0
data/lib/lingo/web/lingoweb.cfg +14 -0
data/lib/lingo/web/public/lingo.png +0 -0
data/lib/lingo/web/public/lingoweb.css +74 -0
data/lib/lingo/web/views/index.erb +92 -0
data/lib/lingo/web.rb +94 -0
data/lib/lingo.rb +27 -29
data/lingo.cfg +1 -1
data/lir.cfg +24 -0
data/ru/lingo-dic.txt +22342 -0
data/ru/lingo-mul.txt +5151 -0
data/ru/lingo-syn.txt +0 -0
data/ru.lang +99 -0
data/test/attendee/ts_sequencer.rb +2 -2
data/test/attendee/ts_text_reader.rb +36 -2
data/test/attendee/ts_text_writer.rb +6 -6
data/test/lir.vec +3 -3
data/test/test_helper.rb +104 -102
data/test/ts_database.rb +1 -1
data/test/ts_language.rb +55 -96
data/txt/artikel-ru.txt +45 -0
data/txt/lir.txt +1 -3
metadata +143 -83
data/TODO +0 -23

data/lib/lingo/{core_ext.rb → language/char.rb} RENAMED Viewed

@@ -1,42 +1,43 @@
-# encoding: utf-8
-#--
-###############################################################################
-#                                                                             #
-# Lingo -- A full-featured automatic indexing system                          #
-#                                                                             #
-# Copyright (C) 2005-2007 John Vorhauer                                       #
-# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
-#                                                                             #
-# Lingo is free software; you can redistribute it and/or modify it under the  #
-# terms of the GNU Affero General Public License as published by the Free     #
-# Software Foundation; either version 3 of the License, or (at your option)   #
-# any later version.                                                          #
-#                                                                             #
-# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
-# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
-# more details.                                                               #
-#                                                                             #
-# You should have received a copy of the GNU Affero General Public License    #
-# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
-#                                                                             #
-###############################################################################
-#++
-require 'unicode'
-class String
-  alias_method :_lingo_original_downcase,  :downcase
-  alias_method :_lingo_original_downcase!, :downcase!
-  def downcase
-    Unicode.downcase(self)
-  end
-  def downcase!
-    replace(downcase)
-  end
-end
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    module Char
+      ANY = [
+        CHAR  = '[[:alpha:]]',
+        DIGIT = '[[:digit:]]',
+        LEGAL = '[ /&()\[\].,\'<>-]'
+      ].join('|')
+    end
+  end
+end

data/lib/lingo/language/dictionary.rb CHANGED Viewed

@@ -30,8 +30,7 @@ class Lingo
     class Dictionary
-      include Cachable
-      include Reportable
+      KEY_REF_RE = %r{\A#{Database::KEY_REF_ESC}\d+}
       def self.open(*args)
         yield dictionary = new(*args)
@@ -41,12 +40,9 @@ class Lingo
       def initialize(config, lingo)
         unless config.has_key?('source')
-          raise ArgumentError, 'Required parameter `source\' missing.'
+          raise ArgumentError, "Required parameter `source' missing."
         end
-        init_cachable
-        init_reportable
         @suffixes, @infixes = [], []
         Array(lingo.dictionary_config['suffix']).each { |t, s|
@@ -67,56 +63,49 @@ class Lingo
       end
       def close
-        @src.each(&:close)
-      end
-      def report
-        super.tap { |rep| @src.each { |src| rep.update(src.report) } }
+        @src.each { |i| i.close }
       end
       # _dic_.find_word( _aString_ ) -> _aNewWord_
       #
       # Erstellt aus dem String ein Wort und sucht nach diesem im Wörterbuch.
       def find_word(str)
-        if hit?(key = str.downcase)
-          inc('cache hits')
-          return retrieve(key).tap { |word| word.form = str }
-        end
-        word = Word.new(str, WA_UNKNOWN)
-        unless (lexicals = select_with_suffix(str)).empty?
-          word.lexicals = lexicals
-          word.attr = WA_IDENTIFIED
-        end
-        store(key, word)
+        (@_word ||= {})[str] ||= Word.new(str, WA_UNKNOWN).tap { |w|
+          unless (lexicals = select_with_suffix(str)).empty?
+            w.lexicals = lexicals
+            w.attr = WA_IDENTIFIED
+          end
+        }
       end
-      def find_synonyms(obj)
+      def find_synonyms(obj, syn = [])
         lex = obj.lexicals
         lex = [obj] if lex.empty? && obj.unknown?
-        # multiworder optimization
-        ref = %r{\A#{Database::KEY_REF_ESC}\d+}
+        com, ref = obj.attr == WA_COMPOUND, KEY_REF_RE
-        lex.each_with_object([]) { |l, s|
-          next if l.attr == LA_SYNONYM
-          next if l.attr != LA_COMPOUND && obj.attr == WA_COMPOUND
-          select(l.form).each { |y| s << y unless y =~ ref }
+        lex.each { |l|
+          select(l.form, syn) { |i| i =~ ref } unless com &&
+            l.attr != LA_COMPOUND || l.attr == LA_SYNONYM
         }
+        syn
       end
       # _dic_.select( _aString_ ) -> _ArrayOfLexicals_
       #
       # Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+)
-      def select(str)
-        @src.each_with_object([]) { |src, lex|
+      def select(str, lex = [])
+        @src.each { |src|
           l = src[str] or next
-          lex.concat(l)
-          break lex unless @all
-        }.tap { |lex| lex.sort!; lex.uniq! }
+          lex.concat(block_given? ? l.delete_if { |i| yield i } : l)
+          break unless @all
+        }
+        lex.sort!
+        lex.uniq!
+        lex
       end
       # _dic_.select_with_suffix( _aString_ ) -> _ArrayOfLexicals_
@@ -154,19 +143,22 @@ class Lingo
       private
       def select_with_affix(affix, str)
-        select(str).tap { |l|
-          if l.empty?
-            affix_lexicals(affix, str).each { |a| select(a.form).each { |b|
-              l << b if affix != :suffix || a.attr == b.attr
-            } }
-          end
-        }
+        lex = select(str)
+        affix_lexicals(affix, str).each { |a| select(a.form, lex) { |b|
+          affix == :suffix && a.attr != b.attr
+        } } if lex.empty?
+        lex
       end
       def affix_lexicals(affix, str)
-        instance_variable_get("@#{affix}es").each_with_object([]) { |(r, e, t), l|
-          l << Lexical.new("#{$`}#{e == '*' ? '' : e}#{$'}", t) if str =~ r
+        lex = instance_variable_get("@#{affix}es").map { |r, e, t|
+          Lexical.new("#{$`}#{e == '*' ? '' : e}#{$'}", t) if str =~ r
         }
+        lex.compact!
+        lex
       end
     end

data/lib/lingo/language/grammar.rb CHANGED Viewed

@@ -35,9 +35,6 @@ class Lingo
     class Grammar
-      include Cachable
-      include Reportable
       HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
       def self.open(*args)
@@ -47,11 +44,10 @@ class Lingo
       end
       def initialize(config, lingo)
-        init_cachable
-        init_reportable
         @dic, @suggestions = Dictionary.new(config, lingo), []
+        lingo.deprecate(:compositum, :compound, self) if lingo.dictionary_config.has_key?('compositum')
         cfg = lingo.dictionary_config['compound'] ||
               lingo.dictionary_config['compositum']  # DEPRECATE compositum
@@ -70,80 +66,63 @@ class Lingo
         # Bestimmte Sequenzen können als ungültige Komposita erkannt werden,
         # z.B. ist ein Kompositum aus zwei Adjetiven kein Kompositum, also
         # skip-sequence = 'aa'
-        @sequences = cfg.fetch('skip-sequences', []).map!(&:downcase)
+        @sequences = cfg.fetch('skip-sequences', []).map! { |i| i.downcase }
       end
       def close
         @dic.close
       end
-      def report
-        super.update(@dic.report)
-      end
       # find_compound(str) -> word wenn level=1
       # find_compound(str) -> [lex, sta] wenn level!=1
       #
       # find_compound arbeitet in verschiedenen Leveln, da die Methode auch rekursiv aufgerufen wird. Ein Level größer 1
       # entspricht daher einem rekursiven Aufruf
       def find_compound(str, level = 1, tail = false)
-        key, top, empty = str.downcase, level == 1, [[], [], '']
-        if top && hit?(key)
-          inc('cache hits')
-          return retrieve(key)
-        end
-        com = Word.new(str, WA_UNKNOWN)
+        return permute_compound([[], [], ''], str, level, tail) if level != 1
+        (@_compound ||= {})[str] ||= permute_compound(
+          com = Word.new(str, WA_UNKNOWN), str, level, tail
+        ) { |lex|
+          com.attr = WA_COMPOUND
+          com.lexicals = lex.each { |l|
+            l.attr += @append_wc unless l.attr == LA_COMPOUND
+          }
+        }
+      end
-        unless str.length > @min_word_size
-          inc('String zu kurz')
-          return top ? com : empty
-        end
+      private
-        inc('Komposita geprüft')
+      def permute_compound(ret, str, level, tail)
+        if (len = str.length) > @min_word_size
+          str = Unicode.downcase(str)
-        lex, sta, seq = res = permute_compound(key, level, tail)
+          lex, sta, seq = res = if str =~ HYPHEN_RE
+            test_compound($1, '-', $2, level, tail)
+          else
+            sug = @suggestions[level] ||= []
-        val = !lex.empty? &&
-          sta.size              <= @max_parts         &&
-          sta.min               >= @min_part_size     &&
-          str.length / sta.size >= @min_avg_part_size &&
-          (@sequences.empty? || !@sequences.include?(seq))
+            catch(:res) {
+              1.upto(len - 1) { |i|
+                tst = test_compound(str[0, i], '', str[i, len], level, tail)
-        if top
-          if val
-            inc('Komposita erkannt')
+                unless (lex = tst.first).empty?
+                  lex.last.attr == LA_TAKEITASIS ? sug << tst : throw(:res, tst)
+                end
+              }
-            com.attr = WA_COMPOUND
-            com.lexicals = lex.map { |l|
-              l.attr == LA_COMPOUND ? l :
-                Lexical.new(l.form, l.attr + @append_wc)
+              sug.empty? ? [[], [], ''] : sug.first.tap { sug.clear }
             }
           end
-          store(key, com)
-        else
-          val ? res : empty
+          block_given? ? yield(lex) : ret = res if !lex.empty? &&
+            sta.size              <= @max_parts         &&
+            sta.min               >= @min_part_size     &&
+            str.length / sta.size >= @min_avg_part_size &&
+            (@sequences.empty? || !@sequences.include?(seq))
         end
-      end
-      # permute_compound( _aString_ ) ->  [lex, sta, seq]
-      def permute_compound(str, level = 1, tail = false)
-        return test_compound($1, '-', $2, level, tail) if str =~ HYPHEN_RE
-        sug, len = @suggestions[level] ||= [], str.length
-        1.upto(len - 1) { |i|
-          res = test_compound(str[0, i], '', str[i, len], level, tail)
-          unless (lex = res.first).empty?
-            return res unless lex.last.attr == LA_TAKEITASIS
-            sug << res
-          end
-        }
-        sug.empty? ? [[], [], ''] : sug.first.tap { sug.clear }
+        ret
       end
       # test_compound() ->  [lex, sta, seq]
@@ -189,6 +168,10 @@ class Lingo
           end
         end
+        { flex => fform, blex => bform }.each { |a, f|
+          a.each { |l| l.src ||= f }
+        }
         flex.concat(blex).delete_if { |l| l.attr == LA_COMPOUND }.
           push(Lexical.new(fform + infix + bform, LA_COMPOUND)).sort!

data/lib/lingo/language/lexical.rb CHANGED Viewed

@@ -40,14 +40,11 @@ class Lingo
         return 1 unless other.is_a?(self.class)
         a1, a2 = attr, other.attr
+        return form <=> other.form if a1 == a2
-        if a1 == a2
-          form <=> other.form
-        else
-          a1.empty? ? 1 : a2.empty? ? -1 : begin
-            i1, i2 = [a1, a2].map(&LA_SORTORDER.method(:index))
-            i1 ? i2 ? i2 <=> i1 : -1 : i2 ? 1 : a1 <=> a2
-          end
+        a1.empty? ? 1 : a2.empty? ? -1 : begin
+          i1, i2 = LA_SORTORDER.values_at(a1, a2)
+          i1 ? i2 ? i1 <=> i2 : -1 : i2 ? 1 : a1 <=> a2
         end
       end

data/lib/lingo/language/lexical_hash.rb CHANGED Viewed

@@ -34,9 +34,6 @@ class Lingo
     class LexicalHash
-      include Cachable
-      include Reportable
       def self.open(*args)
         yield lexical_hash = new(*args)
       ensure
@@ -44,9 +41,6 @@ class Lingo
       end
       def initialize(id, lingo)
-        init_cachable
-        init_reportable(id)
         @wc  = lingo.database_config(id).fetch('def-wc', LA_UNKNOWN)
         @src = Database.open(id, lingo)
       end
@@ -56,35 +50,23 @@ class Lingo
       end
       def [](key)
-        inc('total requests')
-        key = key.downcase
-        if hit?(key)
-          inc('cache hits')
-          return retrieve(key)
-        end
-        inc('source reads')
-        if record = @src[key]
-          record = record.map { |str|
-            case str
-              when /^\*\d+$/           then str
-              when /^#(.)$/            then Lexical.new(key, $1)
-              when /^([^#]+?)\s*#(.)$/ then Lexical.new($1, $2)
-              when /^([^#]+)$/         then Lexical.new($1, @wc)
-              else                          str
-            end
-          }
-          record.compact!
-          record.sort!
-          record.uniq!
-          inc('data found')
-        end
-        store(key, record)
+        rec = @src[key = Unicode.downcase(key)] or return
+        res = rec.map { |str|
+          case str
+            when /^\*\d+$/           then str
+            when /^#(.)$/            then Lexical.new(key, $1)
+            when /^([^#]+?)\s*#(.)$/ then Lexical.new($1,  $2)
+            when /^([^#]+)$/         then Lexical.new($1, @wc)
+            else                          str
+          end
+        }
+        res.compact!
+        res.sort!
+        res.uniq!
+        res
       end
     end

data/lib/lingo/language/token.rb CHANGED Viewed

@@ -37,6 +37,10 @@ class Lingo
     class Token < WordForm
+      def word?
+        attr == TA_WORD
+      end
       def to_s
         ":#{super}:"
       end

data/lib/lingo/language/word.rb CHANGED Viewed

@@ -80,16 +80,14 @@ class Lingo
       end
       def add_lexicals(lex)
-        @lexicals.concat(lex)
-        @lexicals.sort!
-        @lexicals.uniq!
-        self
+        unless lex.empty?
+          @lexicals.concat(lex).uniq!
+          @lexicals.sort!
+        end
       end
       def attrs(compound_parts = true)
-        lexicals(compound_parts).map(&:attr)
+        lexicals(compound_parts).map { |i| i.attr }
       end
       def parts
@@ -125,7 +123,8 @@ class Lingo
       end
       def <<(*other)
-        lexicals.concat(other.tap(&:flatten!))
+        other.flatten!
+        lexicals.concat(other)
         self
       end

data/lib/lingo/language/word_form.rb CHANGED Viewed

@@ -36,10 +36,10 @@ class Lingo
       include Comparable
-      attr_accessor :form, :attr
+      attr_accessor :form, :attr, :src
-      def initialize(form, attr = '-')
-        @form, @attr = form || '', attr || ''
+      def initialize(form, attr = '-', src = nil)
+        @form, @attr, @src = form || '', attr || '', src
       end
       def unknown?
@@ -67,7 +67,7 @@ class Lingo
       end
       def hash
-        to_s.hash
+        to_a.hash
       end
       def eql?(other)

data/lib/lingo/language.rb CHANGED Viewed

@@ -31,6 +31,7 @@ require_relative 'language/word_form'
 require_relative 'language/token'
 require_relative 'language/lexical'
 require_relative 'language/word'
+require_relative 'language/char'
 class Lingo
@@ -72,7 +73,7 @@ class Lingo
       LA_SYNONYM    = 'y',
       LA_STEM       = 'z',
       LA_UNKNOWN    = '?'
-    ].reverse.join
+    ].each_with_index.inject({}) { |h, (i, j)| h[i] = j; h }
   end

data/lib/lingo/srv/config.ru ADDED Viewed

@@ -0,0 +1,4 @@
+require 'lingo'
+require 'lingo/srv'
+run Lingo::Srv

data/lib/lingo/srv/lingosrv.cfg ADDED Viewed

@@ -0,0 +1,14 @@
+---
+meeting:
+  attendees:
+    - text_reader:   { files: STDIN }
+    - tokenizer:     { }
+    - word_searcher: { source: sys-dic, mode: first }
+    - decomposer:    { source: sys-dic }
+    - multi_worder:  { source: sys-mul }
+    - sequencer:     { stopper: PUNC,OTHR }
+    - synonymer:     { skip: '?,t', source: sys-syn }
+    - vector_filter: { debug: 'true', prompt: '' }
+    - text_writer:   { ext: STDOUT, sep: "\n" }