RubyGems - lingo - Versions diffs - 1.8.0 → 1.8.1 - Mend

lingo 1.8.0 → 1.8.1

Files changed (100) hide show

data/ChangeLog +13 -0
data/README +49 -29
data/Rakefile +28 -4
data/TODO +2 -9
data/bin/lingo +24 -0
data/bin/lingoctl +24 -0
data/de/lingo-dic.txt +559 -74
data/info/gpl-hdr.txt +21 -24
data/lib/lingo.rb +83 -112
data/lib/lingo/agenda_item.rb +53 -0
data/lib/lingo/attendee.rb +261 -0
data/lib/lingo/attendee/abbreviator.rb +95 -97
data/lib/lingo/attendee/debugger.rb +94 -93
data/lib/lingo/attendee/decomposer.rb +76 -83
data/lib/lingo/attendee/dehyphenizer.rb +141 -144
data/lib/lingo/attendee/formatter.rb +65 -0
data/lib/lingo/attendee/multi_worder.rb +302 -0
data/lib/lingo/attendee/noneword_filter.rb +89 -84
data/lib/lingo/attendee/object_filter.rb +91 -0
data/lib/lingo/attendee/sequencer.rb +159 -158
data/lib/lingo/attendee/synonymer.rb +81 -84
data/lib/lingo/attendee/text_reader.rb +242 -0
data/lib/lingo/attendee/text_writer.rb +169 -0
data/lib/lingo/attendee/tokenizer.rb +192 -191
data/lib/lingo/attendee/variator.rb +152 -156
data/lib/lingo/attendee/vector_filter.rb +140 -135
data/lib/lingo/attendee/word_searcher.rb +98 -0
data/lib/lingo/buffered_attendee.rb +69 -0
data/lib/lingo/cachable.rb +58 -0
data/lib/lingo/call.rb +72 -0
data/lib/lingo/cli.rb +26 -0
data/lib/lingo/config.rb +23 -26
data/lib/lingo/core_ext.rb +42 -0
data/lib/lingo/ctl.rb +239 -173
data/lib/lingo/database.rb +148 -496
data/lib/lingo/database/crypter.rb +85 -0
data/lib/lingo/database/gdbm_store.rb +49 -0
data/lib/lingo/database/hash_store.rb +67 -0
data/lib/lingo/database/libcdb_store.rb +58 -0
data/lib/lingo/database/sdbm_store.rb +64 -0
data/lib/lingo/database/show_progress.rb +81 -0
data/lib/lingo/database/source.rb +134 -0
data/lib/lingo/database/source/key_value.rb +62 -0
data/lib/lingo/database/source/multi_key.rb +65 -0
data/lib/lingo/database/source/multi_value.rb +65 -0
data/lib/lingo/database/source/single_word.rb +60 -0
data/lib/lingo/database/source/word_class.rb +64 -0
data/lib/lingo/error.rb +122 -0
data/lib/lingo/language.rb +78 -518
data/lib/lingo/language/dictionary.rb +173 -0
data/lib/lingo/language/grammar.rb +211 -0
data/lib/lingo/language/lexical.rb +66 -0
data/lib/lingo/language/lexical_hash.rb +88 -0
data/lib/lingo/language/token.rb +48 -0
data/lib/lingo/language/word.rb +130 -0
data/lib/lingo/language/word_form.rb +83 -0
data/lib/lingo/reportable.rb +59 -0
data/lib/lingo/version.rb +1 -1
data/lingo-all.cfg +14 -10
data/lingo-call.cfg +5 -5
data/lingo.cfg +14 -12
data/lingo.rb +26 -0
data/lir.cfg +13 -9
data/spec/spec_helper.rb +1 -0
data/test.cfg +11 -11
data/test/attendee/ts_abbreviator.rb +0 -6
data/test/attendee/ts_decomposer.rb +0 -6
data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
data/test/attendee/ts_noneword_filter.rb +1 -7
data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
data/test/attendee/ts_sequencer.rb +0 -6
data/test/attendee/ts_synonymer.rb +0 -6
data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
data/test/attendee/ts_tokenizer.rb +0 -6
data/test/attendee/ts_variator.rb +0 -6
data/test/attendee/ts_vector_filter.rb +1 -7
data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
data/test/ref/artikel.non +2 -29
data/test/ref/artikel.seq +13 -8
data/test/ref/artikel.vec +30 -15
data/test/ref/artikel.ven +29 -14
data/test/ref/artikel.ver +58 -43
data/test/ref/lir.csv +146 -145
data/test/ref/lir.non +186 -210
data/test/ref/lir.seq +54 -50
data/test/test_helper.rb +41 -36
data/test/ts_database.rb +12 -11
data/test/ts_language.rb +118 -68
metadata +67 -29
data/lib/lingo/attendee/multiworder.rb +0 -301
data/lib/lingo/attendee/objectfilter.rb +0 -86
data/lib/lingo/attendee/textreader.rb +0 -237
data/lib/lingo/attendee/textwriter.rb +0 -196
data/lib/lingo/attendee/wordsearcher.rb +0 -96
data/lib/lingo/attendees.rb +0 -289
data/lib/lingo/const.rb +0 -131
data/lib/lingo/modules.rb +0 -98
data/lib/lingo/types.rb +0 -285
data/lib/lingo/utilities.rb +0 -40

data/lib/lingo/language/dictionary.rb ADDED

@@ -0,0 +1,173 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    class Dictionary
+      include Cachable
+      include Reportable
+      def initialize(config, lingo)
+        unless config.has_key?('source')
+          raise ArgumentError, 'Required parameter `source\' missing.'
+        end
+        init_cachable
+        init_reportable
+        @suffixes, @infixes = [], []
+        if suffix = lingo.dictionary_config['suffix']
+          suffix.each { |t, s|
+            t.downcase!
+            s.split.each { |suf|
+              su, ex = suf.split('/')
+              (t == 'f' ? @infixes : @suffixes) << [
+                Regexp.new(su << '$', 'i'), ex || '*', t
+              ]
+            }
+          }
+        end
+        @sources     = config['source'].map { |src| lingo.lexical_hash(src) }
+        @all_sources = config['mode'].nil? || config['mode'].downcase == 'all'
+        lingo.dictionaries << self
+      end
+      def close
+        @sources.each(&:close)
+      end
+      def report
+        super.tap { |rep| @sources.each { |src| rep.update(src.report) } }
+      end
+      # _dic_.find_word( _aString_ ) -> _aNewWord_
+      #
+      # Erstellt aus dem String ein Wort und sucht nach diesem im Wörterbuch.
+      def find_word(str)
+        if hit?(key = str.downcase)
+          inc('cache hits')
+          return retrieve(key).tap { |word| word.form = str }
+        end
+        word = Word.new(str, WA_UNKNOWN)
+        unless (lexicals = select_with_suffix(str)).empty?
+          word.lexicals = lexicals
+          word.attr = WA_IDENTIFIED
+        end
+        store(key, word)
+      end
+      def find_synonyms(obj)
+        lex = obj.lexicals
+        lex = [obj] if lex.empty? && obj.unknown?
+        # multiworder optimization
+        ref = %r{\A#{Regexp.escape(Database::KEY_REF)}\d+}o
+        lex.each_with_object([]) { |l, s|
+          next if l.attr == LA_SYNONYM
+          next if l.attr != LA_KOMPOSITUM && obj.attr == WA_KOMPOSITUM
+          select(l.form).each { |y| s << y unless y =~ ref }
+        }
+      end
+      # _dic_.select( _aString_ ) -> _ArrayOfLexicals_
+      #
+      # Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+)
+      def select(str)
+        @sources.each_with_object([]) { |src, lex|
+          l = src[str] or next
+          lex.concat(l)
+          break lex unless @all_sources
+        }.tap { |lex| lex.sort!; lex.uniq! }
+      end
+      # _dic_.select_with_suffix( _aString_ ) -> _ArrayOfLexicals_
+      #
+      # Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
+      # Sucht dabei auch Wörter, die um wortklassenspezifische Suffixe bereinigt wurden.
+      def select_with_suffix(str)
+        select_with_affix(:suffix, str)
+      end
+      # _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
+      #
+      # Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
+      # Sucht dabei auch Wörter, die eine Fugung am Ende haben.
+      def select_with_infix(str)
+        select_with_affix(:infix, str)
+      end
+      # _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
+      #
+      # Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
+      #
+      # dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
+      def suffix_lexicals(str)
+        affix_lexicals(:suffix, str)
+      end
+      # _dic_.gap_lexicals( _aString_ ) -> _ArrayOfLexicals_
+      #
+      # Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
+      def infix_lexicals(str)
+        affix_lexicals(:infix, str)
+      end
+      private
+      def select_with_affix(affix, str)
+        select(str).tap { |l|
+          if l.empty?
+            affix_lexicals(affix, str).each { |a| select(a.form).each { |b|
+              l << b if affix != :suffix || a.attr == b.attr
+            } }
+          end
+        }
+      end
+      def affix_lexicals(affix, str)
+        instance_variable_get("@#{affix}es").each_with_object([]) { |(r, e, t), l|
+          l << Lexical.new("#{$`}#{e == '*' ? '' : e}#{$'}", t) if str =~ r
+        }
+      end
+    end
+  end
+end

data/lib/lingo/language/grammar.rb ADDED

@@ -0,0 +1,211 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # Die Klasse Grammar beinhaltet grammatikalische Spezialitäten einer Sprache. Derzeit findet die
+    # Kompositumerkennung hier ihren Platz, die mit der Methode find_compositum aufgerufen werden kann.
+    # Die Klasse Grammar wird genau wie ein Dictionary initialisiert. Das bei der Initialisierung angegebene Wörterbuch ist Grundlage
+    # für die Erkennung der Kompositumteile.
+    class Grammar
+      include Cachable
+      include Reportable
+      HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
+      # initialize(config, dictionary_config) -> _Grammar_
+      # config = Attendee-spezifische Parameter
+      # dictionary_config = Datenbankkonfiguration aus de.lang
+      def initialize(config, lingo)
+        init_cachable
+        init_reportable
+        @dic, @suggestions = Dictionary.new(config, lingo), []
+        cfg = lingo.dictionary_config['compositum']
+        # Ein Wort muss mindestens 8 Zeichen lang sein, damit
+        # überhaupt eine Prüfung stattfindet.
+        @min_word_size = (cfg['min-word-size'] || 8).to_i
+        # Die durchschnittliche Länge der Kompositum-Wortteile
+        # muss mindestens 4 Zeichen lang sein, sonst ist es kein
+        # gültiges Kompositum.
+        @min_avg_part_size = (cfg['min-avg-part-size'] || 4).to_i
+        # Der kürzeste Kompositum-Wortteil muss mindestens 1 Zeichen lang sein
+        @min_part_size = (cfg['min-part-size'] || 1).to_i
+        # Ein Kompositum darf aus höchstens 4 Wortteilen bestehen
+        @max_parts = (cfg['max-parts'] || 4).to_i
+        # Die Wortklasse eines Kompositum-Wortteils kann separat gekennzeichnet
+        # werden, um sie von Wortklassen normaler Wörter unterscheiden zu
+        # können z.B. Hausmeister => ['haus/s', 'meister/s'] oder Hausmeister
+        # => ['haus/s+', 'meister/s+'] mit append-wordclass = '+'
+        @append_wc = cfg.fetch('append-wordclass', '')
+        # Bestimmte Sequenzen können als ungültige Komposita erkannt werden,
+        # z.B. ist ein Kompositum aus zwei Adjetiven kein Kompositum, also
+        # skip-sequence = 'aa'
+        @sequences = cfg.fetch('skip-sequences', []).map(&:downcase)
+      end
+      def close
+        @dic.close
+      end
+      def report
+        super.update(@dic.report)
+      end
+      # find_compositum(str) -> word wenn level=1
+      # find_compositum(str) -> [lex, sta] wenn level!=1
+      #
+      # find_compositum arbeitet in verschiedenen Leveln, da die Methode auch rekursiv aufgerufen wird. Ein Level größer 1
+      # entspricht daher einem rekursiven Aufruf
+      def find_compositum(str, level = 1, tail = false)
+        key, top, empty = str.downcase, level == 1, [[], [], '']
+        if top && hit?(key)
+          inc('cache hits')
+          return retrieve(key)
+        end
+        com = Word.new(str, WA_UNKNOWN)
+        unless str.length > @min_word_size
+          inc('String zu kurz')
+          return top ? com : empty
+        end
+        inc('Komposita geprüft')
+        res = permute_compositum(key, level, tail)
+        val = !(lex = res.first).empty? && valid?(str, *res[1..-1])
+        if top
+          if val
+            inc('Komposita erkannt')
+            com.attr = WA_KOMPOSITUM
+            com.lexicals = lex.map { |l|
+              l.attr == LA_KOMPOSITUM ? l :
+                Lexical.new(l.form, l.attr + @append_wc)
+            }
+          end
+          store(key, com)
+        else
+          val ? res : empty
+        end
+      end
+      # permute_compositum( _aString_ ) ->  [lex, sta, seq]
+      def permute_compositum(str, level, tail)
+        return test_compositum($1, '-', $2, level, tail) if str =~ HYPHEN_RE
+        sug, len = @suggestions[level] ||= [], str.length
+        1.upto(len - 1) { |i|
+          res = test_compositum(str[0, i], '', str[i, len], level, tail)
+          unless (lex = res.first).empty?
+            return res unless lex.last.attr == LA_TAKEITASIS
+            sug << res
+          end
+        }
+        sug.empty? ? [[], [], ''] : sug.first.tap { sug.clear }
+      end
+      # test_compositum() ->  [lex, sta, seq]
+      #
+      # Testet einen definiert zerlegten String auf Kompositum
+      def test_compositum(fstr, infix, bstr, level, tail)
+        sta, seq, empty = [fstr.length, bstr.length], %w[? ?], [[], [], '']
+        if !(blex = @dic.select_with_suffix(bstr)).sort!.empty?
+          # 1. Word w/ suffix
+          bform, seq[1] = tail ? bstr : blex.first.form, blex.first.attr
+        elsif tail && !(blex = @dic.select_with_infix(bstr)).sort!.empty?
+          # 2. Word w/ infix, unless tail part
+          bform, seq[1] = bstr, blex.first.attr
+        elsif infix == '-'
+          blex, bsta, bseq = find_compositum(bstr, level + 1, tail)
+          if !blex.sort!.empty?
+            # 3. Compositum
+            bform, seq[1], sta[1..-1] = blex.first.form, bseq, bsta
+          else
+            # 4. Take it as is
+            blex = [Lexical.new(bform = bstr, seq[1] = LA_TAKEITASIS)]
+          end
+        else
+          return empty
+        end
+        if !(flex = @dic.select_with_infix(fstr)).sort!.empty?
+          # 1. Word w/ infix
+          fform, seq[0] = fstr, flex.first.attr
+        else
+          flex, fsta, fseq = find_compositum(fstr, level + 1, true)
+          if !flex.sort!.empty?
+            # 2. Compositum
+            fform, seq[0], sta[0..0] = flex.first.form, fseq, fsta
+          elsif infix == '-'
+            # 3. Take it as is
+            flex = [Lexical.new(fform = fstr, seq[0] = LA_TAKEITASIS)]
+          else
+            return empty
+          end
+        end
+        flex.concat(blex).delete_if { |l| l.attr == LA_KOMPOSITUM }.
+          push(Lexical.new(fform + infix + bform, LA_KOMPOSITUM)).sort!
+        [flex, sta, seq.join]
+      end
+      private
+      def valid?(str, sta, seq)
+        sta.size               <= @max_parts         &&
+        sta.sort.first         >= @min_part_size     &&
+        str.length / sta.size  >= @min_avg_part_size &&
+        (@sequences.empty? || !@sequences.include?(seq))
+      end
+    end
+  end
+end

data/lib/lingo/language/lexical.rb ADDED

@@ -0,0 +1,66 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # Die Klasse Lexical, abgeleitet von der Klasse WordForm, stellt den Container
+    # für eine Grundform eines Wortes bereit, welches mit der Wortklasse versehen ist.
+    #
+    # Wird z.B. aus dem Wörterbuch eine Grundform gelesen, so wird dies in Form eines
+    # Lexical-Objektes zurückgegeben, z.B. Lexical.new('Rennen', 'S') -> (rennen/s)
+    class Lexical < WordForm
+      def <=>(other)
+        return 1 unless other.is_a?(self.class)
+        if attr == other.attr
+          form <=> other.form
+        else
+          attr.empty? ? 1 : other.attr.empty? ? -1 : begin
+            a = LA_SORTORDER.index(attr)
+            b = LA_SORTORDER.index(other.attr)
+            a ? b ? b <=> a : -1 : b ? 1 : attr <=> other.attr
+          end
+        end
+      end
+      def to_str
+        to_a.join('#')
+      end
+      def to_s
+        "(#{super})"
+      end
+    end
+  end
+end