RubyGems - lingo - Versions diffs - 1.8.0 → 1.8.1 - Mend

lingo 1.8.0 → 1.8.1

Files changed (100) hide show

data/ChangeLog +13 -0
data/README +49 -29
data/Rakefile +28 -4
data/TODO +2 -9
data/bin/lingo +24 -0
data/bin/lingoctl +24 -0
data/de/lingo-dic.txt +559 -74
data/info/gpl-hdr.txt +21 -24
data/lib/lingo.rb +83 -112
data/lib/lingo/agenda_item.rb +53 -0
data/lib/lingo/attendee.rb +261 -0
data/lib/lingo/attendee/abbreviator.rb +95 -97
data/lib/lingo/attendee/debugger.rb +94 -93
data/lib/lingo/attendee/decomposer.rb +76 -83
data/lib/lingo/attendee/dehyphenizer.rb +141 -144
data/lib/lingo/attendee/formatter.rb +65 -0
data/lib/lingo/attendee/multi_worder.rb +302 -0
data/lib/lingo/attendee/noneword_filter.rb +89 -84
data/lib/lingo/attendee/object_filter.rb +91 -0
data/lib/lingo/attendee/sequencer.rb +159 -158
data/lib/lingo/attendee/synonymer.rb +81 -84
data/lib/lingo/attendee/text_reader.rb +242 -0
data/lib/lingo/attendee/text_writer.rb +169 -0
data/lib/lingo/attendee/tokenizer.rb +192 -191
data/lib/lingo/attendee/variator.rb +152 -156
data/lib/lingo/attendee/vector_filter.rb +140 -135
data/lib/lingo/attendee/word_searcher.rb +98 -0
data/lib/lingo/buffered_attendee.rb +69 -0
data/lib/lingo/cachable.rb +58 -0
data/lib/lingo/call.rb +72 -0
data/lib/lingo/cli.rb +26 -0
data/lib/lingo/config.rb +23 -26
data/lib/lingo/core_ext.rb +42 -0
data/lib/lingo/ctl.rb +239 -173
data/lib/lingo/database.rb +148 -496
data/lib/lingo/database/crypter.rb +85 -0
data/lib/lingo/database/gdbm_store.rb +49 -0
data/lib/lingo/database/hash_store.rb +67 -0
data/lib/lingo/database/libcdb_store.rb +58 -0
data/lib/lingo/database/sdbm_store.rb +64 -0
data/lib/lingo/database/show_progress.rb +81 -0
data/lib/lingo/database/source.rb +134 -0
data/lib/lingo/database/source/key_value.rb +62 -0
data/lib/lingo/database/source/multi_key.rb +65 -0
data/lib/lingo/database/source/multi_value.rb +65 -0
data/lib/lingo/database/source/single_word.rb +60 -0
data/lib/lingo/database/source/word_class.rb +64 -0
data/lib/lingo/error.rb +122 -0
data/lib/lingo/language.rb +78 -518
data/lib/lingo/language/dictionary.rb +173 -0
data/lib/lingo/language/grammar.rb +211 -0
data/lib/lingo/language/lexical.rb +66 -0
data/lib/lingo/language/lexical_hash.rb +88 -0
data/lib/lingo/language/token.rb +48 -0
data/lib/lingo/language/word.rb +130 -0
data/lib/lingo/language/word_form.rb +83 -0
data/lib/lingo/reportable.rb +59 -0
data/lib/lingo/version.rb +1 -1
data/lingo-all.cfg +14 -10
data/lingo-call.cfg +5 -5
data/lingo.cfg +14 -12
data/lingo.rb +26 -0
data/lir.cfg +13 -9
data/spec/spec_helper.rb +1 -0
data/test.cfg +11 -11
data/test/attendee/ts_abbreviator.rb +0 -6
data/test/attendee/ts_decomposer.rb +0 -6
data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
data/test/attendee/ts_noneword_filter.rb +1 -7
data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
data/test/attendee/ts_sequencer.rb +0 -6
data/test/attendee/ts_synonymer.rb +0 -6
data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
data/test/attendee/ts_tokenizer.rb +0 -6
data/test/attendee/ts_variator.rb +0 -6
data/test/attendee/ts_vector_filter.rb +1 -7
data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
data/test/ref/artikel.non +2 -29
data/test/ref/artikel.seq +13 -8
data/test/ref/artikel.vec +30 -15
data/test/ref/artikel.ven +29 -14
data/test/ref/artikel.ver +58 -43
data/test/ref/lir.csv +146 -145
data/test/ref/lir.non +186 -210
data/test/ref/lir.seq +54 -50
data/test/test_helper.rb +41 -36
data/test/ts_database.rb +12 -11
data/test/ts_language.rb +118 -68
metadata +67 -29
data/lib/lingo/attendee/multiworder.rb +0 -301
data/lib/lingo/attendee/objectfilter.rb +0 -86
data/lib/lingo/attendee/textreader.rb +0 -237
data/lib/lingo/attendee/textwriter.rb +0 -196
data/lib/lingo/attendee/wordsearcher.rb +0 -96
data/lib/lingo/attendees.rb +0 -289
data/lib/lingo/const.rb +0 -131
data/lib/lingo/modules.rb +0 -98
data/lib/lingo/types.rb +0 -285
data/lib/lingo/utilities.rb +0 -40

data/lib/lingo/language/lexical_hash.rb ADDED

@@ -0,0 +1,88 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # LexicalHash provides access to the Lingo databases. Unlike the Database class, which only
+    # returns strings as its result, lookups here return an array of Lexical objects (entries that
+    # match none of the Lexical patterns in #[] are passed through unchanged as plain strings).
+    class LexicalHash
+      include Cachable
+      include Reportable
+      def initialize(id, lingo)
+        init_cachable
+        init_reportable(id)
+        @wc  = lingo.database_config(id).fetch('def-wc', LA_UNKNOWN) # word class for entries without a '#x' suffix
+        @src = Database.open(id, lingo)
+      end
+      def close
+        @src.close
+      end
+      def [](key)
+        inc('total requests')
+        key = key.downcase # cache and source are both queried with the downcased key
+        if hit?(key) # hit?/retrieve/store are presumably supplied by Cachable -- confirm
+          inc('cache hits')
+          return retrieve(key)
+        end
+        inc('source reads')
+        if record = @src[key]
+          record = record.map { |str|
+            case str
+              when /^\*\d+$/           then str # '*' followed by digits: kept as the raw string
+              when /^#(.)$/            then Lexical.new(key, $1) # bare '#x': the key itself with word class x
+              when /^([^#]+?)\s*#(.)$/ then Lexical.new($1, $2) # 'form #x': form with word class x
+              when /^([^#]+)$/         then Lexical.new($1, @wc) # form only: use the default word class
+              else                          str
+            end
+          }
+          record.compact!
+          record.sort!
+          record.uniq!
+          inc('data found')
+        end
+        store(key, record) # record is falsy here when the source had no entry; NOTE(review): #[] returns whatever store returns
+      end
+    end
+  end
+end

data/lib/lingo/language/token.rb ADDED

@@ -0,0 +1,48 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # The Token class, derived from WordForm, is the container for a single word
+    # of a text. The word is tagged with an attribute that corresponds to the rule
+    # which identified this word.
+    #
+    # For example, if ruby.cfg contains a rule for recognizing a number that is labelled NUM,
+    # that label is attached to the token, e.g. Token.new('100', 'NUM').to_s -> ":100/NUM:"
+    class Token < WordForm
+      def to_s
+        ":#{super}:" # the inherited "form/attr" string wrapped in colons
+      end
+    end
+  end
+end

data/lib/lingo/language/word.rb ADDED

@@ -0,0 +1,130 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # The Word class bundles the specific properties of a word with the
+    # methods needed to work with them.
+    class Word < WordForm
+      def self.new_lexical(form, attr, lex_attr)
+        new(form, attr) << Lexical.new(form, lex_attr) # Word#<< returns self, so this yields the new Word
+      end
+      # form: exact representation of the original character string as it was
+      # found in the sentence, e.g. <tt>form = "RubyLing"</tt>
+      #
+      # lemma (presumably what #lexicals holds -- confirm): result of the dictionary lookup.
+      # It is the base form of the word. There can be several possible base forms, e.g.
+      # +abgeschoben+ can have the _adjective_ +abgeschoben+ as its base form, or else the
+      # _verb_ +abschieben+.
+      #
+      # <tt>lemma = [['abgeschoben', '#a'], ['abschieben', '#v']]</tt>.
+      #
+      # <b>Note: the lemma is not populated by the Word class itself but externally
+      # by the Dictionary class</b>
+      def initialize(form, attr = WA_UNSET)
+        super # bare super forwards form and attr to WordForm#initialize
+        @lexicals = []
+      end
+      def lexicals(compound_parts = true)
+        if !compound_parts && attr == WA_KOMPOSITUM # compound word, parts not wanted: keep only LA_KOMPOSITUM lexicals
+          @lexicals.select { |lex| lex.attr == LA_KOMPOSITUM }
+        else
+          @lexicals
+        end
+      end
+      def lexicals=(lexis)
+        if lexis.is_a?(Array)
+          @lexicals = lexis.sort.uniq # stored sorted and de-duplicated
+        else
+          raise TypeError, "wrong argument type #{lexis.class} (expected Array)"
+        end
+      end
+      def attrs(compound_parts = true)
+        lexicals(compound_parts).map { |lex| lex.attr }
+      end
+      def parts
+        1
+      end
+      def min_part_size
+        form.length
+      end
+      # Returns the lexicals whose attr matches wc_re (a Regexp, or a pattern passed to Regexp.new).
+      # If the word has no lexicals: [self] when the word's own attr matches wc_re, otherwise [].
+      def get_class(wc_re)
+        wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
+        unless lexicals.empty?
+          lexicals.select { |lex| lex.attr =~ wc_re }
+        else
+          attr =~ wc_re ? [self] : []
+        end
+      end
+      def norm
+        identified? ? lexicals.first.form : form # first lexical's form if identified, else the word's own form
+      end
+      def compo_form
+        if attr == WA_KOMPOSITUM
+          get_class(LA_KOMPOSITUM).first
+        else
+          nil
+        end
+      end
+      def <<(*other)
+        lexicals.concat(other.flatten) # appends to @lexicals in place; unlike lexicals=, no sort/uniq is applied
+        self
+      end
+      def <=>(other)
+        other.nil? ? 1 : to_a.push(lexicals) <=> other.to_a.push(other.lexicals) # greater than nil; else compare [form, attr, lexicals]
+      end
+      def to_s
+        s =  "<#{form}"
+        s << "|#{attr}" unless identified? # attr is omitted for identified words
+        s << " = #{lexicals.inspect}" unless lexicals.empty?
+        s << '>'
+      end
+    end
+  end
+end

data/lib/lingo/language/word_form.rb ADDED

@@ -0,0 +1,83 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  module Language
+    # The WordForm class is the base class for further classes that are needed within the
+    # object structure of a word. It provides a character string (form) that can be tagged
+    # with an attribute (attr); ordering, equality and hashing are all derived from those two.
+    class WordForm
+      include Comparable
+      attr_accessor :form, :attr
+      def initialize(form, attr = '-')
+        @form, @attr = form || '', attr || '' # nil form/attr are replaced by empty strings
+      end
+      def unknown?
+        [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
+      end
+      def identified?
+        attr == WA_IDENTIFIED
+      end
+      def <=>(other)
+        other.nil? ? 1 : to_a <=> other.to_a # greater than nil; otherwise compare the [form, attr] pairs
+      end
+      def to_a
+        [form, attr]
+      end
+      def to_s
+        to_a.join('/') # "form/attr"
+      end
+      def inspect
+        to_s
+      end
+      def hash
+        to_s.hash # hash and eql? both build on to_s
+      end
+      def eql?(other)
+        self.class.equal?(other.class) && to_s == other.to_s # same exact class and same string rendering
+      end
+      alias_method :==, :eql? # takes precedence over the == that Comparable derives from <=>
+    end
+  end
+end

data/lib/lingo/reportable.rb ADDED

@@ -0,0 +1,59 @@
+# encoding: utf-8
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+class Lingo
+  # Mixin providing named counters; call init_reportable first, then use inc/add/set/get and report.
+  module Reportable
+    def init_reportable(prefix = nil)
+      @counters, @prefix = Hash.new(0), prefix ? "#{prefix}: " : '' # unseen counters read as 0; prefix becomes "prefix: " or ''
+    end
+    def inc(counter)
+      @counters[counter] += 1
+    end
+    def add(counter, value)
+      @counters[counter] += value
+    end
+    def set(counter, value)
+      @counters[counter] = value
+    end
+    def get(counter)
+      @counters[counter]
+    end
+    def report
+      @counters.each_with_object({}) { |(k, v), r| r["#{@prefix}#{k}"] = v } # new hash keyed by @prefix + counter name
+    end
+  end
+end

data/lib/lingo/version.rb CHANGED

@@ -4,7 +4,7 @@ class Lingo
     MAJOR = 1
     MINOR = 8
-    TINY  = 0
+    TINY  = 1
     class << self

data/lingo-all.cfg CHANGED

@@ -11,7 +11,7 @@ meeting:
     #
     #  Angegebene Datei zeilenweise einlesen und verarbeiten
-    -  textreader:      { files: '$(files)' }
+    -  text_reader:     { files: '$(files)' }
     ########################################
@@ -25,7 +25,7 @@ meeting:
 #   -  abbreviator:     { source: 'sys-abk' }
     #  Verbleibende Token im Wörterbuch suchen
-    -  wordsearcher:    { source: 'sys-dic', mode: 'first' }
+    -  word_searcher:   { source: 'sys-dic', mode: 'first' }
     #  Schreibweisen variieren und erneut suchen
 #   -  variator:        { source: 'sys-dic' }
@@ -37,7 +37,7 @@ meeting:
 #   -  decomposer:      { source: 'sys-dic' }
     #  Mehrwortgruppen im Strom erkennen
-#   -  multiworder:     { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+#   -  multi_worder:    { stopper: 'PUNC,OTHR', source: 'sys-mul' }
     #  Wortsequenzen anhand von Regeln identifizieren
 #   -  sequencer:       { stopper: 'PUNC,OTHR' }
@@ -56,30 +56,34 @@ meeting:
     #  Ergebnisse ausgeben
     #
+    #  Erstelle Datei mit Endung .log für Datenstrom
+#   -  vector_filter:   { in: syn, debug: 'true', prompt: 'lex:) ' }
+#   -  text_writer:     { ext: log, sep: "\n" }
     #  Erstelle Datei mit Endung .non für nicht erkannte Wörter
 #   -  noneword_filter: { in: syn }
-#   -  textwriter:      { ext: non, sep: "\n" }
+#   -  text_writer:     { ext: non, sep: "\n" }
     #  Erstelle Datei mit Endung .vec für erkannte Indexterme
 #   -  vector_filter:   { in: syn, lexicals: '^[ksavem]$' }
-#   -  textwriter:      { ext: vec, sep: "\n" }
+#   -  text_writer:     { ext: vec, sep: "\n" }
     #  Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
 #   -  vector_filter:   { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
-#   -  textwriter:      { ext: ven, sep: "\n" }
+#   -  text_writer:     { ext: ven, sep: "\n" }
     #  Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
 #   -  vector_filter:   { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
-#   -  textwriter:      { ext: ver, sep: "\n" }
+#   -  text_writer:     { ext: ver, sep: "\n" }
     #  Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
 #   -  vector_filter:   { in: syn, lexicals: m }
-#   -  textwriter:      { ext: mul, sep: "\n" }
+#   -  text_writer:     { ext: mul, sep: "\n" }
     #  Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
 #   -  vector_filter:   { in: syn, lexicals: q, sort: 'term_abs' }
-#   -  textwriter:      { ext: seq, sep: "\n" }
+#   -  text_writer:     { ext: seq, sep: "\n" }
     #  Erstelle Datei mit Endung .syn für erkannte Synonyme
 #   -  vector_filter:   { in: syn, lexicals: y, sort: 'term_abs' }
-#   -  textwriter:      { ext: syn, sep: "\n" }
+#   -  text_writer:     { ext: syn, sep: "\n" }