lingo 1.8.1 → 1.8.2
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/lib/lingo/attendee/noneword_filter.rb

@@ -71,32 +71,39 @@ class Lingo
       protected

       def init
-        @nonewords = []
+        @nonewords, @sort = [], get_key('sort', true)
       end

-
-      # For each file, a new set of unrecognized words is registered.
-      def control(cmd, par)
+      def control(cmd, param)
         case cmd
           when STR_CMD_FILE
             @nonewords.clear
           when STR_CMD_EOL
             skip_command
           when STR_CMD_RECORD, STR_CMD_EOF
-            nones = @nonewords.sort.uniq
-            nones.each(&method(:forward))
-            add('Objekte gefiltert', nones.size)
-            @nonewords.clear
+            send_nonewords unless @nonewords.empty?
         end
       end

       def process(obj)
         if obj.is_a?(Word) && obj.unknown?
           inc('Anzahl nicht erkannter Wörter')
-          @nonewords << obj.form.downcase
+
+          non = obj.form.downcase
+          @sort ? @nonewords << non : forward(non)
         end
       end

+      private
+
+      def send_nonewords
+        @nonewords.sort!
+        @nonewords.uniq!
+
+        add('Objekte gefiltert', @nonewords.size)
+        @nonewords.each(&method(:forward)).clear
+      end
+
     end

     # For backwards compatibility.
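The noneword filter change above makes sorting optional: with the new 'sort' option (default true) nonewords are collected per record and emitted sorted and deduplicated by send_nonewords, while with sort disabled each one is forwarded immediately. A minimal standalone sketch of that flow in plain Ruby (forward is stubbed with puts, and the Attendee plumbing such as get_key and command handling is omitted):

    # Sketch of the collect-and-sort vs. pass-through behaviour.
    class NonewordSink
      def initialize(sort = true)
        @sort, @nonewords = sort, []
      end

      def process(form)
        non = form.downcase
        @sort ? @nonewords << non : forward(non)  # mirrors the new process(obj)
      end

      # roughly what send_nonewords does at record/EOF boundaries
      def flush
        @nonewords.sort!
        @nonewords.uniq!
        @nonewords.each { |n| forward(n) }.clear
      end

      def forward(non)
        puts non
      end
    end

    sink = NonewordSink.new
    %w[Zb Ab Zb].each { |w| sink.process(w) }
    sink.flush  # prints "ab" then "zb"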
data/lib/lingo/attendee/object_filter.rb

@@ -59,7 +59,7 @@ class Lingo
 #     - text_reader: { out: lines, files: '$(files)' }
 #     - tokenizer: { in: lines, out: token }
 #     - word_searcher: { in: token, out: words, source: 'sys-dic' }
-#     - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==
+#     - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_NOUN' }
 #     - debugger: { in: filtr, prompt: 'out>' }
 # yields the output via the debugger: <tt>lingo -c t1 test.txt</tt>
 #   out> *FILE('test.txt')
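The corrected 'objects' expression is a Ruby predicate evaluated against each passing object (obj), here keeping only words whose first lexical is a noun. A rough sketch of how such a string predicate can be applied, assuming it is simply eval'd with obj in scope, which the corrected line suggests but which is not spelled out in this hunk:

    # Hypothetical stand-in for the object_filter predicate mechanism.
    objects = 'obj.kind_of?(String) && obj.size > 3'

    %w[a abcd xyz wxyz].each do |obj|
      puts obj if eval(objects)  # 'obj' is visible to the eval'd expression
    end
    # prints "abcd" and "wxyz"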
data/lib/lingo/attendee/sequencer.rb

@@ -96,92 +96,61 @@ class Lingo
       protected

       def init
-
-        @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map(&:upcase)
-        @seq_strings = get_key('sequences').map { |e| WordSequence.new(*e) }
+        @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)

-
+        @seq = get_key('sequences').map { |string, format|
+          [string = string.downcase, string.split(//), format]
+        }
+
+        raise MissingConfigError.new(:sequences) if @seq.empty?
       end

-      def control(cmd, par)
-        # Every control object also triggers processing.
+      def control(cmd, param)
         process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
       end

       def process_buffer?
-
-        item = @buffer.last
-        (item.is_a?(WordForm) && @stopper.include?(item.attr.upcase)) ||
-        (item.is_a?(Word) && item.unknown?)
+        (obj = @buffer.last).is_a?(WordForm) && (obj.is_a?(Word) &&
+          obj.unknown? || @stopper.include?(obj.attr.upcase))
       end

       def process_buffer
-
-
-        unless @buffer.size < 2
-          matches = Hash.new { |h, k| h[k] = [] }
-
-          sequences(@buffer.map { |obj|
-            obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
-          }).uniq.each { |sequence|
-            @seq_strings.each { |wordseq|
-              wordseq.scan(sequence) { |pos, form, classes|
-                inc('Anzahl erkannter Sequenzen')
-
-                classes.each_with_index { |wc, index|
-                  @buffer[pos + index].lexicals.find { |lex|
-                    form.gsub!(index.succ.to_s, lex.form) if lex.attr == wc
-                  } or break
-                } or next
-
-                matches[pos] << form
-              }
-            }
-          }
-
-          matches.sort.each { |pos, forms|
-            forms.uniq.each { |form|
-              deferred_insert(pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE))
-            }
-          }
-        end
-
+        insert_sequences if @buffer.size > 1
         forward_buffer
       end

       private

-      def
-
+      def insert_sequences
+        matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq

-        map.
-
-        res.each { |wc1| classes.each { |wc2| temp << (wc1 + wc2) } }
-        res = temp
+        map = buf.map { |obj|
+          obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
         }

-
-
-
-
+        map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
+          seq.each { |string, classes, format|
+            while pos = q.index(string, pos || 0)
+              inc('Anzahl erkannter Sequenzen')

-
+              fmt = format.dup

-
-
-
-
-
+              classes.each_with_index { |wc, i|
+                buf[pos + i].lexicals.find { |l|
+                  fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
+                } or break
+              } or next

-
-        pos = 0
+              matches[pos] << fmt

-
-
-
-
-        end
+              pos += 1
+            end
+          }
+        }

+        matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
+          @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
+        } }
       end

     end
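The rewritten sequencer drops the old WordSequence helper: each buffer slot now contributes its possible word classes (or '#' for unusable tokens), the cartesian product of those lists yields every class string the buffer can spell, and each configured sequence is then located with plain String#index. A toy illustration of that enumeration idiom (the class letters and the sequence 'as' are made up for the example):

    # Each position's candidate classes; '#' marks a token that can't match.
    map = [['a'], ['a', 's'], ['s'], ['#']]

    # Cartesian product over all positions, joined into class strings.
    seqs = map.shift.product(*map).map!(&:join).tap(&:uniq!)
    p seqs  # => ["aas#", "ass#"]

    # Scan each string for the configured sequence, as the while loop does.
    seqs.each do |q|
      pos = nil
      while pos = q.index('as', pos || 0)
        puts "'as' found at #{pos} in #{q}"
        pos += 1
      end
    end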
data/lib/lingo/attendee/stemmer/porter.rb (new file)

@@ -0,0 +1,343 @@
+# encoding: utf-8
+
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+
+class Lingo
+
+  class Attendee
+
+    class Stemmer
+
+      module Porter
+
+        extend self
+
+        # Rules for Porter-Stemmer, based on:
+        #
+        # An algorithm for suffix stripping
+        #
+        # M.F. Porter
+        # 1980
+        #
+        # Originally published in Program, 14 no. 3, pp 130-137, July 1980.
+        # (A few typos have been corrected.)
+        #
+        # http://tartarus.org/~martin/PorterStemmer/def.txt
+        #
+        # -------------------------------------------------------------------
+        #
+        # 2. THE ALGORITHM
+        #
+        # To present the suffix stripping algorithm in its entirety we will
+        # need a few definitions.
+        #
+        # A _consonant_ in a word is a letter other than A, E, I, O or U,
+        # and other than Y preceded by a consonant. (The fact that the term
+        # `consonant' is defined to some extent in terms of itself does not
+        # make it ambiguous.) So in TOY the consonants are T and Y, and in
+        # SYZYGY they are S, Z and G. If a letter is not a consonant it is
+        # a _vowel_.
+        #
+        # A consonant will be denoted by c, a vowel by v. A list ccc... of
+        # length greater than 0 will be denoted by C, and a list vvv... of
+        # length greater than 0 will be denoted by V. Any word, or part of
+        # a word, therefore has one of the four forms:
+        #
+        #     CVCV ... C
+        #     CVCV ... V
+        #     VCVC ... C
+        #     VCVC ... V
+        #
+        # These may all be represented by the single form
+        #
+        #     [C]VCVC ... [V]
+        #
+        # where the square brackets denote arbitrary presence of their
+        # contents. Using (VC){m} to denote VC repeated m times, this
+        # may again be written as
+        #
+        #     [C](VC){m}[V].
+        #
+        # m will be called the _measure_ of any word or word part when
+        # represented in this form. The case m = 0 covers the null word.
+        # Here are some examples:
+        #
+        #     m=0    TR, EE, TREE, Y, BY.
+        #     m=1    TROUBLE, OATS, TREES, IVY.
+        #     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+        #
+        # The _rules_ for removing a suffix will be given in the form
+        #
+        #     (condition) S1 -> S2
+        #
+        # This means that if a word ends with the suffix S1, and the stem
+        # before S1 satisfies the given condition, S1 is replaced by S2.
+        # The condition is usually given in terms of m, e.g.
+        #
+        #     (m > 1) EMENT ->
+        #
+        # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to
+        # REPLAC, since REPLAC is a word part for which m = 2.
+        #
+        # The `condition' part may also contain the following:
+        #
+        # *S  - the stem ends with S (and similarly for the other letters).
+        #
+        # *v* - the stem contains a vowel.
+        #
+        # *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+        #
+        # *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+        #       -WIL, -HOP).
+        #
+        # And the condition part may also contain expressions with _and_,
+        # _or_ and _not_, so that
+        #
+        #     (m>1 and (*S or *T))
+        #
+        # tests for a stem with m>1 ending in S or T, while
+        #
+        #     (*d and not (*L or *S or *Z))
+        #
+        # tests for a stem ending with a double consonant other than L, S
+        # or Z. Elaborate conditions like this are required only rarely.
+        #
+        # In a set of rules written beneath each other, only one is obeyed,
+        # and this will be the one with the longest matching S1 for the
+        # given word. For example, with
+        #
+        #     SSES -> SS
+        #     IES  -> I
+        #     SS   -> SS
+        #     S    ->
+        #
+        # (here the conditions are all null) CARESSES maps to CARESS since
+        # SSES is the longest match for S1. Equally CARESS maps to CARESS
+        # (S1=`SS') and CARES to CARE (S1=`S').
+        #
+        # In the rules below, examples of their application, successful or
+        # otherwise, are given on the right in lower case. The algorithm
+        # now follows: see RULES.
+        #
+        # The algorithm is careful not to remove a suffix when the stem is
+        # too short, the length of the stem being given by its measure, m.
+        # There is no linguistic basis for this approach. It was merely
+        # observed that m could be used quite effectively to help decide
+        # whether or not it was wise to take off a suffix.
+        #
+        # -------------------------------------------------------------------
+
+        #
+
+        RULES = {
+          # Step 1a
+          S100: [
+            'SSES -> SS',                 # caresses -> caress
+            'IES -> I',                   # ponies -> poni, ties -> ti
+            'SS -> SS',                   # caress -> caress
+            'S -> '                       # cats -> cat
+          ],
+
+          # Step 1b
+          S110: [
+            '(m>0) EED -> EE goto(S120)', # agreed -> agree, feed -> feed
+            '(*v*) ED -> goto(S111)',     # plastered -> plaster, bled -> bled
+            '(*v*) ING -> goto(S111)',    # motoring -> motor, sing -> sing
+            'goto(S120)'
+          ],
+
+          # If the second or third of the rules in Step 1b is successful,
+          # the following is done:
+          S111: [
+            'AT -> ATE',                  # conflat(ed) -> conflate
+            'BL -> BLE',                  # troubl(ed) -> trouble
+            'IZ -> IZE',                  # siz(ed) -> size
+            '(*d and not (*L or *S or *Z)) -> -1', # hopp(ing) -> hop
+                                          # tann(ed) -> tan
+                                          # fall(ing) -> fall
+                                          # hiss(ing) -> hiss
+                                          # fizz(ed) -> fizz
+            '(m=1 and *o) -> E'           # fail(ing) -> fail
+                                          # fil(ing) -> file
+          ],
+
+          # The rule to map to a single letter causes the removal of one of
+          # the double letter pair. The -E is put back on -AT, -BL and -IZ,
+          # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
+          # This E may be removed in step 4.
+
+          # Step 1c
+          S120: [
+            '(*v*) Y -> I'                # happy -> happi, sky -> sky
+          ],
+
+          # Step 1 deals with plurals and past participles. The subsequent
+          # steps are much more straightforward.
+
+          # Step 2
+          S200: [
+            '(m>0) ATIONAL -> ATE',       # relational -> relate
+            '(m>0) TIONAL -> TION',       # conditional -> condition, rational -> rational
+            '(m>0) ENCI -> ENCE',         # valenci -> valence
+            '(m>0) ANCI -> ANCE',         # hesitanci -> hesitance
+            '(m>0) IZER -> IZE',          # digitizer -> digitize
+            '(m>0) ABLI -> ABLE',         # conformabli -> conformable
+            '(m>0) ALLI -> AL',           # radicalli -> radical
+            '(m>0) ENTLI -> ENT',         # differentli -> different
+            '(m>0) ELI -> E',             # vileli -> vile
+            '(m>0) OUSLI -> OUS',         # analogousli -> analogous
+            '(m>0) IZATION -> IZE',       # vietnamization -> vietnamize
+            '(m>0) ATION -> ATE',         # predication -> predicate
+            '(m>0) ATOR -> ATE',          # operator -> operate
+            '(m>0) ALISM -> AL',          # feudalism -> feudal
+            '(m>0) IVENESS -> IVE',       # decisiveness -> decisive
+            '(m>0) FULNESS -> FUL',       # hopefulness -> hopeful
+            '(m>0) OUSNESS -> OUS',       # callousness -> callous
+            '(m>0) ALITI -> AL',          # formaliti -> formal
+            '(m>0) IVITI -> IVE',         # sensitiviti -> sensitive
+            '(m>0) BILITI -> BLE'         # sensibiliti -> sensible
+          ],
+
+          # The test for the string S1 can be made fast by doing a program
+          # switch on the penultimate letter of the word being tested. This
+          # gives a fairly even breakdown of the possible values of the
+          # string S1. It will be seen in fact that the S1-strings in step 2
+          # are presented here in the alphabetical order of their penultimate
+          # letter. Similar techniques may be applied in the other steps.
+
+          # Step 3
+          S300: [
+            '(m>0) ICATE -> IC',          # triplicate -> triplic
+            '(m>0) ATIVE -> ',            # formative -> form
+            '(m>0) ALIZE -> AL',          # formalize -> formal
+            '(m>0) ICITI -> IC',          # electriciti -> electric
+            '(m>0) ICAL -> IC',           # electrical -> electric
+            '(m>0) FUL -> ',              # hopeful -> hope
+            '(m>0) NESS -> '              # goodness -> good
+          ],
+
+          # Step 4
+          S400: [
+            '(m>1) AL -> ',               # revival -> reviv
+            '(m>1) ANCE -> ',             # allowance -> allow
+            '(m>1) ENCE -> ',             # inference -> infer
+            '(m>1) ER -> ',               # airliner -> airlin
+            '(m>1) IC -> ',               # gyroscopic -> gyroscop
+            '(m>1) ABLE -> ',             # adjustable -> adjust
+            '(m>1) IBLE -> ',             # defensible -> defens
+            '(m>1) ANT -> ',              # irritant -> irrit
+            '(m>1) EMENT -> ',            # replacement -> replac
+            '(m>1) MENT -> ',             # adjustment -> adjust
+            '(m>1) ENT -> ',              # dependent -> depend
+            '(m>1 and (*S or *T)) ION -> ', # adoption -> adopt
+            '(m>1) OU -> ',               # homologou -> homolog
+            '(m>1) ISM -> ',              # communism -> commun
+            '(m>1) ATE -> ',              # activate -> activ
+            '(m>1) ITI -> ',              # angulariti -> angular
+            '(m>1) OUS -> ',              # homologous -> homolog
+            '(m>1) IVE -> ',              # effective -> effect
+            '(m>1) IZE -> '               # bowdlerize -> bowdler
+          ],
+
+          # The suffixes are now removed. All that remains is a little
+          # tidying up.
+
+          # Step 5a
+          S500: [
+            '(m>1) E -> ',                # probate -> probat, rate -> rate
+            '(m=1 and not *o) E -> '      # cease -> ceas
+          ],
+
+          # Step 5b
+          S510: [
+            '(m > 1 and *d and *L) -> -1' # controll -> control, roll -> roll
+          ]
+        }
+
+        GOTO_RE = %r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
+
+        RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
+
+        def stem(word, found = false)
+          goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }
+
+          RULES.each { |key, rules|
+            next if goto && goto != key.to_s
+
+            rules.each { |rule|
+              case rule
+                when RULE_RE
+                  cond, repl, goto = $1, $3, $4
+                  stem = word[/(.+)#{$2.downcase}$/, 1] or next
+                when GOTO_RE
+                  goto = $1
+                  break
+              end
+
+              conv[shad = stem.dup,
+                /[^aeiouy]/ => 'c',
+                /[aeiou]/   => 'v',
+                /cy/        => 'cv',
+                /y/         => 'c'
+              ]
+
+              if cond
+                conv[cond,
+                  'm'   => shad.scan(/vc/).size,
+                  '*v*' => shad.include?('v'),
+                  '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
+                  '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
+                  'and' => '&&',
+                  'or'  => '||',
+                  'not' => '!',
+                  '='   => '=='
+                ]
+
+                last.upcase! if last
+                cond.gsub!(/\*(\w)/) { last == $1 }
+
+                next unless eval(cond)
+              end
+
+              found, word = true, begin
+                stem[0...Integer(repl)]
+              rescue ArgumentError
+                stem << repl.downcase
+              end
+
+              break
+            }
+          }
+
+          word if found
+        end
+
+      end
+
+    end
+
+  end
+
+end
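At the heart of stem is the conversion of a candidate stem into a consonant/vowel "shadow" (the conv[shad = stem.dup, ...] call), from which the measure m of the Porter definition is just the number of 'vc' pairs. That computation, extracted into a standalone sketch:

    # Measure m per the Porter definition, as derived in stem's conv call.
    def measure(stem)
      shad = stem.dup
      { /[^aeiouy]/ => 'c',   # consonants (y handled below)
        /[aeiou]/   => 'v',   # vowels
        /cy/        => 'cv',  # y after a consonant acts as a vowel
        /y/         => 'c'    # any remaining y is a consonant
      }.each { |re, sub| shad.gsub!(re, sub) }
      shad.scan(/vc/).size
    end

    %w[tree trouble troubles private].each { |w| puts "#{w}: m=#{measure(w)}" }
    # tree: m=0, trouble: m=1, troubles: m=2, private: m=2
    # (matches the examples in the comment block above)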
data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb}

@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 #--
 ###############################################################################
 #                                                                             #
@@ -22,3 +24,34 @@
 ###############################################################################
 #++

+class Lingo
+
+  class Attendee
+
+    class Stemmer < self
+
+      protected
+
+      def init
+        extend(Lingo.get_const(get_key('type', 'porter'), self.class))
+
+        @wc  = get_key('wordclass', LA_STEM)
+        @all = get_key('mode', '').downcase == 'all'
+      end
+
+      def process(obj)
+        if obj.is_a?(Word) && obj.unknown?
+          stem = stem(obj.form.downcase, @all)
+          obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
+        end
+
+        forward(obj)
+      end
+
+    end
+
+  end
+
+end
+
+require_relative 'stemmer/porter'
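The new Stemmer attendee selects its algorithm at runtime by extending a module resolved from the 'type' option ('porter' by default). Reduced to plain Ruby, the dispatch pattern looks like this (const_get stands in for Lingo.get_const, and the stemming rule is a dummy):

    class Stemmer
      module Porter
        def stem(word)
          word.sub(/s$/, '')  # stand-in, not the real Porter rules
        end
      end

      def initialize(type = 'porter')
        # Resolve e.g. 'porter' => Stemmer::Porter and mix it into this
        # single instance, so self.stem dispatches to the chosen module.
        extend(self.class.const_get(type.capitalize))
      end
    end

    puts Stemmer.new.stem('cats')  # => cat

In a pipeline configuration the attendee would presumably be wired up like the other attendees shown above, with type, wordclass and mode as its options; the in/out channel names here are illustrative only:

    - stemmer: { in: words, out: stems, type: 'porter', mode: 'all' }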
data/lib/lingo/attendee/synonymer.rb

@@ -74,24 +74,25 @@ class Lingo

       def init
         set_dic
-        @skip = get_array('skip', WA_UNKNOWN
+        @skip = get_array('skip', WA_UNKNOWN, :upcase)
       end

-      def control(cmd, par)
-
+      def control(cmd, param)
+        report_on(cmd, @dic)
       end

       def process(obj)
-        if obj.is_a?(Word) &&
+        if obj.is_a?(Word) && !@skip.include?(obj.attr)
           inc('Anzahl gesuchter Wörter')

-          synos = @dic.find_synonyms(obj)
-
-          obj.lexicals += synos.sort.uniq
+          unless (syn = @dic.find_synonyms(obj)).empty?
+            inc('Anzahl erweiteter Wörter')

-
-
+            obj.add_lexicals(syn.tap(&:uniq!))
+            add('Anzahl gefundener Synonyme', syn.size)
+          end
         end
+
         forward(obj)
       end

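The synonymer now uses syn.tap(&:uniq!) rather than building a new array with sort.uniq. The idiom matters because Array#uniq! returns nil when there was nothing to remove, while tap always returns the (possibly mutated) receiver, so the result is safe to pass straight to add_lexicals:

    syn = ['haus', 'gebäude', 'haus']
    p syn.tap(&:uniq!)          # => ["haus", "gebäude"]

    p ['a', 'b'].uniq!          # => nil   (unsafe to pass along)
    p ['a', 'b'].tap(&:uniq!)   # => ["a", "b"]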