RubyGems - lingo - Versions diffs - 1.8.0 - Mend

lingo 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

data/.rspec +1 -0
data/COPYING +663 -0
data/ChangeLog +754 -0
data/README +322 -0
data/Rakefile +100 -0
data/TODO +28 -0
data/bin/lingo +5 -0
data/bin/lingoctl +6 -0
data/de.lang +121 -0
data/de/lingo-abk.txt +74 -0
data/de/lingo-dic.txt +56822 -0
data/de/lingo-mul.txt +3209 -0
data/de/lingo-syn.txt +14841 -0
data/de/test_dic.txt +24 -0
data/de/test_mul.txt +17 -0
data/de/test_mul2.txt +2 -0
data/de/test_singleword.txt +2 -0
data/de/test_syn.txt +4 -0
data/de/test_syn2.txt +1 -0
data/de/user-dic.txt +10 -0
data/en.lang +113 -0
data/en/lingo-dic.txt +55434 -0
data/en/lingo-mul.txt +456 -0
data/en/user-dic.txt +5 -0
data/info/Objekte.png +0 -0
data/info/Typen.png +0 -0
data/info/database.png +0 -0
data/info/db_small.png +0 -0
data/info/download.png +0 -0
data/info/gpl-hdr.txt +27 -0
data/info/kerze.png +0 -0
data/info/language.png +0 -0
data/info/lingo.png +0 -0
data/info/logo.png +0 -0
data/info/meeting.png +0 -0
data/info/types.png +0 -0
data/lib/lingo.rb +321 -0
data/lib/lingo/attendee/abbreviator.rb +119 -0
data/lib/lingo/attendee/debugger.rb +111 -0
data/lib/lingo/attendee/decomposer.rb +101 -0
data/lib/lingo/attendee/dehyphenizer.rb +167 -0
data/lib/lingo/attendee/multiworder.rb +301 -0
data/lib/lingo/attendee/noneword_filter.rb +103 -0
data/lib/lingo/attendee/objectfilter.rb +86 -0
data/lib/lingo/attendee/sequencer.rb +190 -0
data/lib/lingo/attendee/synonymer.rb +105 -0
data/lib/lingo/attendee/textreader.rb +237 -0
data/lib/lingo/attendee/textwriter.rb +196 -0
data/lib/lingo/attendee/tokenizer.rb +218 -0
data/lib/lingo/attendee/variator.rb +185 -0
data/lib/lingo/attendee/vector_filter.rb +158 -0
data/lib/lingo/attendee/wordsearcher.rb +96 -0
data/lib/lingo/attendees.rb +289 -0
data/lib/lingo/cli.rb +62 -0
data/lib/lingo/config.rb +104 -0
data/lib/lingo/const.rb +131 -0
data/lib/lingo/ctl.rb +173 -0
data/lib/lingo/database.rb +587 -0
data/lib/lingo/language.rb +530 -0
data/lib/lingo/modules.rb +98 -0
data/lib/lingo/types.rb +285 -0
data/lib/lingo/utilities.rb +40 -0
data/lib/lingo/version.rb +27 -0
data/lingo-all.cfg +85 -0
data/lingo-call.cfg +15 -0
data/lingo.cfg +78 -0
data/lingo.rb +3 -0
data/lir.cfg +72 -0
data/porter/stem.cfg +311 -0
data/porter/stem.rb +150 -0
data/spec/spec_helper.rb +0 -0
data/test.cfg +79 -0
data/test/attendee/ts_abbreviator.rb +35 -0
data/test/attendee/ts_decomposer.rb +31 -0
data/test/attendee/ts_multiworder.rb +390 -0
data/test/attendee/ts_noneword_filter.rb +19 -0
data/test/attendee/ts_objectfilter.rb +19 -0
data/test/attendee/ts_sequencer.rb +43 -0
data/test/attendee/ts_synonymer.rb +33 -0
data/test/attendee/ts_textreader.rb +58 -0
data/test/attendee/ts_textwriter.rb +98 -0
data/test/attendee/ts_tokenizer.rb +32 -0
data/test/attendee/ts_variator.rb +24 -0
data/test/attendee/ts_vector_filter.rb +62 -0
data/test/attendee/ts_wordsearcher.rb +119 -0
data/test/lir.csv +3 -0
data/test/lir.txt +12 -0
data/test/lir2.txt +12 -0
data/test/mul.txt +1 -0
data/test/ref/artikel.mul +1 -0
data/test/ref/artikel.non +159 -0
data/test/ref/artikel.seq +270 -0
data/test/ref/artikel.syn +16 -0
data/test/ref/artikel.vec +928 -0
data/test/ref/artikel.ven +928 -0
data/test/ref/artikel.ver +928 -0
data/test/ref/lir.csv +328 -0
data/test/ref/lir.mul +1 -0
data/test/ref/lir.non +274 -0
data/test/ref/lir.seq +249 -0
data/test/ref/lir.syn +94 -0
data/test/test_helper.rb +113 -0
data/test/ts_database.rb +269 -0
data/test/ts_language.rb +396 -0
data/txt/artikel-en.txt +157 -0
data/txt/artikel.txt +170 -0
data/txt/lir.txt +1317 -0
metadata +211 -0

data/en/user-dic.txt ADDED Viewed

@@ -0,0 +1,5 @@
+#
+#   Musterwörterbuch user-dic.txt als Vorlage
+#   für eigene Benutzerwörterbücher
+#
+#

data/info/Objekte.png ADDED Viewed

Binary file

data/info/Typen.png ADDED Viewed

Binary file

data/info/database.png ADDED Viewed

Binary file

data/info/db_small.png ADDED Viewed

Binary file

data/info/download.png ADDED Viewed

Binary file

data/info/gpl-hdr.txt ADDED Viewed

@@ -0,0 +1,27 @@
+#--
+# LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
+# Mehrworterkennung und Relationierung.
+#
+# Copyright (C) 2005-2007 John Vorhauer
+# Copyright (C) 2007-2011 John Vorhauer, Jens Wille
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+#
+# For more information visit http://www.lex-lingo.de or contact me at
+# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
+#
+# Lex Lingo rules from here on
+#++

data/info/kerze.png ADDED Viewed

Binary file

data/info/language.png ADDED Viewed

Binary file

data/info/lingo.png ADDED Viewed

Binary file

data/info/logo.png ADDED Viewed

Binary file

data/info/meeting.png ADDED Viewed

Binary file

data/info/types.png ADDED Viewed

Binary file

data/lib/lingo.rb ADDED Viewed

@@ -0,0 +1,321 @@
+# encoding: utf-8
+#--
+# LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
+# Mehrworterkennung und Relationierung.
+#
+# Copyright (C) 2005-2007 John Vorhauer
+# Copyright (C) 2007-2011 John Vorhauer, Jens Wille
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+#
+# For more information visit http://www.lex-lingo.de or contact me at
+# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
+#
+# Lex Lingo rules from here on
+#++
+require 'stringio'
+require 'benchmark'
+require 'nuggets/env/user_home'
+require 'nuggets/numeric/duration'
+require_relative 'lingo/config'
+require_relative 'lingo/attendees'
+require_relative 'lingo/attendee/abbreviator'
+require_relative 'lingo/attendee/debugger'
+require_relative 'lingo/attendee/decomposer'
+require_relative 'lingo/attendee/dehyphenizer'
+require_relative 'lingo/attendee/multiworder'
+require_relative 'lingo/attendee/noneword_filter'
+require_relative 'lingo/attendee/objectfilter'
+require_relative 'lingo/attendee/variator'
+require_relative 'lingo/attendee/sequencer'
+require_relative 'lingo/attendee/synonymer'
+require_relative 'lingo/attendee/textreader'
+require_relative 'lingo/attendee/textwriter'
+require_relative 'lingo/attendee/tokenizer'
+require_relative 'lingo/attendee/vector_filter'
+require_relative 'lingo/attendee/wordsearcher'
+require_relative 'lingo/version'
+class Lingo
+  # The system-wide Lingo directory (+LINGO_BASE+).
+  BASE = ENV['LINGO_BASE'] || File.expand_path('../..', __FILE__)
+  # The user's personal Lingo directory (+LINGO_HOME+).
+  HOME = ENV['LINGO_HOME'] || File.join(ENV.user_home, '.lingo')
+  # The local Lingo directory (+LINGO_CURR+).
+  CURR = ENV['LINGO_CURR'] || '.'
+  # The search path for Lingo dictionary and configuration files.
+  PATH = ENV['LINGO_PATH'] || [CURR, HOME, BASE].join(File::PATH_SEPARATOR)
+  # Per-type lookup settings used by the finder helpers below: +dir+ is the
+  # sub-directory searched beneath each search path entry, +ext+ the default
+  # file extension appended to names given without one (+nil+ for none).
+  FIND_OPTIONS = {
+    config: { dir: 'config', ext: 'cfg'  },
+    dict:   { dir: 'dict',   ext: 'txt'  },
+    lang:   { dir: 'lang',   ext: 'lang' },
+    store:  { dir: 'store',  ext: nil    }
+  }
+  class << self
+    def talk(*args)
+      new(*args).talk
+    end
+    def call(cfg = find(:config, 'lingo-call'), args = [], &block)
+      Call.new(['-c', cfg, *args]).call(&block)
+    end
+    # Reports a fatal error by delegating to +abort+ with +msg+.
+    def error(msg)
+      abort(msg)
+    end
+    def list(type, options = {})
+      options = options_for(type, options)
+      path    = path_for(options)
+      glob = file_with_ext('*', options)
+      glob = File.join('??', glob) if type == :dict
+      [].tap { |list| walk(path, options) { |dir|
+        Dir[File.join(dir, glob)].sort.each { |file|
+          pn = Pathname.new(file)
+          list << realpath_for(pn, path) if pn.file?
+        }
+      } }
+    end
+    def find(type, file, options = {})
+      if options.is_a?(Array)
+        path    = options
+        options = options_for(type)
+      else
+        options = options_for(type, options)
+        path    = path_for(options)
+      end
+      type = :file if type != :store
+      send("find_#{type}", file, path, options)
+    rescue RuntimeError, Errno::ENOENT => err
+      block_given? ? yield(err) : raise
+    end
+    def basename(type, file)
+      dir, name = File.split(file)
+      type != :dict ? name : File.join(File.basename(dir), name)
+    end
+    def basepath(type, file)
+      File.join(options_for(type)[:dir], basename(type, file))
+    end
+    private
+    def find_file(file, path, options)
+      pn = Pathname.new(file_with_ext(file, options)).cleanpath
+      walk(path, options) { |dir|
+        pn2 = pn.expand_path(dir)
+        pn = pn2 and break if pn2.exist?
+      } if pn.relative?
+      realpath_for(pn, path)
+    end
+    # Determines where the store for dictionary +file+ should live.
+    #
+    # The dictionary itself is located first (using +path+ as the search
+    # path) and reduced to its base name. The search path is then walked in
+    # reverse order, looking only at the type's own sub-directory
+    # (+legacy+ = false). For each candidate directory its ancestors are
+    # inspected from the directory upwards.
+    #
+    # Returns the candidate directory joined with the dictionary base name,
+    # minus its extension. Raises a RuntimeError if no candidate qualifies.
+    def find_store(file, path, options)
+      base = basename(:dict, find(:dict, file, path))
+      walk(path.reverse, options, false) { |dir|
+        Pathname.new(dir).ascend { |r|
+          # A regular file is in the way: give up on this candidate.
+          break true if r.file?
+          # First writable ancestor: the store can be placed under +dir+.
+          return File.join(dir, base).tap { |s|
+            s.chomp!(File.extname(s))
+          } if r.writable?
+          # Exists but is not writable: give up on this candidate.
+          break true if r.exist?
+        }
+      }
+      raise 'No writable store found in search path'
+    end
+    def options_for(type, options = {})
+      if find_options = FIND_OPTIONS[type]
+        options = find_options.merge(options)
+      else
+        raise ArgumentError, "Invalid type `#{type.inspect}'", caller(1)
+      end
+    end
+    def path_for(options)
+      options[:path] || PATH.split(File::PATH_SEPARATOR)
+    end
+    def file_with_ext(file, options)
+      ext = options[:ext]
+      ext && File.extname(file).empty? ? "#{file}.#{ext}" : file
+    end
+    # Yields every directory to be searched: for each distinct (expanded)
+    # entry of +path+, the type's sub-directory (<tt>options[:dir]</tt>) and,
+    # when +legacy+ is true, the entry itself.
+    def walk(path, options, legacy = true)
+      dirs = [options[:dir].to_s]
+      dirs << '' if legacy
+      dirs.uniq!
+      # Answers false the first time a key is looked up and true afterwards,
+      # so each expanded path entry is visited only once.
+      seen = Hash.new { |h, k| h[k] = true; false }
+      path.each { |d|
+        next if seen[d = File.expand_path(d)]
+        # NOTE(review): Array#each returns its receiver, so this `or break`
+        # does not appear to trigger on normal completion -- confirm intent.
+        dirs.each { |i| yield File.join(d, i) } or break
+      }
+    end
+    def realpath_for(pn, path)
+      pn.realpath(path.first).to_s
+    end
+  end
+  attr_reader :dictionaries, :report_status, :report_time
+  # Stores +args+ for the lazily created Config (see #config) and sets up
+  # empty dictionary and attendee lists without closing anything.
+  def initialize(*args)
+    @config_args = args
+    reset(false)
+  end
+  # The configuration, built on first access from the constructor arguments
+  # and memoized afterwards.
+  def config
+    @config ||= Config.new(*@config_args)
+  end
+  # The <tt>language/dictionary</tt> section of the configuration (memoized).
+  def dictionary_config
+    @dictionary_config ||= config['language/dictionary']
+  end
+  def database_config(id)
+    dictionary_config['databases'][id]
+  end
+  def talk
+    invite
+    start
+  ensure
+    reset
+  end
+  # Instantiates the attendees described by +list+ (by default the
+  # <tt>meeting/attendees</tt> configuration section) and connects them
+  # through their +in+ and +out+ channels.
+  def invite(list = config['meeting/attendees'])
+    # channel name => attendees writing to / reading from that channel
+    supplier   = Hash.new { |h, k| h[k] = [] }
+    subscriber = Hash.new { |h, k| h[k] = [] }
+    last_link, auto_link = '', 0
+    list.each { |hash|
+      # Entry format, e.g.:
+      # {'attendee' => {'name'=>'Attendee', 'in'=>'nase', 'out'=>'ohr', 'param'=>'hase'}}
+      cfg = hash.values.first.merge('name' => hash.keys.first.capitalize)
+      %w[in out].each { |key| (cfg[key] ||= '').downcase! }
+      # A missing +in+ is chained to the previous attendee's +out+; a missing
+      # +out+ gets a generated, numbered channel name.
+      cfg['in']  = last_link                         if cfg['in'].empty?
+      cfg['out'] = "auto_link_out_#{auto_link += 1}" if cfg['out'].empty?
+      last_link  = cfg['out']
+      # Settings from the language configuration, if any, are merged in and
+      # overwrite the values built so far.
+      data = config["language/attendees/#{cfg['name'].downcase}"]
+      cfg.update(data) if data
+      attendee = Attendee.const_get(cfg['name']).new(cfg, self)
+      @attendees << attendee
+      # +in+ and +out+ may each name several channels.
+      cfg['in'].split(STRING_SEPERATOR_PATTERN).each { |interest|
+        subscriber[interest] << attendee
+      }
+      cfg['out'].split(STRING_SEPERATOR_PATTERN).each { |theme|
+        supplier[theme] << attendee
+      }
+    }
+    # Hand every supplier the subscribers registered for its channel.
+    supplier.each { |channel, attendees| attendees.each { |att|
+      att.add_subscriber(subscriber[channel])
+    } }
+  end
+  def start(report_status = config['status'], report_time = config['perfmon'])
+    @report_status, @report_time = report_status, report_time
+    time = Benchmark.realtime {
+      @attendees.first.listen(AgendaItem.new(STR_CMD_TALK))
+    }
+    if report_status || report_time
+      config.stderr.puts "Require protocol...\n#{separator = '-' * 61}"
+      @attendees.first.listen(AgendaItem.new(STR_CMD_STATUS))
+      config.stderr.puts "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
+    end
+  end
+  # Clears the dictionary and attendee lists. Unless +close+ is false, every
+  # registered dictionary is closed first.
+  def reset(close = true)
+    dictionaries.each(&:close) if close
+    @dictionaries, @attendees = [], []
+  end
+  class Call < Lingo
+    def initialize(args = [])
+      super(args, StringIO.new, StringIO.new, StringIO.new)
+    end
+    def call
+      invite
+      if block_given?
+        begin
+          yield self
+        ensure
+          reset
+        end
+      else
+        self
+      end
+    end
+    def talk(str)
+      config.stdin.reopen(str)
+      start
+      %w[stdout stderr].flat_map { |key|
+        io = config.send(key).tap(&:rewind)
+        io.readlines.each(&:chomp!).tap {
+          io.truncate(0)
+          io.rewind
+        }
+      }.tap { |res|
+        if block_given?
+          res.map!(&Proc.new)
+        else
+          res.sort!
+          res.uniq!
+        end
+      }
+    end
+  end
+end
+require 'nuggets/util/pluggable'
+Util::Pluggable.load_plugins_for(Lingo)

data/lib/lingo/attendee/abbreviator.rb ADDED Viewed

@@ -0,0 +1,119 @@
+# encoding: utf-8
+#--
+# LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
+# Mehrworterkennung und Relationierung.
+#
+# Copyright (C) 2005-2007 John Vorhauer
+# Copyright (C) 2007-2011 John Vorhauer, Jens Wille
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+#
+# For more information visit http://www.lex-lingo.de or contact me at
+# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
+#
+# Lex Lingo rules from here on
+#++
+class Lingo
+  # Die Erkennung von Abkürzungen kann auf vielfältige Weise erfolgen. In jedem Fall
+  # sollte eine sichere Unterscheidung von einem Satzende-Punkt möglich sein.
+  # Der in Lingo gewählte Ansatz befreit den Tokenizer von dieser Arbeit und konzentriert
+  # die Erkennung in diesem Attendee.
+  # Sobald der Abbreviator im Datenstrom auf ein Punkt trifft (Token = <tt>:./PUNC:</tt>),
+  # prüft er das vorhergehende Token auf eine gültige Abkürzung im Abkürzungs-Wörterbuch.
+  # Wird es als Abkürzung erkannt, dann wird das Token in ein Word gewandelt und das
+  # Punkt-Token aus dem Zeichenstrom entfernt.
+  #
+  # === Mögliche Verlinkung
+  # Erwartet:: Daten des Typs *Token* z.B. von Tokenizer
+  # Erzeugt:: Leitet Token weiter und wandelt erkannte Abkürzungen in den Typ *Word* z.B. für Wordsearcher
+  #
+  # === Parameter
+  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
+  # Alle anderen Parameter müssen zwingend angegeben werden.
+  # <b>in</b>:: siehe allgemeine Beschreibung des Attendee
+  # <b>out</b>:: siehe allgemeine Beschreibung des Attendee
+  # <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
+  # <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
+  #
+  # === Beispiele
+  # Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
+  #   meeting:
+  #     attendees:
+  #       - textreader:  { out: lines, files: '$(files)' }
+  #       - tokenizer:   { in: lines, out: token }
+  #       - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
+  #       - debugger:    { in: abbrev, prompt: 'out>' }
+  # ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
+  #   out> *FILE('test.txt')
+  #   out> :Dies/WORD:
+  #   out> :ist/WORD:
+  #   out> <ggf. = [(gegebenenfalls/w)]>
+  #   out> :eine/WORD:
+  #   out> :Abkürzung/WORD:
+  #   out> :./PUNC:
+  #   out> *EOL('test.txt')
+  #   out> *EOF('test.txt')
+  class Attendee::Abbreviator < BufferedAttendee
+    protected
+    def init
+      # Wörterbuch bereitstellen
+      src = get_array('source')
+      mod = get_key('mode', 'all')
+      @dic = Dictionary.new({'source'=>src, 'mode'=>mod}, @lingo)
+    end
+    def control(cmd, par)
+      @dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
+      # Jedes Control-Object ist auch Auslöser der Verarbeitung
+      process_buffer
+    end
+    private
+    def process_buffer?
+      @buffer[-1].kind_of?(Token) && @buffer[-1].form == CHAR_PUNCT
+    end
+    def process_buffer
+      if @buffer.size < 2
+        forward_buffer
+        return
+      end
+      # Wort vor dem Punkt im Abkürzungswörterbuch suchen
+      if @buffer[-2].kind_of?(Token)
+        inc('Anzahl gesuchter Abkürzungen')
+        abbr = @dic.find_word(@buffer[-2].form)
+        if abbr.attr == WA_IDENTIFIED
+          inc('Anzahl gefundener Abkürzungen')
+          abbr.form += CHAR_PUNCT
+          @buffer[-2] = abbr
+          @buffer.delete_at(-1)
+        end
+      end
+      forward_buffer
+    end
+  end
+end