lingo 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. data/.rspec +1 -0
  2. data/COPYING +663 -0
  3. data/ChangeLog +754 -0
  4. data/README +322 -0
  5. data/Rakefile +100 -0
  6. data/TODO +28 -0
  7. data/bin/lingo +5 -0
  8. data/bin/lingoctl +6 -0
  9. data/de.lang +121 -0
  10. data/de/lingo-abk.txt +74 -0
  11. data/de/lingo-dic.txt +56822 -0
  12. data/de/lingo-mul.txt +3209 -0
  13. data/de/lingo-syn.txt +14841 -0
  14. data/de/test_dic.txt +24 -0
  15. data/de/test_mul.txt +17 -0
  16. data/de/test_mul2.txt +2 -0
  17. data/de/test_singleword.txt +2 -0
  18. data/de/test_syn.txt +4 -0
  19. data/de/test_syn2.txt +1 -0
  20. data/de/user-dic.txt +10 -0
  21. data/en.lang +113 -0
  22. data/en/lingo-dic.txt +55434 -0
  23. data/en/lingo-mul.txt +456 -0
  24. data/en/user-dic.txt +5 -0
  25. data/info/Objekte.png +0 -0
  26. data/info/Typen.png +0 -0
  27. data/info/database.png +0 -0
  28. data/info/db_small.png +0 -0
  29. data/info/download.png +0 -0
  30. data/info/gpl-hdr.txt +27 -0
  31. data/info/kerze.png +0 -0
  32. data/info/language.png +0 -0
  33. data/info/lingo.png +0 -0
  34. data/info/logo.png +0 -0
  35. data/info/meeting.png +0 -0
  36. data/info/types.png +0 -0
  37. data/lib/lingo.rb +321 -0
  38. data/lib/lingo/attendee/abbreviator.rb +119 -0
  39. data/lib/lingo/attendee/debugger.rb +111 -0
  40. data/lib/lingo/attendee/decomposer.rb +101 -0
  41. data/lib/lingo/attendee/dehyphenizer.rb +167 -0
  42. data/lib/lingo/attendee/multiworder.rb +301 -0
  43. data/lib/lingo/attendee/noneword_filter.rb +103 -0
  44. data/lib/lingo/attendee/objectfilter.rb +86 -0
  45. data/lib/lingo/attendee/sequencer.rb +190 -0
  46. data/lib/lingo/attendee/synonymer.rb +105 -0
  47. data/lib/lingo/attendee/textreader.rb +237 -0
  48. data/lib/lingo/attendee/textwriter.rb +196 -0
  49. data/lib/lingo/attendee/tokenizer.rb +218 -0
  50. data/lib/lingo/attendee/variator.rb +185 -0
  51. data/lib/lingo/attendee/vector_filter.rb +158 -0
  52. data/lib/lingo/attendee/wordsearcher.rb +96 -0
  53. data/lib/lingo/attendees.rb +289 -0
  54. data/lib/lingo/cli.rb +62 -0
  55. data/lib/lingo/config.rb +104 -0
  56. data/lib/lingo/const.rb +131 -0
  57. data/lib/lingo/ctl.rb +173 -0
  58. data/lib/lingo/database.rb +587 -0
  59. data/lib/lingo/language.rb +530 -0
  60. data/lib/lingo/modules.rb +98 -0
  61. data/lib/lingo/types.rb +285 -0
  62. data/lib/lingo/utilities.rb +40 -0
  63. data/lib/lingo/version.rb +27 -0
  64. data/lingo-all.cfg +85 -0
  65. data/lingo-call.cfg +15 -0
  66. data/lingo.cfg +78 -0
  67. data/lingo.rb +3 -0
  68. data/lir.cfg +72 -0
  69. data/porter/stem.cfg +311 -0
  70. data/porter/stem.rb +150 -0
  71. data/spec/spec_helper.rb +0 -0
  72. data/test.cfg +79 -0
  73. data/test/attendee/ts_abbreviator.rb +35 -0
  74. data/test/attendee/ts_decomposer.rb +31 -0
  75. data/test/attendee/ts_multiworder.rb +390 -0
  76. data/test/attendee/ts_noneword_filter.rb +19 -0
  77. data/test/attendee/ts_objectfilter.rb +19 -0
  78. data/test/attendee/ts_sequencer.rb +43 -0
  79. data/test/attendee/ts_synonymer.rb +33 -0
  80. data/test/attendee/ts_textreader.rb +58 -0
  81. data/test/attendee/ts_textwriter.rb +98 -0
  82. data/test/attendee/ts_tokenizer.rb +32 -0
  83. data/test/attendee/ts_variator.rb +24 -0
  84. data/test/attendee/ts_vector_filter.rb +62 -0
  85. data/test/attendee/ts_wordsearcher.rb +119 -0
  86. data/test/lir.csv +3 -0
  87. data/test/lir.txt +12 -0
  88. data/test/lir2.txt +12 -0
  89. data/test/mul.txt +1 -0
  90. data/test/ref/artikel.mul +1 -0
  91. data/test/ref/artikel.non +159 -0
  92. data/test/ref/artikel.seq +270 -0
  93. data/test/ref/artikel.syn +16 -0
  94. data/test/ref/artikel.vec +928 -0
  95. data/test/ref/artikel.ven +928 -0
  96. data/test/ref/artikel.ver +928 -0
  97. data/test/ref/lir.csv +328 -0
  98. data/test/ref/lir.mul +1 -0
  99. data/test/ref/lir.non +274 -0
  100. data/test/ref/lir.seq +249 -0
  101. data/test/ref/lir.syn +94 -0
  102. data/test/test_helper.rb +113 -0
  103. data/test/ts_database.rb +269 -0
  104. data/test/ts_language.rb +396 -0
  105. data/txt/artikel-en.txt +157 -0
  106. data/txt/artikel.txt +170 -0
  107. data/txt/lir.txt +1317 -0
  108. metadata +211 -0
data/lib/lingo/cli.rb ADDED
@@ -0,0 +1,62 @@
1
+ require 'nuggets/util/cli'
2
+
3
class Lingo

  # Command line interface of the +lingo+ executable, layered on ::Util::CLI.
  # It contributes Lingo's option defaults and option definitions and keeps
  # the arguments passed to #run available through #files.
  class CLI < ::Util::CLI

    # Option defaults: the parent's defaults plus Lingo's own entries.
    def self.defaults
      super.merge(
        config:   'lingo.cfg',
        language: 'de',
        status:   false,
        perfmon:  false
      )
    end

    # Arguments that were handed to #run.
    attr_reader :files

    # Stores +arguments+ so that they can be read back via #files.
    def run(arguments)
      @files = arguments
    end

    private

    # Accepts and ignores any arguments; the configuration is always set to
    # an empty hash here.
    def load_config(*)
      @config = {}
    end

    # Registers Lingo's command line switches on the option parser +opts+.
    def opts(opts)
      # Tell the user up front when the default config file cannot be read.
      missing_hint = ' (currently not present)' unless File.readable?(defaults[:config])
      config_desc  = "Config file [Default: #{defaults[:config]}#{missing_hint}]"

      opts.on('-c', '--config YAML', config_desc) do |config|
        options[:config] = config
      end

      opts.separator ''

      language_desc = "Language for processing [Default: #{defaults[:language]}]"

      opts.on('-l', '--language LANG', language_desc) do |language|
        options[:language] = language
      end

      opts.separator ''

      opts.on('-s', '--status', 'Print status information after processing') do
        options[:status] = true
      end

      opts.on('-p', '--perfmon', 'Print performance details after processing') do
        options[:perfmon] = true
      end

      opts.separator ''

      opts.on('-L', '--log FILE', 'Log file to print debug and status information to') do |log|
        # The log file is opened in append mode and @stderr is re-associated
        # with it; the result of that reopen is what ends up in options[:log].
        log_file      = File.open(log, 'a+', encoding: ENC)
        options[:log] = @stderr.reopen(log_file)
      end
    end

  end

end
data/lib/lingo/config.rb ADDED
@@ -0,0 +1,104 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
+ # Mehrworterkennung und Relationierung.
6
+ #
7
+ # Copyright (C) 2005-2007 John Vorhauer
8
+ # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
+ #
10
+ # This program is free software; you can redistribute it and/or modify it under
11
+ # the terms of the GNU Affero General Public License as published by the Free
12
+ # Software Foundation; either version 3 of the License, or (at your option)
13
+ # any later version.
14
+ #
15
+ # This program is distributed in the hope that it will be useful, but WITHOUT
16
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
+ # details.
19
+ #
20
+ # You should have received a copy of the GNU Affero General Public License along
21
+ # with this program; if not, write to the Free Software Foundation, Inc.,
22
+ # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
+ #
24
+ # For more information visit http://www.lex-lingo.de or contact me at
25
+ # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
+ #
27
+ # Lex Lingo rules from here on
28
+ #++
29
+
30
+ require 'yaml'
31
+ require_relative 'cli'
32
+
33
class Lingo

  # Collects the settings of a Lingo run: the command line options parsed by
  # CLI plus the contents of the language and configuration YAML files.
  # Values are addressed with slash-separated, case-insensitive key paths
  # such as 'meeting/attendees'.
  class Config

    # Parses +args+ with a CLI instance, copies the resulting options into
    # the settings (with string keys), loads the language and configuration
    # files on top of them and finally wires the input files into the first
    # text reader found among the configured attendees.
    def initialize(*args)
      @cli, @opts = CLI.new, {}

      @cli.execute(*args)
      @cli.options.each { |key, val| @opts[key.to_s] = val }

      load_config('language', :lang)
      load_config('config')

      assign_input_files
    end

    # Returns the value stored under the key path +key+, or +nil+ as soon as
    # any node along the path is missing (or is +nil+/+false+). An empty key
    # yields the complete settings hash.
    def [](key)
      key_to_nodes(key).inject(@opts) { |value, node| value[node] if value }
    end

    # Stores +value+ under the key path +key+; missing (or +nil+/+false+)
    # intermediate nodes are replaced by fresh hashes on the way down.
    def []=(key, value)
      *parents, leaf = key_to_nodes(key)
      parents.inject(@opts) { |hash, node| hash[node] ||= {} }[leaf] = value
    end

    # Standard input stream as provided by the CLI object.
    def stdin
      @cli.stdin
    end

    # Standard output stream as provided by the CLI object.
    def stdout
      @cli.stdout
    end

    # Standard error stream as provided by the CLI object.
    def stderr
      @cli.stderr
    end

    # Forwards +args+ to the CLI object's +quit+ method (invoked through
    # +send+, so it also works when that method is not public).
    def quit(*args)
      @cli.send(:quit, *args)
    end

    private

    # Sets the 'files' entry of the first attendee that has a 'textreader'
    # section; later text readers are left untouched.
    #
    # * A configured value of '$(files)' is replaced by the files given on
    #   the command line.
    # * Any other configured value is split at STRING_SEPERATOR_PATTERN.
    # * Without a configured value, the command line files are used, unless
    #   there are none.
    def assign_input_files
      attendee = Array(self['meeting/attendees']).find { |a| a['textreader'] }
      return unless attendee

      reader, files = attendee['textreader'], @cli.files
      configured    = reader['files']

      if configured
        reader['files'] = configured.strip == '$(files)' ?
          files : configured.split(STRING_SEPERATOR_PATTERN)
      elsif !files.empty?
        reader['files'] = files
      end
    end

    # Splits a key path into its lower-cased nodes.
    def key_to_nodes(key)
      key.downcase.split('/')
    end

    # Joins nodes back into a key path.
    def nodes_to_key(nodes)
      nodes.join('/')
    end

    # Locates the file named by the setting +key+ via Lingo.find (which is
    # given #quit as its block) and merges the file's YAML content into the
    # settings.
    #
    # NOTE(review): the file is parsed with YAML.load; make sure only trusted
    # configuration files can reach this point.
    def load_config(key, type = key.to_sym)
      file = Lingo.find(type, @opts[key], &method(:quit))
      @opts.update(File.open(file, encoding: ENC, &YAML.method(:load)))
    end

  end

end
data/lib/lingo/const.rb ADDED
@@ -0,0 +1,131 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
+ # Mehrworterkennung und Relationierung.
6
+ #
7
+ # Copyright (C) 2005-2007 John Vorhauer
8
+ # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
+ #
10
+ # This program is free software; you can redistribute it and/or modify it under
11
+ # the terms of the GNU Affero General Public License as published by the Free
12
+ # Software Foundation; either version 3 of the License, or (at your option)
13
+ # any later version.
14
+ #
15
+ # This program is distributed in the hope that it will be useful, but WITHOUT
16
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
+ # details.
19
+ #
20
+ # You should have received a copy of the GNU Affero General Public License along
21
+ # with this program; if not, write to the Free Software Foundation, Inc.,
22
+ # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
+ #
24
+ # For more information visit http://www.lex-lingo.de or contact me at
25
+ # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
+ #
27
+ # Lex Lingo rules from here on
28
+ #++
29
+
30
class Lingo

  # Name of the encoding used throughout Lingo.
  ENC = 'UTF-8'

  # Characters that separate the items of a list given as a single string:
  # semicolon, space, comma or pipe.
  STRING_SEPERATOR_PATTERN = /[; ,\|]/

  # String constants in the data stream
  CHAR_PUNCT = '.'

  # Printable characters for the tokenizer (UTF-8 encoding), given as
  # character class sources.

  # Digits
  UTF_8_DIGIT  = '[0-9]'
  # Basic Latin letters (block U+0000 to U+007f)
  UTF_8_BASLAT = '[A-Za-z]'
  # Latin-1 Supplement letters (block U+0080 to U+00ff)
  UTF_8_LAT1SP = '[\xc3\x80-\xc3\x96\xc3\x98-\xc3\xb6\xc3\xb8-\xc3\xbf]'
  # Latin Extended-A letters (block U+0100 to U+017f)
  UTF_8_LATEXA = '[\xc4\x80-\xc4\xbf\xc5\x80-\xc5\xbf]'
  # Latin Extended-B letters (block U+0180 to U+024f)
  UTF_8_LATEXB = '[\xc6\x80-\xc6\xbf\xc7\x80-\xc7\xbf\xc8\x80-\xc8\xbf\xc9\x80-\xc9\x8f]'
  # IPA Extensions letters (block U+0250 to U+02af)
  UTF_8_IPAEXT = '[\xc9\xa0-\xc9\xbf\xca\xa0-\xca\xaf]'

  # Alternation of all character classes above (Unicode range U+0000 to U+02af)
  UTF_8_CHAR = [
    UTF_8_DIGIT,
    UTF_8_BASLAT,
    UTF_8_LAT1SP,
    UTF_8_LATEXA,
    UTF_8_LATEXB,
    UTF_8_IPAEXT
  ].join('|')

  # UTF_8_CHAR plus the characters '<', '>' and '-'
  PRINTABLE_CHAR = [UTF_8_CHAR, '[<>-]'].join('|')

  # Status output: format strings ...
  STA_FORMAT_INT = ' %-20s = %d'
  STA_FORMAT_FLT = ' %-20s = %6.5f'

  # ... and labels
  STA_NUM_COMMANDS = 'Received Commands'
  STA_NUM_OBJECTS = 'Received Objects '
  STA_TIM_COMMANDS = 'Time to control '
  STA_TIM_OBJECTS = 'Time to process '
  STA_PER_OBJECT = 'Time per object '
  STA_PER_COMMAND = 'Time per command '

  # Stream commands
  STR_CMD_TALK   = 'TALK'
  STR_CMD_STATUS = 'STATUS'
  STR_CMD_ERR    = 'ERR'
  STR_CMD_WARN   = 'WARN'
  STR_CMD_LIR    = 'LIR-FORMAT'
  STR_CMD_FILE   = 'FILE'
  STR_CMD_EOL    = 'EOL'
  STR_CMD_RECORD = 'RECORD'
  STR_CMD_EOF    = 'EOF'

  # Token attributes
  TA_WORD         = 'WORD'
  TA_PUNCTUATION  = 'PUNC'
  TA_NUMERICAL    = 'NUMS'
  TA_URL          = 'URLS'
  TA_ABREVIATION  = 'ABRV'
  TA_ABREVIATION1 = 'ABRS'
  TA_OTHER        = 'OTHR'
  TA_STOPWORD     = 'STOP'

  # Word attributes
  WA_UNSET      = '-'    # default attribute when a Word object is initialised
  WA_IDENTIFIED = 'IDF'  # status after the word has been found in the dictionary
  WA_UNKNOWN    = '?'    # status when the word could not be found
  WA_KOMPOSITUM = 'KOM'  # word has been recognised as a compound
  WA_MULTIWORD  = 'MUL'  # word is a multiword group
  WA_SEQUENCE   = 'SEQ'  # NOTE(review): the original comment repeated "multiword group" here; presumably marks a sequence
  WA_UNKMULPART = 'MU?'  # word is unknown, but part of a multiword group

  # Lexical attributes (word classes)
  LA_SUBSTANTIV = 's'
  LA_ADJEKTIV   = 'a'
  LA_VERB       = 'v'
  LA_EIGENNAME  = 'e'
  LA_KOMPOSITUM = 'k'
  LA_MULTIWORD  = 'm'
  LA_SEQUENCE   = 'q'
  LA_WORTFORM   = 'w'
  LA_SYNONYM    = 'y'
  LA_STOPWORD   = 't'
  LA_TAKEITASIS = 'x'
  LA_UNKNOWN    = '?'

  # All lexical attributes except LA_SEQUENCE, concatenated into a single
  # string that starts with LA_UNKNOWN and ends with LA_MULTIWORD.
  LA_SORTORDER = [
    LA_UNKNOWN,
    LA_SYNONYM,
    LA_TAKEITASIS,
    LA_STOPWORD,
    LA_WORTFORM,
    LA_EIGENNAME,
    LA_ADJEKTIV,
    LA_VERB,
    LA_SUBSTANTIV,
    LA_KOMPOSITUM,
    LA_MULTIWORD
  ].join

  # Field separators for DBM files
  KEY_SEP = '='
  FLD_SEP = '|'
  IDX_REF = '^'
  KEY_REF = '*'
  SYS_KEY = '~'

end
data/lib/lingo/ctl.rb ADDED
@@ -0,0 +1,173 @@
1
+ require 'optparse'
2
+ require 'fileutils'
3
+ require 'nuggets/enumerable/minmax'
4
+
5
class Lingo

  # Implementation of the +lingoctl+ command line tool: listing, finding and
  # copying configurations, languages, dictionaries and stores, plus a few
  # informational commands. Every command +name+ is served by a +do_name+
  # method; all methods are callable on the module itself.
  module Ctl

    extend self

    PROG, VERSION = $0, '0.0.1'
    PROGNAME = File.basename(PROG)

    # Command name => description; a description is either a plain string or
    # an array of the form [summary, extra help line, ...].
    COMMANDS = {}

    { config: %w[configuration],
      lang:   %w[language],
      dict:   %w[dictionary dictionaries],
      store:  %w[store] }.each { |what, (sing, plur)|
      # Stores can only be found, neither listed nor copied.
      COMMANDS["list#{what}"] = [
        "List available #{plur || "#{sing}s"}", 'Arguments: [name...]'
      ] if what != :store
      COMMANDS["find#{what}"] = [
        "Find #{sing} in Lingo search path", 'Arguments: name'
      ]
      COMMANDS["copy#{what}"] = [
        "Copy #{sing} to local Lingo directory", 'Arguments: name'
      ] if what != :store

      # do_listconfig, do_findlang, ... simply delegate to the private
      # list/find/copy helpers with the kind of file as argument.
      %w[list find copy].each { |action|
        define_method("do_#{action}#{what}") { send(action, what) }
      }
    }

    COMMANDS.update(
      'path'    => 'Print search path for dictionaries and configurations',
      'help'    => 'Print help for available commands',
      'version' => 'Print Lingo version number'
    )

    USAGE = <<EOT
Usage: #{PROG} <command> [arguments] [options]
#{PROG} [-h|--help] [--version]
EOT

    # Options collected by #parse_options (currently only :scope).
    OPTIONS = {}

    # Entry point: parses the options out of ARGV, takes the next argument
    # as command name and runs it. A missing or unknown command leads to the
    # usage message.
    def do
      parse_options

      command = ARGV.shift
      send("do_#{COMMANDS.has_key?(command) ? command : 'usage'}")
    end

    private

    # Prints every file reported by Lingo.list for +what+ (restricted to the
    # selected scope) whose base name matches one of the remaining
    # arguments; without arguments, all files are printed.
    def list(what)
      patterns = ARGV.empty? ? [''] : ARGV
      names    = Regexp.union(*patterns)

      Lingo.list(what, path: path_for_scope).each { |file|
        puts file if File.basename(file) =~ names
      }
    end

    # Looks up the file for the required argument +name+ via Lingo.find
    # (restricted to the selected scope; #do_usage is passed as its block).
    # Prints the file if +doit+ is true, returns it otherwise.
    def find(what, doit = true)
      name = ARGV.shift or do_usage('Required argument `name\' missing.')
      no_args

      file = Lingo.find(what, name, path: path_for_scope, &method(:do_usage))
      doit ? puts(file) : file
    end

    # Copies the file found for +what+ below the local Lingo directory,
    # creating the target directory first. Copying from the local scope, or
    # onto the source itself, is rejected with a usage error.
    def copy(what)
      do_usage('Source and target are the same.') if OPTIONS[:scope] == :local

      source = find(what, false)
      target = File.join(path_for_scope(:local), Lingo.basepath(what, source))

      do_usage('Source and target are the same.') if source == target

      FileUtils.mkdir_p(File.dirname(target))
      FileUtils.cp(source, target, verbose: true)
    end

    # Prints the search path: the directory of the selected scope, or PATH
    # when no scope was selected.
    def do_path
      no_args
      puts path_for_scope || PATH
    end

    # Writes the command overview to standard error and terminates the
    # program via +abort+. When +opts+ (the option parser) is given, its
    # help text precedes the overview and the extra help lines of the
    # commands are left out.
    def do_help(opts = nil)
      no_args

      msg = %w[Commands:]
      msg.unshift(opts) if opts

      # Width of the command column: length of the longest command name.
      width = COMMANDS.keys.map(&:length).max

      COMMANDS.each { |command, description|
        summary, *extras = [*description]
        msg << " %-#{width}s - %s" % [command, summary]

        extras.each { |extra|
          msg << " %#{width}s + %s" % [' ', extra]
        } unless opts
      }

      abort msg.join("\n")
    end

    # Prints the Lingo version string if +doit+ is true, returns it
    # otherwise.
    def do_version(doit = true)
      no_args

      msg = "Lingo v#{Lingo::VERSION}"
      doit ? puts(msg) : msg
    end

    # Writes the usage message, preceded by the error +msg+ if there is one,
    # to standard error and terminates the program via +abort+.
    def do_usage(msg = nil)
      prefix = msg ? "#{PROGNAME}: #{msg}\n\n" : ''
      abort "#{prefix}#{USAGE}"
    end

    # Defines the command line options and removes them from ARGV while
    # parsing. The scope options record their choice in OPTIONS[:scope];
    # the help and version options terminate the program.
    def parse_options
      OptionParser.new(USAGE, 14) { |opts|
        opts.separator ''
        opts.separator 'Scope options:'

        opts.on('--system', 'Restrict command to the system-wide Lingo directory') {
          OPTIONS[:scope] = :system
        }

        opts.on('--global', 'Restrict command to the user\'s personal Lingo directory') {
          OPTIONS[:scope] = :global
        }

        opts.on('--local', 'Restrict command to the local Lingo directory') {
          OPTIONS[:scope] = :local
        }

        opts.separator ''
        opts.separator 'Generic options:'

        opts.on('-h', '--help', 'Print this help message and exit') {
          do_help(opts)
        }

        opts.on('--version', 'Print program version and exit') {
          abort "#{PROGNAME} v#{VERSION} (#{do_version(false)})"
        }
      }.parse!
    end

    # Maps +scope+ to a one-element search path: BASE for :system, HOME for
    # :global, CURR for :local. Returns +nil+ when no scope is set; any
    # other value is a usage error.
    def path_for_scope(scope = OPTIONS[:scope])
      case scope
        when :system then [BASE]
        when :global then [HOME]
        when :local  then [CURR]
        when nil
        else do_usage("Invalid scope `#{scope.inspect}'.")
      end
    end

    # Usage error unless all command line arguments have been consumed.
    def no_args
      do_usage('Too many arguments.') unless ARGV.empty?
    end

  end

  # Runs the +lingoctl+ tool. Errors are re-raised when $VERBOSE is set;
  # otherwise the program is aborted with a one-line summary made of the
  # first backtrace entry, the message and the error class.
  def self.ctl
    Ctl.do
  rescue => err
    raise if $VERBOSE
    abort "#{err.backtrace.first}: #{err} (#{err.class})"
  end

end