lingo 1.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. data/.rspec +1 -0
  2. data/COPYING +663 -0
  3. data/ChangeLog +754 -0
  4. data/README +322 -0
  5. data/Rakefile +100 -0
  6. data/TODO +28 -0
  7. data/bin/lingo +5 -0
  8. data/bin/lingoctl +6 -0
  9. data/de.lang +121 -0
  10. data/de/lingo-abk.txt +74 -0
  11. data/de/lingo-dic.txt +56822 -0
  12. data/de/lingo-mul.txt +3209 -0
  13. data/de/lingo-syn.txt +14841 -0
  14. data/de/test_dic.txt +24 -0
  15. data/de/test_mul.txt +17 -0
  16. data/de/test_mul2.txt +2 -0
  17. data/de/test_singleword.txt +2 -0
  18. data/de/test_syn.txt +4 -0
  19. data/de/test_syn2.txt +1 -0
  20. data/de/user-dic.txt +10 -0
  21. data/en.lang +113 -0
  22. data/en/lingo-dic.txt +55434 -0
  23. data/en/lingo-mul.txt +456 -0
  24. data/en/user-dic.txt +5 -0
  25. data/info/Objekte.png +0 -0
  26. data/info/Typen.png +0 -0
  27. data/info/database.png +0 -0
  28. data/info/db_small.png +0 -0
  29. data/info/download.png +0 -0
  30. data/info/gpl-hdr.txt +27 -0
  31. data/info/kerze.png +0 -0
  32. data/info/language.png +0 -0
  33. data/info/lingo.png +0 -0
  34. data/info/logo.png +0 -0
  35. data/info/meeting.png +0 -0
  36. data/info/types.png +0 -0
  37. data/lib/lingo.rb +321 -0
  38. data/lib/lingo/attendee/abbreviator.rb +119 -0
  39. data/lib/lingo/attendee/debugger.rb +111 -0
  40. data/lib/lingo/attendee/decomposer.rb +101 -0
  41. data/lib/lingo/attendee/dehyphenizer.rb +167 -0
  42. data/lib/lingo/attendee/multiworder.rb +301 -0
  43. data/lib/lingo/attendee/noneword_filter.rb +103 -0
  44. data/lib/lingo/attendee/objectfilter.rb +86 -0
  45. data/lib/lingo/attendee/sequencer.rb +190 -0
  46. data/lib/lingo/attendee/synonymer.rb +105 -0
  47. data/lib/lingo/attendee/textreader.rb +237 -0
  48. data/lib/lingo/attendee/textwriter.rb +196 -0
  49. data/lib/lingo/attendee/tokenizer.rb +218 -0
  50. data/lib/lingo/attendee/variator.rb +185 -0
  51. data/lib/lingo/attendee/vector_filter.rb +158 -0
  52. data/lib/lingo/attendee/wordsearcher.rb +96 -0
  53. data/lib/lingo/attendees.rb +289 -0
  54. data/lib/lingo/cli.rb +62 -0
  55. data/lib/lingo/config.rb +104 -0
  56. data/lib/lingo/const.rb +131 -0
  57. data/lib/lingo/ctl.rb +173 -0
  58. data/lib/lingo/database.rb +587 -0
  59. data/lib/lingo/language.rb +530 -0
  60. data/lib/lingo/modules.rb +98 -0
  61. data/lib/lingo/types.rb +285 -0
  62. data/lib/lingo/utilities.rb +40 -0
  63. data/lib/lingo/version.rb +27 -0
  64. data/lingo-all.cfg +85 -0
  65. data/lingo-call.cfg +15 -0
  66. data/lingo.cfg +78 -0
  67. data/lingo.rb +3 -0
  68. data/lir.cfg +72 -0
  69. data/porter/stem.cfg +311 -0
  70. data/porter/stem.rb +150 -0
  71. data/spec/spec_helper.rb +0 -0
  72. data/test.cfg +79 -0
  73. data/test/attendee/ts_abbreviator.rb +35 -0
  74. data/test/attendee/ts_decomposer.rb +31 -0
  75. data/test/attendee/ts_multiworder.rb +390 -0
  76. data/test/attendee/ts_noneword_filter.rb +19 -0
  77. data/test/attendee/ts_objectfilter.rb +19 -0
  78. data/test/attendee/ts_sequencer.rb +43 -0
  79. data/test/attendee/ts_synonymer.rb +33 -0
  80. data/test/attendee/ts_textreader.rb +58 -0
  81. data/test/attendee/ts_textwriter.rb +98 -0
  82. data/test/attendee/ts_tokenizer.rb +32 -0
  83. data/test/attendee/ts_variator.rb +24 -0
  84. data/test/attendee/ts_vector_filter.rb +62 -0
  85. data/test/attendee/ts_wordsearcher.rb +119 -0
  86. data/test/lir.csv +3 -0
  87. data/test/lir.txt +12 -0
  88. data/test/lir2.txt +12 -0
  89. data/test/mul.txt +1 -0
  90. data/test/ref/artikel.mul +1 -0
  91. data/test/ref/artikel.non +159 -0
  92. data/test/ref/artikel.seq +270 -0
  93. data/test/ref/artikel.syn +16 -0
  94. data/test/ref/artikel.vec +928 -0
  95. data/test/ref/artikel.ven +928 -0
  96. data/test/ref/artikel.ver +928 -0
  97. data/test/ref/lir.csv +328 -0
  98. data/test/ref/lir.mul +1 -0
  99. data/test/ref/lir.non +274 -0
  100. data/test/ref/lir.seq +249 -0
  101. data/test/ref/lir.syn +94 -0
  102. data/test/test_helper.rb +113 -0
  103. data/test/ts_database.rb +269 -0
  104. data/test/ts_language.rb +396 -0
  105. data/txt/artikel-en.txt +157 -0
  106. data/txt/artikel.txt +170 -0
  107. data/txt/lir.txt +1317 -0
  108. metadata +211 -0
data/lib/lingo/cli.rb ADDED
@@ -0,0 +1,62 @@
1
+ require 'nuggets/util/cli'
2
+
3
class Lingo

  # Command-line option handling for Lingo, layered on top of the
  # Util::CLI base class from the nuggets gem.
  class CLI < ::Util::CLI

    # Returns the superclass defaults merged with Lingo's own option
    # defaults (config file name, language, status and perfmon flags).
    def self.defaults
      lingo_defaults = {
        config:   'lingo.cfg',
        language: 'de',
        status:   false,
        perfmon:  false
      }

      super.merge(lingo_defaults)
    end

    # The arguments handed to #run.
    attr_reader :files

    # Stores +arguments+ so that they can be read back via #files.
    def run(arguments)
      @files = arguments
    end

    private

    # Ignores all arguments and resets the configuration to an empty hash.
    # NOTE(review): presumably overrides a Util::CLI hook so that the base
    # class does not read a config file itself -- confirm against nuggets.
    def load_config(*)
      @config = {}
    end

    # Registers Lingo's command-line switches on the option parser +opts+.
    # +defaults+, +options+ and @stderr are not defined in this class; they
    # are presumably provided by Util::CLI.
    def opts(opts)
      default_config = defaults[:config]

      # Only mention a missing file; nil interpolates as the empty string.
      presence_note = File.readable?(default_config) ? nil : ' (currently not present)'

      opts.on('-c', '--config YAML', "Config file [Default: #{default_config}#{presence_note}]") do |file|
        options[:config] = file
      end

      opts.separator ''

      opts.on('-l', '--language LANG', "Language for processing [Default: #{defaults[:language]}]") do |lang|
        options[:language] = lang
      end

      opts.separator ''

      opts.on('-s', '--status', 'Print status information after processing') do
        options[:status] = true
      end

      opts.on('-p', '--perfmon', 'Print performance details after processing') do
        options[:perfmon] = true
      end

      opts.separator ''

      opts.on('-L', '--log FILE', 'Log file to print debug and status information to') do |path|
        # Redirect the CLI's error stream into the log file (append mode).
        log_file = File.open(path, 'a+', encoding: ENC)
        options[:log] = @stderr.reopen(log_file)
      end
    end

  end

end
@@ -0,0 +1,104 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
+ # Mehrworterkennung und Relationierung.
6
+ #
7
+ # Copyright (C) 2005-2007 John Vorhauer
8
+ # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
+ #
10
+ # This program is free software; you can redistribute it and/or modify it under
11
+ # the terms of the GNU Affero General Public License as published by the Free
12
+ # Software Foundation; either version 3 of the License, or (at your option)
13
+ # any later version.
14
+ #
15
+ # This program is distributed in the hope that it will be useful, but WITHOUT
16
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
+ # details.
19
+ #
20
+ # You should have received a copy of the GNU Affero General Public License along
21
+ # with this program; if not, write to the Free Software Foundation, Inc.,
22
+ # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
+ #
24
+ # For more information visit http://www.lex-lingo.de or contact me at
25
+ # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
+ #
27
+ # Lex Lingo rules from here on
28
+ #++
29
+
30
+ require 'yaml'
31
+ require_relative 'cli'
32
+
33
class Lingo

  # Runtime configuration of a Lingo run.
  #
  # Combines the command-line options with the contents of the language
  # and configuration YAML files into one nested hash, which is addressed
  # through slash-separated, case-insensitive paths such as
  # <tt>'meeting/attendees'</tt>.
  class Config

    # Parses +args+ through the CLI, loads the language and config files
    # and injects the files named on the command line into the first
    # +textreader+ attendee found in the configuration.
    def initialize(*args)
      @cli, @opts = CLI.new, {}

      # NOTE(review): presumably parses +args+ and fills @cli.options and
      # @cli.files -- the base class lives in the nuggets gem.
      @cli.execute(*args)
      @cli.options.each { |key, val| @opts[key.to_s] = val }

      load_config('language', :lang)
      load_config('config')

      Array(self['meeting/attendees']).each { |attendee|
        next unless (reader = attendee['textreader'])

        cli_files = @cli.files

        if (spec = reader['files'])
          # The placeholder '$(files)' stands for the command-line files;
          # any other value is treated as a separated list of file names.
          reader['files'] = spec.strip == '$(files)' ?
            cli_files : spec.split(STRING_SEPERATOR_PATTERN)
        elsif !cli_files.empty?
          reader['files'] = cli_files
        end

        # Only the first textreader is adjusted.
        break
      }
    end

    # Returns the value stored under the slash-separated path +key+
    # (matched case-insensitively), or +nil+ as soon as any node along
    # the path is missing. The empty path returns the whole option hash.
    def [](key)
      key_to_nodes(key).inject(@opts) { |value, node|
        # A missing intermediate node ends the walk instead of raising
        # NoMethodError on nil.
        break if value.nil?
        value[node]
      }
    end

    # Stores +value+ under the slash-separated path +key+. Intermediate
    # hashes that do not exist yet are created on the way.
    def []=(key, value)
      nodes = key_to_nodes(key); node = nodes.pop
      (self[nodes_to_key(nodes)] ||= {})[node] = value
    end

    # The CLI's input stream.
    def stdin
      @cli.stdin
    end

    # The CLI's output stream.
    def stdout
      @cli.stdout
    end

    # The CLI's error stream.
    def stderr
      @cli.stderr
    end

    # Forwards +args+ to the CLI's +quit+ method; +send+ is used because
    # that method is presumably not public.
    def quit(*args)
      @cli.send(:quit, *args)
    end

    private

    # Splits a path like 'Meeting/Attendees' into its lower-cased nodes.
    def key_to_nodes(key)
      key.downcase.split('/')
    end

    # Inverse of #key_to_nodes: joins nodes back into a path.
    def nodes_to_key(nodes)
      nodes.join('/')
    end

    # Locates the file of the given +type+ named by the option +key+ via
    # Lingo.find (which receives #quit as its block, presumably to report
    # a failed lookup) and merges its YAML content into the options.
    def load_config(key, type = key.to_sym)
      file = Lingo.find(type, @opts[key], &method(:quit))
      # NOTE(review): YAML.load is not restricted to plain data; this
      # assumes the configuration files are trusted.
      @opts.update(File.open(file, encoding: ENC, &YAML.method(:load)))
    end

  end

end
@@ -0,0 +1,131 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
+ # Mehrworterkennung und Relationierung.
6
+ #
7
+ # Copyright (C) 2005-2007 John Vorhauer
8
+ # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
+ #
10
+ # This program is free software; you can redistribute it and/or modify it under
11
+ # the terms of the GNU Affero General Public License as published by the Free
12
+ # Software Foundation; either version 3 of the License, or (at your option)
13
+ # any later version.
14
+ #
15
+ # This program is distributed in the hope that it will be useful, but WITHOUT
16
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
+ # details.
19
+ #
20
+ # You should have received a copy of the GNU Affero General Public License along
21
+ # with this program; if not, write to the Free Software Foundation, Inc.,
22
+ # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
+ #
24
+ # For more information visit http://www.lex-lingo.de or contact me at
25
+ # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
+ #
27
+ # Lex Lingo rules from here on
28
+ #++
29
+
30
class Lingo

  # Name of the encoding handed to File.open when Lingo reads or writes files.
  ENC = 'UTF-8'

  # Characters that separate the entries of a list given as one string
  # (semicolon, space, comma or pipe). The misspelling is part of the name.
  STRING_SEPERATOR_PATTERN = /[; ,\|]/

  # String constants in the data stream
  CHAR_PUNCT = '.'

  # Character classes (as regular expression source fragments) describing
  # the printable characters the tokenizer accepts in UTF-8 text.
  # Digits
  UTF_8_DIGIT = '[0-9]'
  # Basic Latin letters (block U+0000 to U+007f)
  UTF_8_BASLAT = '[A-Za-z]'
  # Latin-1 Supplement letters (block U+0080 to U+00ff)
  UTF_8_LAT1SP = '[\xc3\x80-\xc3\x96\xc3\x98-\xc3\xb6\xc3\xb8-\xc3\xbf]'
  # Latin Extended-A (block U+0100 to U+017f)
  UTF_8_LATEXA = '[\xc4\x80-\xc4\xbf\xc5\x80-\xc5\xbf]'
  # Latin Extended-B (block U+0180 to U+024f)
  UTF_8_LATEXB = '[\xc6\x80-\xc6\xbf\xc7\x80-\xc7\xbf\xc8\x80-\xc8\xbf\xc9\x80-\xc9\x8f]'
  # IPA Extensions.
  # NOTE(review): the block is U+0250 to U+02af, but these byte ranges look
  # like they start at U+0260 and skip U+0280 to U+029f -- confirm intent.
  UTF_8_IPAEXT = '[\xc9\xa0-\xc9\xbf\xca\xa0-\xca\xaf]'

  # Alternation of all character classes above
  UTF_8_CHAR = [
    UTF_8_DIGIT,
    UTF_8_BASLAT,
    UTF_8_LAT1SP,
    UTF_8_LATEXA,
    UTF_8_LATEXB,
    UTF_8_IPAEXT
  ].join('|')

  # UTF_8_CHAR plus angle brackets and the hyphen
  PRINTABLE_CHAR = UTF_8_CHAR + '|[<>-]'

  # Status vars: output formats and labels
  STA_FORMAT_INT = ' %-20s = %d'
  STA_FORMAT_FLT = ' %-20s = %6.5f'
  STA_NUM_COMMANDS = 'Received Commands'
  STA_NUM_OBJECTS = 'Received Objects '
  STA_TIM_COMMANDS = 'Time to control '
  STA_TIM_OBJECTS = 'Time to process '
  STA_PER_OBJECT = 'Time per object '
  STA_PER_COMMAND = 'Time per command '

  # Stream commands
  STR_CMD_TALK = 'TALK'
  STR_CMD_STATUS = 'STATUS'
  STR_CMD_ERR = 'ERR'
  STR_CMD_WARN = 'WARN'
  STR_CMD_LIR = 'LIR-FORMAT'
  STR_CMD_FILE = 'FILE'
  STR_CMD_EOL = 'EOL'
  STR_CMD_RECORD = 'RECORD'
  STR_CMD_EOF = 'EOF'

  # Token attributes
  TA_WORD = 'WORD'
  TA_PUNCTUATION = 'PUNC'
  TA_NUMERICAL = 'NUMS'
  TA_URL = 'URLS'
  TA_ABREVIATION = 'ABRV'
  TA_ABREVIATION1 = 'ABRS'
  TA_OTHER = 'OTHR'
  TA_STOPWORD = 'STOP'

  # Word attributes
  WA_UNSET = '-' # default attribute when a Word object is initialized
  WA_IDENTIFIED = 'IDF' # status once the word has been found in the dictionary
  WA_UNKNOWN = '?' # status when the word could not be found
  WA_KOMPOSITUM = 'KOM' # word has been recognized as a compound
  WA_MULTIWORD = 'MUL' # word is a multiword group
  WA_SEQUENCE = 'SEQ' # presumably a word sequence (the original comment repeated the multiword text)
  WA_UNKMULPART = 'MU?' # word is unknown, but part of a multiword group

  # Lexical attributes (word classes)
  LA_SUBSTANTIV = 's'
  LA_ADJEKTIV = 'a'
  LA_VERB = 'v'
  LA_EIGENNAME = 'e'
  LA_KOMPOSITUM = 'k'
  LA_MULTIWORD = 'm'
  LA_SEQUENCE = 'q'
  LA_WORTFORM = 'w'
  LA_SYNONYM = 'y'
  LA_STOPWORD = 't'
  LA_TAKEITASIS = 'x'
  LA_UNKNOWN = '?'

  # All lexical attributes except LA_SEQUENCE concatenated into one string,
  # running from LA_UNKNOWN (first character) to LA_MULTIWORD (last).
  LA_SORTORDER = [
    LA_UNKNOWN,
    LA_SYNONYM,
    LA_TAKEITASIS,
    LA_STOPWORD,
    LA_WORTFORM,
    LA_EIGENNAME,
    LA_ADJEKTIV,
    LA_VERB,
    LA_SUBSTANTIV,
    LA_KOMPOSITUM,
    LA_MULTIWORD
  ].join

  # Field separators for DBM files
  KEY_SEP = '='
  FLD_SEP = '|'
  IDX_REF = '^'
  KEY_REF = '*'
  SYS_KEY = '~'

end
data/lib/lingo/ctl.rb ADDED
@@ -0,0 +1,173 @@
1
+ require 'optparse'
2
+ require 'fileutils'
3
+ require 'nuggets/enumerable/minmax'
4
+
5
class Lingo

  # Command dispatcher behind Lingo.ctl (presumably the backend of the
  # +lingoctl+ executable): lists, finds and copies configurations,
  # languages, dictionaries and stores within the Lingo search path.
  module Ctl

    extend self

    PROG = $0
    VERSION = '0.0.1'
    PROGNAME = File.basename(PROG)

    # Maps each command name to its description: either a plain string or
    # a pair of [summary, argument hint].
    COMMANDS = {}

    # Resource kind => [singular, optional irregular plural].
    { config: %w[configuration],
      lang:   %w[language],
      dict:   %w[dictionary dictionaries],
      store:  %w[store] }.each do |what, (singular, plural)|
      # Stores can only be found, not listed or copied; the insertion
      # order list/find/copy determines the order of the help output.
      unless what == :store
        COMMANDS["list#{what}"] = [
          "List available #{plural || "#{singular}s"}", 'Arguments: [name...]'
        ]
      end

      COMMANDS["find#{what}"] = [
        "Find #{singular} in Lingo search path", 'Arguments: name'
      ]

      unless what == :store
        COMMANDS["copy#{what}"] = [
          "Copy #{singular} to local Lingo directory", 'Arguments: name'
        ]
      end

      # do_listconfig, do_findconfig, ... simply delegate to the generic
      # helper with the resource kind as its argument.
      %w[list find copy].each do |action|
        define_method("do_#{action}#{what}") { send(action, what) }
      end
    end

    COMMANDS.update(
      'path'    => 'Print search path for dictionaries and configurations',
      'help'    => 'Print help for available commands',
      'version' => 'Print Lingo version number'
    )

    USAGE = <<EOT
Usage: #{PROG} <command> [arguments] [options]
#{PROG} [-h|--help] [--version]
EOT

    # Options collected by #parse_options (currently only :scope).
    OPTIONS = {}

    # Entry point: parses the options, then runs the command named by the
    # first remaining argument. Unknown or missing commands print the usage.
    def do
      parse_options

      command = ARGV.shift
      command = 'usage' unless COMMANDS.key?(command)

      send("do_#{command}")
    end

    private

    # Prints every available file of kind +what+ whose base name matches
    # one of the remaining arguments (all files when there are none).
    def list(what)
      patterns = ARGV.empty? ? [''] : ARGV
      names = Regexp.union(*patterns)

      Lingo.list(what, path: path_for_scope).each do |file|
        puts file if File.basename(file) =~ names
      end
    end

    # Looks up the file of kind +what+ named by the next argument. Prints
    # the result when +doit+ is true, returns it otherwise.
    def find(what, doit = true)
      name = ARGV.shift
      do_usage('Required argument `name\' missing.') unless name
      no_args

      file = Lingo.find(what, name, path: path_for_scope, &method(:do_usage))

      if doit
        puts(file)
      else
        file
      end
    end

    # Copies the file found for the next argument into the local Lingo
    # directory, creating the target directory as needed.
    def copy(what)
      same = 'Source and target are the same.'
      do_usage(same) if OPTIONS[:scope] == :local

      source = find(what, false)
      target = File.join(path_for_scope(:local), Lingo.basepath(what, source))

      do_usage(same) if source == target

      FileUtils.mkdir_p(File.dirname(target))
      FileUtils.cp(source, target, verbose: true)
    end

    # Prints the search path for the selected scope, or the full PATH.
    def do_path
      no_args
      puts(path_for_scope || PATH)
    end

    # Aborts with the command overview. When +opts+ (the option parser) is
    # given, its help text is put first and the argument hints are left out.
    def do_help(opts = nil)
      no_args

      lines = %w[Commands:]
      lines.unshift(opts) if opts

      # max(:length) comes from nuggets/enumerable/minmax.
      max = COMMANDS.keys.max(:length)

      COMMANDS.each do |command, description|
        summary, *extras = [*description]
        lines << format(" %-#{max}s - %s", command, summary)

        next if opts

        extras.each { |extra| lines << format(" %#{max}s + %s", ' ', extra) }
      end

      abort lines.join("\n")
    end

    # Prints the Lingo version when +doit+ is true, returns it otherwise.
    def do_version(doit = true)
      no_args

      version = "Lingo v#{Lingo::VERSION}"

      if doit
        puts(version)
      else
        version
      end
    end

    # Aborts with the usage text, preceded by +msg+ when one is given.
    def do_usage(msg = nil)
      prefix = msg ? "#{PROGNAME}: #{msg}\n\n" : ''
      abort "#{prefix}#{USAGE}"
    end

    # Parses the scope and generic options out of ARGV (destructively).
    def parse_options
      parser = OptionParser.new(USAGE, 14) do |opts|
        opts.separator ''
        opts.separator 'Scope options:'

        opts.on('--system', 'Restrict command to the system-wide Lingo directory') do
          OPTIONS[:scope] = :system
        end

        opts.on('--global', 'Restrict command to the user\'s personal Lingo directory') do
          OPTIONS[:scope] = :global
        end

        opts.on('--local', 'Restrict command to the local Lingo directory') do
          OPTIONS[:scope] = :local
        end

        opts.separator ''
        opts.separator 'Generic options:'

        opts.on('-h', '--help', 'Print this help message and exit') do
          do_help(opts)
        end

        opts.on('--version', 'Print program version and exit') do
          abort "#{PROGNAME} v#{VERSION} (#{do_version(false)})"
        end
      end

      parser.parse!
    end

    # Returns the one-element search path for +scope+, nil when no scope
    # is selected; any other value aborts with a usage error.
    def path_for_scope(scope = OPTIONS[:scope])
      case scope
      when :system
        [BASE]
      when :global
        [HOME]
      when :local
        [CURR]
      when nil
        nil
      else
        do_usage("Invalid scope `#{scope.inspect}'.")
      end
    end

    # Aborts with a usage error if unprocessed arguments remain.
    def no_args
      do_usage('Too many arguments.') unless ARGV.empty?
    end

  end

  # Runs the control program. Errors are re-raised when $VERBOSE is set;
  # otherwise they are condensed into a one-line abort message.
  def self.ctl
    begin
      Ctl.do
    rescue => err
      raise if $VERBOSE

      location = err.backtrace.first
      abort "#{location}: #{err} (#{err.class})"
    end
  end

end