lingo 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -0,0 +1,173 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
# A Dictionary looks words up in one or more lexical sources and, when a
# plain lookup fails, retries with configured suffixes/infixes stripped
# or replaced.
class Dictionary

  include Cachable
  include Reportable

  # Builds a dictionary over the sources named in +config+.
  #
  # config:: settings hash; must have a 'source' key whose value responds
  #          to +map+ (each entry is handed to +lingo.lexical_hash+). The
  #          optional 'mode' key selects whether #select collects hits
  #          from all sources ('all', also the default when the key is
  #          absent) or stops after the first source with a hit.
  # lingo::  object providing +dictionary_config+, +lexical_hash+ and
  #          +dictionaries+; the new dictionary registers itself in the
  #          latter.
  #
  # Raises ArgumentError when +config+ has no 'source' key.
  def initialize(config, lingo)
    unless config.has_key?('source')
      raise ArgumentError, 'Required parameter `source\' missing.'
    end

    init_cachable
    init_reportable

    @suffixes, @infixes = [], []

    if suffix = lingo.dictionary_config['suffix']
      suffix.each { |t, s|
        # Non-destructive on purpose: <tt>downcase!</tt> would modify the
        # caller's configuration in place (and raise on a frozen string).
        t = t.downcase

        s.split.each { |suf|
          # Each entry has the form "ending" or "ending/replacement".
          su, ex = suf.split('/')

          # Word class 'f' marks infixes; everything else is a suffix.
          # A missing replacement is stored as '*' (meaning: just strip).
          (t == 'f' ? @infixes : @suffixes) << [
            Regexp.new("#{su}$", Regexp::IGNORECASE), ex || '*', t
          ]
        }
      }
    end

    @sources     = config['source'].map { |src| lingo.lexical_hash(src) }
    @all_sources = config['mode'].nil? || config['mode'].downcase == 'all'

    lingo.dictionaries << self
  end

  # Closes every underlying source.
  def close
    @sources.each(&:close)
  end

  # Returns this dictionary's report merged with the reports of all of
  # its sources.
  def report
    super.tap { |rep| @sources.each { |src| rep.update(src.report) } }
  end

  # _dic_.find_word( _aString_ ) -> _aNewWord_
  #
  # Creates a Word from the string and looks it up in the dictionary.
  # Results are cached under the downcased string; on a cache hit the
  # cached word's +form+ is set to +str+ before it is returned.
  def find_word(str)
    if hit?(key = str.downcase)
      inc('cache hits')
      return retrieve(key).tap { |word| word.form = str }
    end

    word = Word.new(str, WA_UNKNOWN)

    unless (lexicals = select_with_suffix(str)).empty?
      word.lexicals = lexicals
      word.attr = WA_IDENTIFIED
    end

    # NOTE(review): the return value is whatever Cachable#store returns,
    # presumably the stored word -- confirm against Cachable.
    store(key, word)
  end

  # Collects the dictionary entries found for the forms of +obj+'s
  # lexicals (or for +obj+ itself if it has no lexicals and is unknown).
  #
  # Lexicals that are themselves synonyms are skipped, and for a compound
  # word only its compound lexicals are looked up. Entries that start
  # with Database::KEY_REF followed by digits are dropped.
  def find_synonyms(obj)
    lex = obj.lexicals
    lex = [obj] if lex.empty? && obj.unknown?

    # multiworder optimization
    ref = %r{\A#{Regexp.escape(Database::KEY_REF)}\d+}o

    lex.each_with_object([]) { |l, s|
      next if l.attr == LA_SYNONYM
      next if l.attr != LA_KOMPOSITUM && obj.attr == WA_KOMPOSITUM

      select(l.form).each { |y| s << y unless y =~ ref }
    }
  end

  # _dic_.select( _aString_ ) -> _ArrayOfLexicals_
  #
  # Searches the sources in order and returns the entries of the first
  # source with a hit (+mode = first+) or of all sources (+mode = all+).
  # The result is sorted and free of duplicates.
  def select(str)
    @sources.each_with_object([]) { |src, lex|
      l = src[str] or next
      lex.concat(l)
      break lex unless @all_sources
    }.tap { |lex| lex.sort!; lex.uniq! }
  end

  # _dic_.select_with_suffix( _aString_ ) -> _ArrayOfLexicals_
  #
  # Like #select, but if nothing is found it also looks up the candidate
  # forms obtained by applying the word-class specific suffix rules.
  def select_with_suffix(str)
    select_with_affix(:suffix, str)
  end

  # _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
  #
  # Like #select, but if nothing is found it also looks up the candidate
  # forms obtained by applying the infix rules to the end of the string.
  def select_with_infix(str)
    select_with_affix(:infix, str)
  end

  # _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
  #
  # Returns all candidate lexicals whose suffix rule matches the end of
  # the string:
  #
  #   dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
  def suffix_lexicals(str)
    affix_lexicals(:suffix, str)
  end

  # _dic_.infix_lexicals( _aString_ ) -> _ArrayOfLexicals_
  #
  # Returns all candidate lexicals whose infix rule matches the end of
  # the string.
  def infix_lexicals(str)
    affix_lexicals(:infix, str)
  end

  private

  # Plain lookup first; only if that yields nothing, look up every affix
  # candidate. For suffixes a hit is accepted only when its word class
  # equals the word class of the rule that produced the candidate.
  def select_with_affix(affix, str)
    select(str).tap { |l|
      if l.empty?
        affix_lexicals(affix, str).each { |a| select(a.form).each { |b|
          l << b if affix != :suffix || a.attr == b.attr
        } }
      end
    }
  end

  # Applies every rule from @suffixes or @infixes (chosen via +affix+)
  # to +str+ and returns one Lexical per matching rule, with the matched
  # ending replaced by the rule's replacement ('*' means no replacement).
  def affix_lexicals(affix, str)
    instance_variable_get("@#{affix}es").each_with_object([]) { |(r, e, t), l|
      next unless str =~ r

      l << Lexical.new("#{$~.pre_match}#{e == '*' ? '' : e}#{$~.post_match}", t)
    }
  end

end
170
+
171
+ end
172
+
173
+ end
@@ -0,0 +1,211 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
# The Grammar class holds language-specific grammatical features.
# Currently this is compound recognition, available through
# #find_compositum. A Grammar is initialized exactly like a Dictionary;
# the dictionary given at initialization is the basis for recognizing
# the parts of a compound.

class Grammar

  include Cachable
  include Reportable

  # Splits "front-back" at the last hyphen.
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}

  # initialize(config, lingo) -> _Grammar_
  #
  # config:: attendee-specific parameters (passed on to Dictionary.new)
  # lingo::  object providing +dictionary_config+; its 'compositum'
  #          section tunes compound recognition. A missing section is
  #          treated as empty, so all defaults below apply.
  def initialize(config, lingo)
    init_cachable
    init_reportable

    @dic, @suggestions = Dictionary.new(config, lingo), []

    # Fall back to an empty hash so that a language file without a
    # 'compositum' section doesn't crash on nil.
    cfg = lingo.dictionary_config['compositum'] || {}

    # Words are only checked at all when they are longer than this
    # many characters (default 8).
    @min_word_size = (cfg['min-word-size'] || 8).to_i

    # The average length of the compound parts must be at least this
    # many characters (default 4), otherwise the compound is invalid.
    @min_avg_part_size = (cfg['min-avg-part-size'] || 4).to_i

    # The shortest compound part must be at least this many characters
    # long (default 1).
    @min_part_size = (cfg['min-part-size'] || 1).to_i

    # A compound may consist of at most this many parts (default 4).
    @max_parts = (cfg['max-parts'] || 4).to_i

    # The word class of a compound part can be marked separately to
    # tell it apart from word classes of regular words, e.g.
    # Hausmeister => ['haus/s', 'meister/s'] or, with
    # append-wordclass = '+', Hausmeister => ['haus/s+', 'meister/s+'].
    @append_wc = cfg.fetch('append-wordclass', '')

    # Certain word class sequences can be declared invalid, e.g. two
    # adjectives don't form a compound: skip-sequences = ['aa'].
    @sequences = cfg.fetch('skip-sequences', []).map(&:downcase)
  end

  # Closes the underlying dictionary.
  def close
    @dic.close
  end

  # Returns this grammar's report merged with the dictionary's report.
  def report
    super.update(@dic.report)
  end

  # find_compositum(str) -> word             if level == 1
  # find_compositum(str) -> [lex, sta, seq]  if level != 1
  #
  # Works on different levels because the method is also called
  # recursively; a level greater than 1 corresponds to a recursive call.
  # Only top-level results are cached (keyed by the downcased string).
  #
  # On failure, the top level returns a Word marked WA_UNKNOWN and
  # recursive levels return <tt>[[], [], '']</tt>.
  def find_compositum(str, level = 1, tail = false)
    key, top, empty = str.downcase, level == 1, [[], [], '']

    if top && hit?(key)
      inc('cache hits')
      return retrieve(key)
    end

    com = Word.new(str, WA_UNKNOWN)

    # Strictly greater than: a string of exactly @min_word_size
    # characters is still rejected.
    unless str.length > @min_word_size
      inc('String zu kurz')
      return top ? com : empty
    end

    inc('Komposita geprüft')

    res = permute_compositum(key, level, tail)
    val = !(lex = res.first).empty? && valid?(str, *res[1..-1])

    if top
      if val
        inc('Komposita erkannt')

        com.attr = WA_KOMPOSITUM
        # Part lexicals get the configured word class marker appended;
        # the compound lexical itself is kept unchanged.
        com.lexicals = lex.map { |l|
          l.attr == LA_KOMPOSITUM ? l :
            Lexical.new(l.form, l.attr + @append_wc)
        }
      end

      store(key, com)
    else
      val ? res : empty
    end
  end

  # permute_compositum( _aString_, level, tail ) -> [lex, sta, seq]
  #
  # A hyphenated string is split at its last hyphen only. Otherwise
  # every split position is tried from left to right; the first result
  # whose last lexical is not LA_TAKEITASIS wins. Results ending in
  # LA_TAKEITASIS are remembered as suggestions and the first of them is
  # used if nothing better is found.
  def permute_compositum(str, level, tail)
    return test_compositum($1, '-', $2, level, tail) if str =~ HYPHEN_RE

    sug, len = @suggestions[level] ||= [], str.length

    1.upto(len - 1) { |i|
      res = test_compositum(str[0, i], '', str[i, len], level, tail)

      unless (lex = res.first).empty?
        return res unless lex.last.attr == LA_TAKEITASIS
        sug << res
      end
    }

    sug.empty? ? [[], [], ''] : sug.first.tap { sug.clear }
  end

  # test_compositum() -> [lex, sta, seq]
  #
  # Tests one specific split (+fstr+ + +infix+ + +bstr+) for being a
  # compound. Returns <tt>[[], [], '']</tt> if either part cannot be
  # resolved; otherwise the lexicals of both parts plus the compound
  # lexical, the part lengths, and the word class sequence.
  def test_compositum(fstr, infix, bstr, level, tail)
    sta, seq, empty = [fstr.length, bstr.length], %w[? ?], [[], [], '']

    # --- back part ---
    if !(blex = @dic.select_with_suffix(bstr)).sort!.empty?
      # 1. Word w/ suffix
      bform, seq[1] = tail ? bstr : blex.first.form, blex.first.attr
    elsif tail && !(blex = @dic.select_with_infix(bstr)).sort!.empty?
      # 2. Word w/ infix (only tried when +tail+ is set)
      bform, seq[1] = bstr, blex.first.attr
    elsif infix == '-'
      blex, bsta, bseq = find_compositum(bstr, level + 1, tail)

      if !blex.sort!.empty?
        # 3. Compositum
        bform, seq[1], sta[1..-1] = blex.first.form, bseq, bsta
      else
        # 4. Take it as is
        blex = [Lexical.new(bform = bstr, seq[1] = LA_TAKEITASIS)]
      end
    else
      return empty
    end

    # --- front part ---
    if !(flex = @dic.select_with_infix(fstr)).sort!.empty?
      # 1. Word w/ infix
      fform, seq[0] = fstr, flex.first.attr
    else
      flex, fsta, fseq = find_compositum(fstr, level + 1, true)

      if !flex.sort!.empty?
        # 2. Compositum
        fform, seq[0], sta[0..0] = flex.first.form, fseq, fsta
      elsif infix == '-'
        # 3. Take it as is
        flex = [Lexical.new(fform = fstr, seq[0] = LA_TAKEITASIS)]
      else
        return empty
      end
    end

    # Drop compound lexicals of nested parts and add the one for the
    # whole string instead.
    flex.concat(blex).delete_if { |l| l.attr == LA_KOMPOSITUM }.
      push(Lexical.new(fform + infix + bform, LA_KOMPOSITUM)).sort!

    [flex, sta, seq.join]
  end

  private

  # Checks the configured constraints: number of parts, shortest part,
  # average part length (integer division) and skipped sequences.
  def valid?(str, sta, seq)
    sta.size <= @max_parts &&
      sta.min >= @min_part_size &&
      str.length / sta.size >= @min_avg_part_size &&
      (@sequences.empty? || !@sequences.include?(seq))
  end

end
208
+
209
+ end
210
+
211
+ end
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
# The Lexical class, derived from WordForm, is the container for the base
# form of a word together with its word class.
#
# For example, a base form read from the dictionary is returned as a
# Lexical object, e.g. Lexical.new('Rennen', 'S') -> (rennen/s)

class Lexical < WordForm

  # Ordering used when sorting lexicals.
  #
  # * Anything that is not a Lexical sorts before +self+.
  # * Equal word classes are ordered by form.
  # * An empty word class sorts after a non-empty one.
  # * Otherwise the positions in LA_SORTORDER decide: the word class
  #   with the higher index sorts first, and a listed word class sorts
  #   before an unlisted one. Two unlisted word classes are compared
  #   directly.
  def <=>(other)
    return 1 unless other.is_a?(self.class)
    return form <=> other.form if attr == other.attr

    return 1 if attr.empty?
    return -1 if other.attr.empty?

    mine   = LA_SORTORDER.index(attr)
    theirs = LA_SORTORDER.index(other.attr)

    if mine && theirs
      theirs <=> mine
    elsif mine
      -1
    elsif theirs
      1
    else
      attr <=> other.attr
    end
  end

  # Implicit string conversion: the elements of +to_a+ joined by '#'.
  def to_str
    to_a.join('#')
  end

  # The inherited string representation wrapped in parentheses.
  def to_s
    "(#{super})"
  end

end
63
+
64
+ end
65
+
66
+ end