lingo 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -0,0 +1,173 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
class Dictionary

  include Cachable
  include Reportable

  # Builds a dictionary on top of the lexical sources named in +config+.
  #
  # config:: Attendee-specific options. Must contain the key 'source' (a
  #          list of source ids, each resolved via +lingo.lexical_hash+)
  #          and may contain 'mode'. Every source is consulted when 'mode'
  #          is missing or equals 'all' (case-insensitively); otherwise a
  #          lookup stops at the first source that yields a hit.
  # lingo::  Object providing +dictionary_config+, +lexical_hash+ and
  #          +dictionaries+; the new dictionary registers itself in
  #          +lingo.dictionaries+.
  #
  # Raises ArgumentError when 'source' is missing.
  def initialize(config, lingo)
    unless config.has_key?('source')
      raise ArgumentError, 'Required parameter `source\' missing.'
    end

    init_cachable
    init_reportable

    @suffixes, @infixes = [], []

    if suffix = lingo.dictionary_config['suffix']
      suffix.each { |t, s|
        # Lowercase a copy: the strings belong to the shared language
        # configuration and must not be modified in place (they may also
        # be frozen, in which case an in-place downcase would raise).
        t = t.downcase

        s.split.each { |suf|
          # Each rule reads "<ending>/<replacement>"; a missing
          # replacement is recorded as '*'.
          su, ex = suf.split('/')

          # Rules of type 'f' are infix rules, all others suffix rules.
          (t == 'f' ? @infixes : @suffixes) << [
            Regexp.new(su + '$', Regexp::IGNORECASE), ex || '*', t
          ]
        }
      }
    end

    @sources     = config['source'].map { |src| lingo.lexical_hash(src) }
    @all_sources = config['mode'].nil? || config['mode'].downcase == 'all'

    lingo.dictionaries << self
  end

  # Closes every underlying source.
  def close
    @sources.each(&:close)
  end

  # Returns this dictionary's own report, updated with the report of each
  # underlying source.
  def report
    super.tap { |rep| @sources.each { |src| rep.update(src.report) } }
  end

  # _dic_.find_word( _aString_ ) -> _aNewWord_
  #
  # Turns the string into a Word and looks it up in the dictionary.
  #
  # Results are cached under the lowercased string. On a cache hit the
  # retrieved word gets its form set to +str+ before it is returned.
  # Otherwise a new Word is created as WA_UNKNOWN and, if
  # #select_with_suffix finds lexicals, marked WA_IDENTIFIED and given
  # those lexicals. The return value on a cache miss is whatever +store+
  # returns (presumably the stored word -- see Cachable).
  def find_word(str)
    if hit?(key = str.downcase)
      inc('cache hits')
      return retrieve(key).tap { |word| word.form = str }
    end

    word = Word.new(str, WA_UNKNOWN)

    unless (lexicals = select_with_suffix(str)).empty?
      word.lexicals = lexicals
      word.attr = WA_IDENTIFIED
    end

    store(key, word)
  end

  # Collects the dictionary entries found for the forms of +obj+'s
  # lexicals (or for +obj+ itself when it has no lexicals and reports
  # itself as unknown).
  #
  # Lexicals that are synonyms themselves are skipped; for a compound
  # word only its compound lexicals are looked up. Entries matching the
  # Database::KEY_REF reference pattern are left out of the result.
  def find_synonyms(obj)
    lex = obj.lexicals
    lex = [obj] if lex.empty? && obj.unknown?

    # multiworder optimization
    ref = %r{\A#{Regexp.escape(Database::KEY_REF)}\d+}o

    lex.each_with_object([]) { |l, s|
      next if l.attr == LA_SYNONYM
      next if l.attr != LA_KOMPOSITUM && obj.attr == WA_KOMPOSITUM

      select(l.form).each { |y| s << y unless y =~ ref }
    }
  end

  # _dic_.select( _aString_ ) -> _ArrayOfLexicals_
  #
  # Searches the sources in order and returns the hits of the first
  # source that has any (+mode = first+), or the hits of all sources
  # (+mode = all+). The result is sorted and free of duplicates.
  def select(str)
    @sources.each_with_object([]) { |src, lex|
      l = src[str] or next
      lex.concat(l)
      break lex unless @all_sources
    }.tap { |lex| lex.sort!; lex.uniq! }
  end

  # _dic_.select_with_suffix( _aString_ ) -> _ArrayOfLexicals_
  #
  # Like #select, but when the string itself yields nothing, also looks
  # up the candidates obtained by applying the suffix rules to it.
  def select_with_suffix(str)
    select_with_affix(:suffix, str)
  end

  # _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
  #
  # Like #select, but when the string itself yields nothing, also looks
  # up the candidates obtained by applying the infix rules to its ending.
  def select_with_infix(str)
    select_with_affix(:infix, str)
  end

  # _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
  #
  # Returns all lexicals that the suffix rules can derive from the
  # ending of the string:
  #
  #   dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
  def suffix_lexicals(str)
    affix_lexicals(:suffix, str)
  end

  # _dic_.infix_lexicals( _aString_ ) -> _ArrayOfLexicals_
  #
  # Returns all lexicals that the infix rules can derive from the
  # ending of the string.
  def infix_lexicals(str)
    affix_lexicals(:infix, str)
  end

  private

  # Looks up +str+ directly; only if that yields nothing, looks up every
  # candidate produced by the +affix+ rules (+:suffix+ or +:infix+).
  # For suffix rules a hit is accepted only when its attribute equals
  # the attribute of the rule-derived candidate; infix hits are accepted
  # unconditionally. Hits are appended in lookup order.
  def select_with_affix(affix, str)
    select(str).tap { |l|
      if l.empty?
        affix_lexicals(affix, str).each { |a| select(a.form).each { |b|
          l << b if affix != :suffix || a.attr == b.attr
        } }
      end
    }
  end

  # Applies every rule of the given kind to +str+. For each rule whose
  # pattern matches, a Lexical is built from the text before the match,
  # the rule's replacement (nothing for '*') and the text after the
  # match, tagged with the rule's type.
  def affix_lexicals(affix, str)
    instance_variable_get("@#{affix}es").each_with_object([]) { |(r, e, t), l|
      l << Lexical.new("#{$`}#{e == '*' ? '' : e}#{$'}", t) if str =~ r
    }
  end

end
170
+
171
+ end
172
+
173
+ end
@@ -0,0 +1,211 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
# The Grammar class bundles grammatical specialities of a language. At
# present it hosts compound recognition, available via #find_compositum.
# A Grammar is initialized just like a Dictionary; the dictionary set up
# during initialization is the basis for recognizing the compound parts.

class Grammar

  include Cachable
  include Reportable

  # Splits a string at its last hyphen: $1 is everything before it,
  # $2 the hyphen-free remainder.
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}

  # initialize(config, lingo) -> _Grammar_
  #
  # config:: Attendee-specific parameters, handed on to Dictionary.new.
  # lingo::  Object providing +dictionary_config+; its 'compositum'
  #          section supplies the compound settings read below. A missing
  #          section is treated as empty, so all defaults apply.
  def initialize(config, lingo)
    init_cachable
    init_reportable

    @dic = Dictionary.new(config, lingo)

    cfg = lingo.dictionary_config['compositum'] || {}

    # Only strings longer than this many characters are examined at all
    # (default 8).
    @min_word_size = (cfg['min-word-size'] || 8).to_i

    # The average length of the compound parts must be at least this
    # many characters (default 4), otherwise the compound is invalid.
    @min_avg_part_size = (cfg['min-avg-part-size'] || 4).to_i

    # The shortest compound part must be at least this many characters
    # long (default 1).
    @min_part_size = (cfg['min-part-size'] || 1).to_i

    # A compound may consist of at most this many parts (default 4).
    @max_parts = (cfg['max-parts'] || 4).to_i

    # The word class of a compound part can be marked separately to tell
    # it apart from the word classes of ordinary words, e.g.
    # Hausmeister => ['haus/s', 'meister/s'] or, with
    # append-wordclass = '+', Hausmeister => ['haus/s+', 'meister/s+'].
    @append_wc = cfg.fetch('append-wordclass', '')

    # Certain word class sequences can be rejected as invalid compounds,
    # e.g. two adjectives do not form a compound: skip-sequences = ['aa'].
    @sequences = cfg.fetch('skip-sequences', []).map(&:downcase)
  end

  # Closes the underlying dictionary.
  def close
    @dic.close
  end

  # Returns this grammar's own report, updated with the dictionary's.
  def report
    super.update(@dic.report)
  end

  # find_compositum(str) -> word              if level == 1
  # find_compositum(str) -> [lex, sta, seq]   if level != 1
  #
  # find_compositum works on several levels because it is also called
  # recursively; a level greater than 1 denotes a recursive call.
  #
  # On level 1 the result is cached under the lowercased string and a
  # Word is returned: WA_KOMPOSITUM with the recognized lexicals (the
  # non-compound ones re-tagged with the configured word class suffix)
  # or WA_UNKNOWN. On deeper levels the raw triple is returned, or
  # [[], [], ''] when nothing valid was found.
  def find_compositum(str, level = 1, tail = false)
    key, top, empty = str.downcase, level == 1, [[], [], '']

    if top && hit?(key)
      inc('cache hits')
      return retrieve(key)
    end

    com = Word.new(str, WA_UNKNOWN)

    # Strings that are not longer than min-word-size are not examined.
    unless str.length > @min_word_size
      inc('String zu kurz')
      return top ? com : empty
    end

    inc('Komposita geprüft')

    res = permute_compositum(key, level, tail)
    val = !(lex = res.first).empty? && valid?(str, *res[1..-1])

    if top
      if val
        inc('Komposita erkannt')

        com.attr = WA_KOMPOSITUM
        com.lexicals = lex.map { |l|
          l.attr == LA_KOMPOSITUM ? l :
            Lexical.new(l.form, l.attr + @append_wc)
        }
      end

      store(key, com)
    else
      val ? res : empty
    end
  end

  # permute_compositum( _aString_, level, tail ) -> [lex, sta, seq]
  #
  # A string with a hyphen is split at its last hyphen only. Any other
  # string is split at every position from left to right; the first
  # split whose sorted lexicals do not end in a "take it as is" entry
  # wins immediately. Splits that do end in such an entry are only
  # remembered as fallbacks, and the first of them is returned if no
  # better split turns up. Returns [[], [], ''] when nothing was found.
  def permute_compositum(str, level, tail)
    return test_compositum($1, '-', $2, level, tail) if str =~ HYPHEN_RE

    # Fallback candidates are kept per call, so that leftovers of an
    # earlier call can never be returned for a different string.
    sug, len = [], str.length

    1.upto(len - 1) { |i|
      res = test_compositum(str[0, i], '', str[i, len], level, tail)

      unless (lex = res.first).empty?
        return res unless lex.last.attr == LA_TAKEITASIS
        sug << res
      end
    }

    sug.first || [[], [], '']
  end

  # test_compositum(fstr, infix, bstr, level, tail) -> [lex, sta, seq]
  #
  # Tests a string that has already been split into a front part +fstr+
  # and a back part +bstr+ (joined by +infix+, either '' or '-') for
  # being a compound. Returns the sorted lexicals, the list of part
  # lengths and the joined word class sequence, or [[], [], ''] if one
  # of the parts cannot be resolved.
  def test_compositum(fstr, infix, bstr, level, tail)
    sta, seq, empty = [fstr.length, bstr.length], %w[? ?], [[], [], '']

    if !(blex = @dic.select_with_suffix(bstr)).sort!.empty?
      # 1. Back part is a word, possibly after suffix resolution; its
      #    written form is kept when +tail+ is set, otherwise the form
      #    of the first lexical is used.
      bform, seq[1] = tail ? bstr : blex.first.form, blex.first.attr
    elsif tail && !(blex = @dic.select_with_infix(bstr)).sort!.empty?
      # 2. Back part is a word with infix; only tried when +tail+ is set
      #    (which this class does when analysing a front part).
      bform, seq[1] = bstr, blex.first.attr
    elsif infix == '-'
      blex, bsta, bseq = find_compositum(bstr, level + 1, tail)

      if !blex.sort!.empty?
        # 3. Back part is a compound itself
        bform, seq[1], sta[1..-1] = blex.first.form, bseq, bsta
      else
        # 4. Take it as is
        blex = [Lexical.new(bform = bstr, seq[1] = LA_TAKEITASIS)]
      end
    else
      return empty
    end

    if !(flex = @dic.select_with_infix(fstr)).sort!.empty?
      # 1. Front part is a word, possibly with infix
      fform, seq[0] = fstr, flex.first.attr
    else
      flex, fsta, fseq = find_compositum(fstr, level + 1, true)

      if !flex.sort!.empty?
        # 2. Front part is a compound itself
        fform, seq[0], sta[0..0] = flex.first.form, fseq, fsta
      elsif infix == '-'
        # 3. Take it as is
        flex = [Lexical.new(fform = fstr, seq[0] = LA_TAKEITASIS)]
      else
        return empty
      end
    end

    # Merge both parts, drop the compound entries of the sub-parts and
    # add one compound entry for the whole string.
    flex.concat(blex).delete_if { |l| l.attr == LA_KOMPOSITUM }.
      push(Lexical.new(fform + infix + bform, LA_KOMPOSITUM)).sort!

    [flex, sta, seq.join]
  end

  private

  # Checks a split against the configured limits: number of parts,
  # length of the shortest part, average part length (integer division)
  # and the list of word class sequences to skip.
  #
  # str:: the examined string
  # sta:: list of part lengths
  # seq:: word class sequence of the parts
  def valid?(str, sta, seq)
    sta.size <= @max_parts &&
      sta.min >= @min_part_size &&
      str.length / sta.size >= @min_avg_part_size &&
      (@sequences.empty? || !@sequences.include?(seq))
  end

end
208
+
209
+ end
210
+
211
+ end
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
# The Lexical class, derived from WordForm, is the container for a base
# form of a word that is tagged with its word class.
#
# When, for instance, a base form is read from the dictionary, it is
# returned as a Lexical object, e.g. Lexical.new('Rennen', 'S') -> (rennen/s)

class Lexical < WordForm

  # Orders lexicals by word class first and by form second.
  #
  # Anything that is not a Lexical sorts before a Lexical. With equal
  # word classes the forms decide. Otherwise an empty word class sorts
  # last; word classes listed in LA_SORTORDER sort before unlisted ones
  # and, among themselves, by descending position in that list; two
  # unlisted word classes are compared as plain values.
  def <=>(other)
    return 1 unless other.is_a?(self.class)
    return form <=> other.form if attr == other.attr

    return 1 if attr.empty?
    return -1 if other.attr.empty?

    own_rank   = LA_SORTORDER.index(attr)
    other_rank = LA_SORTORDER.index(other.attr)

    if own_rank && other_rank
      other_rank <=> own_rank
    elsif own_rank
      -1
    elsif other_rank
      1
    else
      attr <=> other.attr
    end
  end

  # Implicit string conversion: the elements of +to_a+ joined by '#'.
  def to_str
    parts = to_a
    parts.join('#')
  end

  # The inherited string representation wrapped in parentheses.
  def to_s
    inner = super
    "(#{inner})"
  end

end
63
+
64
+ end
65
+
66
+ end