lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
module Language
|
|
30
|
+
|
|
31
|
+
class Dictionary
|
|
32
|
+
|
|
33
|
+
include Cachable
|
|
34
|
+
include Reportable
|
|
35
|
+
|
|
36
|
+
# Builds a dictionary over the lexical sources named in +config+ and
# registers it with +lingo+.
#
# config - Hash-like options; must contain 'source' (a list of source
#          identifiers) and may contain 'mode'. A missing mode or 'all'
#          (case-insensitive) makes #select consult every source.
# lingo  - object providing +dictionary_config+, +lexical_hash+ and
#          +dictionaries+.
#
# Raises ArgumentError if +config+ has no 'source' key.
def initialize(config, lingo)
  unless config.has_key?('source')
    raise ArgumentError, 'Required parameter `source\' missing.'
  end

  init_cachable
  init_reportable

  @suffixes, @infixes = [], []

  suffix = lingo.dictionary_config['suffix']

  if suffix
    suffix.each { |t, s|
      # Lower-case a copy instead of mutating the configuration in place, so
      # the caller's data stays untouched and frozen strings are accepted.
      type = t.downcase

      # Entries of type 'f' are infix rules; everything else is a suffix rule.
      rules = type == 'f' ? @infixes : @suffixes

      s.split.each { |suf|
        # Each rule reads "<ending>/<replacement>"; a missing replacement
        # is recorded as '*'.
        su, ex = suf.split('/')

        rules << [Regexp.new(su + '$', 'i'), ex || '*', type]
      }
    }
  end

  @sources     = config['source'].map { |src| lingo.lexical_hash(src) }
  @all_sources = config['mode'].nil? || config['mode'].downcase == 'all'

  lingo.dictionaries << self
end
|
|
65
|
+
|
|
66
|
+
# Closes every lexical source held by this dictionary.
#
# Returns the list of sources.
def close
  @sources.each { |source| source.close }
end
|
|
69
|
+
|
|
70
|
+
# Collects the report of this dictionary (as produced by +super+) and merges
# the report of every source into it.
#
# Returns the merged report object.
def report
  rep = super
  @sources.each { |source| rep.update(source.report) }
  rep
end
|
|
73
|
+
|
|
74
|
+
# _dic_.find_word( _aString_ ) -> _aNewWord_
|
|
75
|
+
#
|
|
76
|
+
# Erstellt aus dem String ein Wort und sucht nach diesem im Wörterbuch.
|
|
77
|
+
# Looks up +str+ and returns a word object for it.
#
# The cache is keyed by the lower-cased string. On a cache hit the cached
# word is returned with its form reset to +str+. Otherwise a new word is
# created as WA_UNKNOWN, upgraded to WA_IDENTIFIED when #select_with_suffix
# yields lexicals, and stored in the cache.
#
# Returns the word (the result of +retrieve+ or +store+).
def find_word(str)
  key = str.downcase

  if hit?(key)
    inc('cache hits')

    cached = retrieve(key)
    cached.form = str
    return cached
  end

  word     = Word.new(str, WA_UNKNOWN)
  lexicals = select_with_suffix(str)

  unless lexicals.empty?
    word.lexicals = lexicals
    word.attr     = WA_IDENTIFIED
  end

  store(key, word)
end
|
|
92
|
+
|
|
93
|
+
# Collects dictionary entries for the lexicals of +obj+.
#
# The lexicals of +obj+ are used as lookup candidates; if there are none and
# +obj+ is unknown, +obj+ itself is the only candidate. Candidates that are
# already synonyms are skipped, and for a compound word only its compound
# lexicals are looked up. Entries matching the multiworder key reference
# pattern are left out.
#
# Returns an Array of the entries found.
def find_synonyms(obj)
  candidates = obj.lexicals
  candidates = [obj] if candidates.empty? && obj.unknown?

  # multiworder optimization
  ref = %r{\A#{Regexp.escape(Database::KEY_REF)}\d+}o

  synonyms = []

  candidates.each { |lexical|
    next if lexical.attr == LA_SYNONYM
    next if lexical.attr != LA_KOMPOSITUM && obj.attr == WA_KOMPOSITUM

    select(lexical.form).each { |entry|
      synonyms << entry unless entry =~ ref
    }
  }

  synonyms
end
|
|
107
|
+
|
|
108
|
+
# _dic_.select( _aString_ ) -> _ArrayOfLexicals_
|
|
109
|
+
#
|
|
110
|
+
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+)
|
|
111
|
+
# Looks +str+ up in the sources, in order.
#
# With @all_sources set, the hits of all sources are combined; otherwise the
# search stops after the first source that has an entry.
#
# Returns a sorted Array of hits without duplicates (empty if none).
def select(str)
  found = []

  @sources.each { |source|
    hits = source[str]
    next unless hits

    found.concat(hits)
    break unless @all_sources
  }

  found.sort!
  found.uniq!
  found
end
|
|
118
|
+
|
|
119
|
+
# _dic_.select_with_suffix( _aString_ ) -> _ArrayOfLexicals_
|
|
120
|
+
#
|
|
121
|
+
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
|
122
|
+
# Sucht dabei auch Wörter, die um wortklassenspezifische Suffixe bereinigt wurden.
|
|
123
|
+
# Looks up +str+ and, if nothing is found directly, the forms derived from it
# through the configured suffix rules. Delegates to #select_with_affix with
# the +:suffix+ rule set.
#
# Returns an Array of lexicals.
def select_with_suffix(str)
  rule_set = :suffix
  select_with_affix(rule_set, str)
end
|
|
126
|
+
|
|
127
|
+
# _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
|
|
128
|
+
#
|
|
129
|
+
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
|
130
|
+
# Sucht dabei auch Wörter, die eine Fugung am Ende haben.
|
|
131
|
+
# Looks up +str+ and, if nothing is found directly, the forms derived from it
# through the configured infix rules. Delegates to #select_with_affix with
# the +:infix+ rule set.
#
# Returns an Array of lexicals.
def select_with_infix(str)
  rule_set = :infix
  select_with_affix(rule_set, str)
end
|
|
134
|
+
|
|
135
|
+
# _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
|
136
|
+
#
|
|
137
|
+
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
|
138
|
+
#
|
|
139
|
+
# dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
|
|
140
|
+
# Returns every lexical that the configured suffix rules can derive from the
# ending of +str+. Delegates to #affix_lexicals with the +:suffix+ rule set.
def suffix_lexicals(str)
  rule_set = :suffix
  affix_lexicals(rule_set, str)
end
|
|
143
|
+
|
|
144
|
+
# _dic_.infix_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
|
145
|
+
#
|
|
146
|
+
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
|
147
|
+
# Returns every lexical that the configured infix rules can derive from the
# ending of +str+. Delegates to #affix_lexicals with the +:infix+ rule set.
def infix_lexicals(str)
  rule_set = :infix
  affix_lexicals(rule_set, str)
end
|
|
150
|
+
|
|
151
|
+
private
|
|
152
|
+
|
|
153
|
+
# Looks up +str+ directly; only when that yields nothing are the forms
# derived by the +affix+ rules (+:suffix+ or +:infix+) looked up instead.
#
# For +:suffix+ a derived hit is kept only if its attribute equals the
# attribute of the rule-generated candidate; for any other affix kind every
# hit is kept.
#
# Returns the Array of lexicals found (possibly empty).
def select_with_affix(affix, str)
  found = select(str)
  return found unless found.empty?

  affix_lexicals(affix, str).each { |candidate|
    select(candidate.form).each { |hit|
      found << hit if affix != :suffix || candidate.attr == hit.attr
    }
  }

  found
end
|
|
162
|
+
|
|
163
|
+
# Applies every rule of the given kind to +str+.
#
# affix - :suffix or :infix; selects @suffixes or @infixes.
# str   - the string to test against each rule's pattern.
#
# For each matching rule the matched part is replaced by the rule's
# replacement (nothing if the replacement is '*') and a Lexical with the
# rule's type is created from the result.
#
# Returns an Array of Lexical objects, one per matching rule.
def affix_lexicals(affix, str)
  rules = instance_variable_get("@#{affix}es")

  rules.each_with_object([]) { |(pattern, replacement, type), found|
    next unless str =~ pattern

    match  = Regexp.last_match
    ending = replacement == '*' ? '' : replacement

    found << Lexical.new("#{match.pre_match}#{ending}#{match.post_match}", type)
  }
end
|
|
168
|
+
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
module Language
|
|
30
|
+
|
|
31
|
+
# Die Klasse Grammar beinhaltet grammatikalische Spezialitäten einer Sprache. Derzeit findet die
|
|
32
|
+
# Kompositumerkennung hier ihren Platz, die mit der Methode find_compositum aufgerufen werden kann.
|
|
33
|
+
# Die Klasse Grammar wird genau wie ein Dictionary initialisiert. Das bei der Initialisierung angegebene Wörterbuch ist Grundlage
|
|
34
|
+
# für die Erkennung der Kompositumteile.
|
|
35
|
+
|
|
36
|
+
class Grammar
|
|
37
|
+
|
|
38
|
+
include Cachable
|
|
39
|
+
include Reportable
|
|
40
|
+
|
|
41
|
+
HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
|
|
42
|
+
|
|
43
|
+
# initialize(config, lingo) -> _Grammar_
|
|
44
|
+
# config = Attendee-spezifische Parameter
|
|
45
|
+
# lingo = Lingo-Instanz; ihre dictionary_config liefert den Abschnitt 'compositum'
|
|
46
|
+
# Sets up the compound-recognition grammar.
#
# config - attendee-specific options, passed on unchanged to Dictionary.new.
# lingo  - object providing +dictionary_config+; its optional 'compositum'
#          section tunes the limits below.
def initialize(config, lingo)
  init_cachable
  init_reportable

  @dic, @suggestions = Dictionary.new(config, lingo), []

  # Fall back to an empty section so that every setting below takes its
  # default when the language definition has no 'compositum' entry.
  cfg = lingo.dictionary_config['compositum'] || {}

  # Minimum word size (default 8) a string needs before it is examined.
  # NOTE(review): find_compositum only examines strings *longer* than this
  # value, although the original comment said "at least" - confirm boundary.
  @min_word_size = (cfg['min-word-size'] || 8).to_i

  # The average length of the compound parts must be at least this many
  # characters (default 4), otherwise the compound is not valid.
  @min_avg_part_size = (cfg['min-avg-part-size'] || 4).to_i

  # The shortest compound part must be at least this long (default 1).
  @min_part_size = (cfg['min-part-size'] || 1).to_i

  # A compound may consist of at most this many parts (default 4).
  @max_parts = (cfg['max-parts'] || 4).to_i

  # The word class of a compound part can be marked separately to tell it
  # apart from word classes of ordinary words, e.g. Hausmeister =>
  # ['haus/s', 'meister/s'] or, with append-wordclass = '+', Hausmeister =>
  # ['haus/s+', 'meister/s+'].
  @append_wc = cfg.fetch('append-wordclass', '')

  # Certain sequences can be rejected as invalid compounds, e.g. a compound
  # of two adjectives is not a compound: skip-sequences = ['aa'].
  @sequences = cfg.fetch('skip-sequences', []).map(&:downcase)
end
|
|
80
|
+
|
|
81
|
+
# Closes the dictionary this grammar was built on.
#
# Returns whatever the dictionary's +close+ returns.
def close
  dictionary = @dic
  dictionary.close
end
|
|
84
|
+
|
|
85
|
+
# Merges the report of the underlying dictionary into this grammar's own
# report (as produced by +super+).
#
# Returns the result of the merge.
def report
  own = super
  own.update(@dic.report)
end
|
|
88
|
+
|
|
89
|
+
# find_compositum(str) -> word wenn level=1
|
|
90
|
+
# find_compositum(str) -> [lex, sta] wenn level!=1
|
|
91
|
+
#
|
|
92
|
+
# find_compositum arbeitet in verschiedenen Leveln, da die Methode auch rekursiv aufgerufen wird. Ein Level größer 1
|
|
93
|
+
# entspricht daher einem rekursiven Aufruf
|
|
94
|
+
# Tries to recognise +str+ as a compound.
#
# str   - the string to analyse; the cache key and the permutation input are
#         its lower-cased form.
# level - recursion depth; 1 marks the outermost call.
# tail  - flag passed through to permute_compositum.
#
# At level 1 the result is a Word (served from / stored in the cache), marked
# WA_KOMPOSITUM with its lexicals when a valid split was found. Lexicals that
# are not LA_KOMPOSITUM get @append_wc appended to their attribute.
# At deeper levels the result is [lex, sta, seq] from permute_compositum, or
# [[], [], ''] when no valid split exists.
#
# Strings whose length does not exceed @min_word_size are not examined.
def find_compositum(str, level = 1, tail = false)
  key      = str.downcase
  top      = level == 1
  no_match = [[], [], '']

  if top && hit?(key)
    inc('cache hits')
    return retrieve(key)
  end

  com = Word.new(str, WA_UNKNOWN)

  if str.length <= @min_word_size
    inc('String zu kurz')
    return top ? com : no_match
  end

  inc('Komposita geprüft')

  res = permute_compositum(key, level, tail)
  lex = res.first
  val = !lex.empty? && valid?(str, *res[1..-1])

  unless top
    return val ? res : no_match
  end

  if val
    inc('Komposita erkannt')

    com.attr     = WA_KOMPOSITUM
    com.lexicals = lex.map { |l|
      if l.attr == LA_KOMPOSITUM
        l
      else
        Lexical.new(l.form, l.attr + @append_wc)
      end
    }
  end

  store(key, com)
end
|
|
130
|
+
|
|
131
|
+
# permute_compositum( _aString_ ) -> [lex, sta, seq]
|
|
132
|
+
# Tries every split of +str+ into a front and a back part.
#
# A string matching HYPHEN_RE is split once at that hyphen. Otherwise every
# split position is tested in turn: the first split whose last lexical is not
# LA_TAKEITASIS wins immediately; splits ending in LA_TAKEITASIS are only kept
# as suggestions, and the first suggestion is used if nothing better turns up.
#
# Suggestions are collected per call. The previous implementation kept them in
# @suggestions[level] and only cleared that list on the fall-through path, so
# an early return could leave a stale suggestion behind that a later call for
# a different string would then return.
#
# Returns [lex, sta, seq], or [[], [], ''] if no split succeeds.
def permute_compositum(str, level, tail)
  return test_compositum($1, '-', $2, level, tail) if str =~ HYPHEN_RE

  sug, len = [], str.length

  1.upto(len - 1) { |i|
    res = test_compositum(str[0, i], '', str[i, len], level, tail)

    unless (lex = res.first).empty?
      return res unless lex.last.attr == LA_TAKEITASIS
      sug << res
    end
  }

  sug.empty? ? [[], [], ''] : sug.first
end
|
|
148
|
+
|
|
149
|
+
# test_compositum() -> [lex, sta, seq]
|
|
150
|
+
#
|
|
151
|
+
# Testet einen definiert zerlegten String auf Kompositum
|
|
152
|
+
# Tests one concrete split of a string (+fstr+ + +infix+ + +bstr+) for being
# a compound.
#
# fstr  - front part of the split
# infix - string placed between both parts in the resulting compound form;
#         the callers in this class pass '-' or ''
# bstr  - back part of the split
# level - recursion level; level + 1 is handed to find_compositum
# tail  - flag controlling how the back part is resolved (see NOTE below)
#
# Returns [lex, sta, seq]: the sorted lexicals of both parts plus a new
# LA_KOMPOSITUM lexical, the list of part lengths, and the joined sequence
# string; or [[], [], ''] when one of the parts cannot be resolved.
def test_compositum(fstr, infix, bstr, level, tail)
  # sta holds the part lengths, seq one attribute (or sub-sequence) per part.
  sta, seq, empty = [fstr.length, bstr.length], %w[? ?], [[], [], '']

  # --- back part ---
  if !(blex = @dic.select_with_suffix(bstr)).sort!.empty?
    # 1. Word w/ suffix
    # With +tail+ set the back part keeps its surface form, otherwise the
    # form of the first lexical is used.
    bform, seq[1] = tail ? bstr : blex.first.form, blex.first.attr
  elsif tail && !(blex = @dic.select_with_infix(bstr)).sort!.empty?
    # 2. Word w/ infix, unless tail part
    # NOTE(review): this branch requires +tail+ to be truthy; +tail+ is passed
    # as true for front parts in the recursive call below, so it presumably
    # means "another part follows" - confirm against the comment above.
    bform, seq[1] = bstr, blex.first.attr
  elsif infix == '-'
    blex, bsta, bseq = find_compositum(bstr, level + 1, tail)

    if !blex.sort!.empty?
      # 3. Compositum
      # Splice the nested part lengths in place of the single back length.
      bform, seq[1], sta[1..-1] = blex.first.form, bseq, bsta
    else
      # 4. Take it as is
      blex = [Lexical.new(bform = bstr, seq[1] = LA_TAKEITASIS)]
    end
  else
    return empty
  end

  # --- front part ---
  if !(flex = @dic.select_with_infix(fstr)).sort!.empty?
    # 1. Word w/ infix
    fform, seq[0] = fstr, flex.first.attr
  else
    flex, fsta, fseq = find_compositum(fstr, level + 1, true)

    if !flex.sort!.empty?
      # 2. Compositum
      # Splice the nested part lengths in place of the single front length.
      fform, seq[0], sta[0..0] = flex.first.form, fseq, fsta
    elsif infix == '-'
      # 3. Take it as is
      flex = [Lexical.new(fform = fstr, seq[0] = LA_TAKEITASIS)]
    else
      return empty
    end
  end

  # Merge both parts, drop compound lexicals of nested results and add the
  # compound lexical for the whole split.
  flex.concat(blex).delete_if { |l| l.attr == LA_KOMPOSITUM }.
    push(Lexical.new(fform + infix + bform, LA_KOMPOSITUM)).sort!

  [flex, sta, seq.join]
end
|
|
197
|
+
|
|
198
|
+
private
|
|
199
|
+
|
|
200
|
+
# Checks a split against the configured compound limits.
#
# str - the analysed string
# sta - Array with the length of every part
# seq - sequence string of the parts' attributes
#
# Returns true if the number of parts, the shortest part and the average part
# length (integer division) are within limits and +seq+ is not one of the
# sequences to skip; false otherwise.
def valid?(str, sta, seq)
  return false if sta.size > @max_parts
  return false if sta.min < @min_part_size
  return false if str.length / sta.size < @min_avg_part_size

  @sequences.empty? || !@sequences.include?(seq)
end
|
|
206
|
+
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
module Language
|
|
30
|
+
|
|
31
|
+
# Die Klasse Lexical, abgeleitet von der Klasse WordForm, stellt den Container
|
|
32
|
+
# für eine Grundform eines Wortes bereit, welches mit der Wortklasse versehen ist.
|
|
33
|
+
#
|
|
34
|
+
# Wird z.B. aus dem Wörterbuch eine Grundform gelesen, so wird dies in Form eines
|
|
35
|
+
# Lexical-Objektes zurückgegeben, z.B. Lexical.new('Rennen', 'S') -> (rennen/s)
|
|
36
|
+
|
|
37
|
+
class Lexical < WordForm
|
|
38
|
+
|
|
39
|
+
# Orders lexicals.
#
# Anything that is not of the receiver's class sorts before the receiver.
# Lexicals with equal attributes are ordered by form. Otherwise a lexical
# with an empty attribute sorts last; attributes listed in LA_SORTORDER sort
# before unlisted ones, a higher position in LA_SORTORDER sorting first; two
# unlisted attributes are compared directly.
#
# Returns -1, 0 or 1 (or whatever the underlying +<=>+ returns).
def <=>(other)
  return 1 unless other.is_a?(self.class)
  return form <=> other.form if attr == other.attr

  return 1 if attr.empty?
  return -1 if other.attr.empty?

  own_rank   = LA_SORTORDER.index(attr)
  other_rank = LA_SORTORDER.index(other.attr)

  if own_rank && other_rank
    other_rank <=> own_rank
  elsif own_rank
    -1
  elsif other_rank
    1
  else
    attr <=> other.attr
  end
end
|
|
53
|
+
|
|
54
|
+
# Implicit string conversion: the elements of +to_a+ joined by '#'.
def to_str
  to_a.join('#')
end
|
|
57
|
+
|
|
58
|
+
# String representation: the inherited +to_s+ wrapped in parentheses.
def to_s
  "(#{super})"
end
|
|
61
|
+
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
end
|