lingo 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -1,98 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
28
- #++
29
-
30
- class Lingo
31
-
32
- # The Reportable module lets an including object set and increment named statistical counters.
33
- # The other methods read @counters/@prefix, which only init_reportable assigns here.
34
- module Reportable
35
- # Starts with an empty counter table (untouched counters read as 0) and an empty prefix.
36
- def init_reportable
37
- @counters = Hash.new(0)
38
- @prefix = ''
39
- end
40
- # Sets the prefix that #report puts in front of every counter name.
41
- def report_prefix(prefix)
42
- @prefix = prefix
43
- end
44
- # Increments the given counter by one.
45
- def inc(counter)
46
- @counters[counter] += 1
47
- end
48
- # Adds value to the given counter.
49
- def add(counter, value)
50
- @counters[counter] += value
51
- end
52
- # Overwrites the given counter with value.
53
- def set(counter, value)
54
- @counters[counter] = value
55
- end
56
- # Returns the current value of the given counter (0 if it was never touched).
57
- def get(counter)
58
- @counters[counter]
59
- end
60
- # Returns a new Hash of counter name => value; names become "<prefix>: <counter>" unless the prefix is ''.
61
- def report
62
- rep = Hash.new
63
- @counters.each_pair { |stat, value|
64
- name = (@prefix=='') ? stat : @prefix+': '+stat
65
- rep[name] = value
66
- }
67
- rep
68
- end
69
-
70
- end
71
-
72
- # The Cachable module lets an including object reuse cached results
73
- # for faster access.
74
-
75
- module Cachable
76
- # Starts with an empty cache; looking up an unknown key yields the Hash default, false.
77
- def init_cachable
78
- @cache = Hash.new(false)
79
- end
80
- # Returns true when a value (including nil) has been stored under key.
81
- def hit?(key)
82
- @cache.has_key?(key)
83
- end
84
- # Caches a dup of value (nil is cached as nil) under key and returns the original value.
85
- def store(key, value)
86
- res = value.nil? ? nil : value.dup
87
- @cache[key] = res
88
- value
89
- end
90
- # Returns a dup of the cached value (nil stays nil). NOTE(review): for a key never stored, the false default is what gets dup'ed - callers presumably check hit? first.
91
- def retrieve(key)
92
- value = @cache[key]
93
- value.nil? ? nil : value.dup
94
- end
95
-
96
- end
97
-
98
- end
@@ -1,285 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
28
- #++
29
-
30
- class Lingo
31
-
32
- # StringA is the base class for the other classes that make up the
33
- # object structure of a word. It provides a string (form)
34
- # that can be tagged with an attribute (attr).
35
-
36
- class StringA
37
-
38
- include Comparable
39
-
40
- attr_accessor :form, :attr
41
- # A nil form or attr is replaced by ''; attr defaults to '-' when omitted.
42
- def initialize(form, attr='-')
43
- @form = form || ''
44
- @attr = attr || ''
45
- end
46
- # Orders by form first and by attr when the forms are equal; anything sorts after nil.
47
- def <=>(other)
48
- return 1 if other.nil?
49
- if @form==other.form
50
- @attr<=>other.attr
51
- else
52
- @form<=>other.form
53
- end
54
- end
55
- # Renders as "form/attr".
56
- def to_s
57
- @form + '/' + @attr
58
- end
59
-
60
- def inspect
61
- to_s
62
- end
63
- # Hash and equality are both derived from to_s, so subclasses that override to_s are covered too.
64
- def hash
65
- to_s.hash
66
- end
67
- # Equal only when other has exactly the same class and the same to_s.
68
- def eql?(other)
69
- self.class.equal?(other.class) && to_s == other.to_s
70
- end
71
- # Replaces the == that Comparable would derive from <=> with the stricter eql? above.
72
- alias_method :==, :eql?
73
-
74
- end
75
-
76
- # The Token class, derived from StringA, is the container
77
- # for a single word of a text. The word is tagged with an attribute
78
- # that names the rule which identified this word.
79
- #
80
- # For example, if ruby.cfg contains a rule for recognising a number that is named NUM,
81
- # that name is attached to the token, e.g. Token.new('100', 'NUM') -> :100/NUM:
82
-
83
- class Token < StringA
84
- # Wraps the StringA representation in colons.
85
- def to_s
86
- ':' + super + ':'
87
- end
88
-
89
- end
90
-
91
- # The Lexical class, derived from StringA, is the container
92
- # for a base form of a word, tagged with its word class.
93
- #
94
- # For example, when a base form is read from the dictionary it is returned as a
95
- # Lexical object, e.g. Lexical.new('Rennen', 'S') -> (Rennen/S)
96
-
97
- class Lexical < StringA
98
- # Orders by attr first (see the rules inline) and by form when the attrs are equal.
99
- def <=>(other)
100
- #v TODO: v1.5.1
101
- return 1 unless other.is_a?(Lexical)
102
- #v
103
- if self.attr==other.attr
104
- # same attributes
105
- self.form<=>other.form
106
- else
107
- case # an empty attribute loses (sorts last)
108
- when self.attr=='' then 1
109
- when other.attr=='' then -1
110
- else # compare the attributes
111
- ss = LA_SORTORDER.index(self.attr) || -1 # LA_SORTORDER is defined elsewhere; presumably ' -weavsk'
112
- os = LA_SORTORDER.index(other.attr) || -1
113
- case
114
- when ss==-1 && os==-1 # both attributes unprivileged (and not equal)
115
- self.attr<=>other.attr
116
- when ss==-1 && os>-1 then 1
117
- when ss>-1 && os==-1 then -1
118
- when ss>-1 && os>-1 # both attributes privileged (and not equal): the higher LA_SORTORDER index sorts first
119
- os<=>ss
120
- end
121
- end
122
- end
123
- end
124
-
125
- #v TODO: v1.5.1
126
- def to_a
127
- [@form, @attr]
128
- end
129
- # Implicit string conversion as "form#attr" (note: differs from to_s).
130
- def to_str
131
- @form + '#' + @attr
132
- end
133
-
134
- #v
135
- def to_s
136
- '(' + super + ')'
137
- end
138
-
139
- end
140
-
141
- # The Word class bundles the specific properties of a word with the
142
- # methods needed to work with them.
143
-
144
- class Word < StringA
145
- # Builds a Word that already holds one Lexical with the same form and the given lex_attr.
146
- def self.new_lexical(form, attr, lex_attr)
147
- new(form, attr) << Lexical.new(form, lex_attr)
148
- end
149
-
150
- # form: exact representation of the original string as it was found in the
151
- # sentence, e.g. <tt>form = "RubyLing"</tt>
152
- #
153
- # lemma: result of the dictionary lookup. It represents the base form of the word.
154
- # There may be several possible base forms, e.g. +abgeschoben+ may have
155
- # as its base form the _adjective_ +abgeschoben+ or else the _verb_
156
- # +abschieben+.
157
- #
158
- # <tt>lemma = [['abgeschoben', '#a'], ['abschieben', '#v']]</tt>.
159
- #
160
- # <b>Note: the lemma is not filled in by the Word class itself but externally
161
- # by the Dictionary class</b> (NOTE(review): "lemma" presumably means what this class keeps in @lexicals - confirm)
162
-
163
- def initialize(form, attr=WA_UNSET)
164
- super
165
- @lexicals = Array.new
166
- end
167
- # Returns the lexicals; with compound_parts == false on a WA_KOMPOSITUM word, only those whose attr is LA_KOMPOSITUM.
168
- def lexicals(compound_parts = true)
169
- if !compound_parts && attr == WA_KOMPOSITUM
170
- @lexicals.select { |lex| lex.attr == LA_KOMPOSITUM }
171
- else
172
- @lexicals
173
- end
174
- end
175
- # Replaces the lexicals with a sorted, duplicate-free copy of the given Array; raises for any other argument.
176
- def lexicals=(lexis)
177
- if lexis.is_a?(Array)
178
- @lexicals = lexis.sort.uniq
179
- else
180
- raise 'Falscher Typ bei Zuweisung'
181
- end
182
- end
183
- # Returns the attr of every lexical selected by #lexicals(compound_parts).
184
- def attrs(compound_parts = true)
185
- lexicals(compound_parts).map { |lex| lex.attr }
186
- end
187
-
188
- # for compounds: number of parts, always 1 here (presumably overridden elsewhere - confirm)
189
- def parts
190
- 1
191
- end
192
- # Size of the smallest part, which here is the size of the whole form.
193
- def min_part_size
194
- @form.size
195
- end
196
-
197
- # Returns the lexicals whose word class matches the regexp given as parameter (a non-Regexp
198
- # argument is passed to Regexp.new), e.g. <tt>word.get_class(/a/)</tt>; without lexicals it returns [self] or [] depending on the word's own attr
199
- def get_class(wc_re)
200
- wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
201
-
202
- unless @lexicals.empty?
203
- @lexicals.select { |lex| lex.attr =~ wc_re }
204
- else
205
- attr =~ wc_re ? [self] : []
206
- end
207
- end
208
- # Returns the form of the first lexical for a WA_IDENTIFIED word, otherwise the word's own form.
209
- def norm
210
- if @attr == WA_IDENTIFIED
211
- lexicals[0].form
212
- else
213
- @form
214
- end
215
- end
216
- # Returns the first lexical matching LA_KOMPOSITUM for a WA_KOMPOSITUM word, otherwise nil.
217
- def compo_form
218
- if @attr==WA_KOMPOSITUM
219
- get_class(LA_KOMPOSITUM)[0]
220
- else
221
- nil
222
- end
223
- end
224
- # True when attr is WA_UNKNOWN or WA_UNKMULPART.
225
- def unknown?
226
- [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
227
- end
228
- # Appends a single Lexical or an Array of them; any other argument is ignored. Returns self so calls can be chained.
229
- def <<(other)
230
- case other
231
- when Lexical then @lexicals << other
232
- when Array then @lexicals += other
233
- end
234
- self
235
- end
236
- # Orders by form, then attr, then the lexicals arrays; anything sorts after nil.
237
- def <=>(other)
238
- return 1 if other.nil?
239
- if @form==other.form
240
- if @attr==other.attr
241
- @lexicals<=>other.lexicals
242
- else
243
- @attr<=>other.attr
244
- end
245
- else
246
- @form<=>other.form
247
- end
248
- end
249
- # Renders as "<form|attr = [lexicals]>"; the attr part is omitted for WA_IDENTIFIED, the lexicals part when there are none.
250
- def to_s
251
- s = '<' + @form
252
- s << '|' + @attr unless @attr==WA_IDENTIFIED
253
- s << ' = ' + @lexicals.inspect unless @lexicals.empty?
254
- s << '>'
255
- end
256
-
257
- end
258
-
259
- class AgendaItem
260
- # Comparable value object pairing a command (cmd) with a parameter (param).
261
- include Comparable
262
-
263
- attr_reader :cmd, :param
264
- # A nil cmd or param is replaced by ''; param defaults to '' when omitted.
265
- def initialize(cmd, param='')
266
- @cmd = cmd || ''
267
- @param = param || ''
268
- end
269
- # Orders by cmd first and by param when the cmds are equal; sorts after anything that is not an AgendaItem.
270
- def <=>(other)
271
- return 1 unless other.is_a?(AgendaItem)
272
- if self.cmd==other.cmd
273
- self.param<=>other.param
274
- else
275
- self.cmd<=>other.cmd
276
- end
277
- end
278
- # Renders as *CMD('param') with the command upcased.
279
- def inspect
280
- "*#{cmd.upcase}('#{param}')"
281
- end
282
-
283
- end
284
-
285
- end
@@ -1,40 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
28
- #++
29
-
30
- require 'unicode'
31
-
32
- class String
33
- # Reopens the core String class; the built-in downcase stays reachable under the alias below.
34
- alias_method :_lingo_original_downcase, :downcase
35
- # Delegates to Unicode.downcase (presumably for non-ASCII case mapping - confirm); takes no arguments.
36
- def downcase
37
- Unicode.downcase(self)
38
- end
39
-
40
- end