lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
|
@@ -1,158 +1,163 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
#--
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# Copyright (C) 2007
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
|
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
26
|
-
#
|
|
27
|
-
# Lex Lingo rules from here on
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
28
25
|
#++
|
|
29
26
|
|
|
30
27
|
class Lingo
|
|
31
28
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
29
|
+
class Attendee
|
|
30
|
+
|
|
31
|
+
# Die Hauptaufgabe des VectorFilter ist die Erstellung eines Dokumenten-Index-Vektor.
|
|
32
|
+
# Dabei werden die durch die anderen Attendees ermittelten Grundformen eines Wortes
|
|
33
|
+
# gespeichert und bei einem Datei- oder Record-Wechsel weitergeleitet. Der VectorFilter
|
|
34
|
+
# kann bestimmte Wortklassen filtern und die Ergebnisse in verschiedenen Arten aufbereiten.
|
|
35
|
+
# Dabei werden Funktionen wie das einfache Zählen der Häufigkeit innerhalb eines Dokuments,
|
|
36
|
+
# aber auch die Term-Frequenz und unterschiedliche Ausgabeformate unterstützt.
|
|
37
|
+
#
|
|
38
|
+
# === Mögliche Verlinkung
|
|
39
|
+
# Erwartet:: Daten vom Typ *Word*, z.B. von Abbreviator, Wordsearcher, Decomposer, Synonymer, Multiworder, Sequencer
|
|
40
|
+
# Erzeugt:: Daten vom Typ *String*, z.B. für Textwriter
|
|
41
|
+
#
|
|
42
|
+
# === Parameter
|
|
43
|
+
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
|
44
|
+
# Alle anderen Parameter müssen zwingend angegeben werden.
|
|
45
|
+
# <b>in</b>:: siehe allgemeine Beschreibung des Attendee
|
|
46
|
+
# <b>out</b>:: siehe allgemeine Beschreibung des Attendee
|
|
47
|
+
# <b><i>lexicals</i></b>:: (Standard: '[sy]' => die Wortklassen Substantiv und Synonyme werden gefiltert)
|
|
48
|
+
# Es können in eckige Klammern beliebige Wortklassen angegeben werden (siehe lib/strings.rb).
|
|
49
|
+
# Der Parameter wird als regulärer Ausdruck ausgewertet.
|
|
50
|
+
# <b><i>sort</i></b>:: (Standard: 'normal')
|
|
51
|
+
# Der Parameter +sort+ beeinflußt Verarbeitung und Ausgabeformat des VectorFilters.
|
|
52
|
+
# normal:: Jedes gefilterte Wort wird einmalig (keine Doppelnennungen!) in
|
|
53
|
+
# alphabetischer Reihenfolge in der Form "wort" ausgegeben.
|
|
54
|
+
# term_abs:: Jedes gefilterte Wort wird einmalig in absteigender Häufigkeit mit Angabe
|
|
55
|
+
# der absoluten Häufigkeit im Dokument in der Form "12 wort" ausgegeben.
|
|
56
|
+
# term_rel:: Jedes gefilterte Wort wird einmalig in absteigender Häufigkeit mit Angabe
|
|
57
|
+
# der relativen Häufigkeit im Dokument in der Form "0.1234 wort" ausgegeben.
|
|
58
|
+
# sto_abs:: Jedes gefilterte Wort wird einmalig in absteigender Häufigkeit mit Angabe
|
|
59
|
+
# der absoluten Häufigkeit im Dokument in der Form "wort {12}" ausgegeben.
|
|
60
|
+
# sto_rel:: Jedes gefilterte Wort wird einmalig in absteigender Häufigkeit mit Angabe
|
|
61
|
+
# der relativen Häufigkeit im Dokument in der Form "wort {0.1234}" ausgegeben.
|
|
62
|
+
# <b><i>skip</i></b>:: (Standard: TA_PUNCTUATION und TA_OTHER) Hiermit wird angegeben, welche Objekte nicht
|
|
63
|
+
# verarbeitet werden sollen. Die +skip+-Angabe bezieht sich auf das Attribut +attr+ von
|
|
64
|
+
# Token oder Word-Objekten.
|
|
65
|
+
#
|
|
66
|
+
# === Beispiele
|
|
67
|
+
# Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
|
|
68
|
+
# meeting:
|
|
69
|
+
# attendees:
|
|
70
|
+
# - text_reader: { out: lines, files: '$(files)' }
|
|
71
|
+
# - tokenizer: { in: lines, out: token }
|
|
72
|
+
# - word_searcher: { in: token, out: words, source: 'sys-dic' }
|
|
73
|
+
# - vector_filter: { in: words, out: filtr, sort: 'term_rel' }
|
|
74
|
+
# - debugger: { in: filtr, prompt: 'out>' }
|
|
75
|
+
# ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
|
|
76
|
+
# out> *FILE('test.txt')
|
|
77
|
+
# out> "0.28571 indexierung"
|
|
78
|
+
# out> *EOF('test.txt')
|
|
79
|
+
|
|
80
|
+
class VectorFilter < self

  protected

  # Reads this attendee's configuration and initialises its state.
  #
  # Keys read through get_key/get_array:
  # 'lexicals':: regular expression source, lower-cased (default '[sy]')
  # 'sort'::     output mode, lower-cased (default 'normal')
  # 'skip'::     attributes not counted as words, upper-cased
  #              (default TA_PUNCTUATION and TA_OTHER)
  # 'debug'::    debug expression (default false); 'prompt' is only read
  #              when 'debug' is set
  def init
    @lexis      = Regexp.new(get_key('lexicals', '[sy]').downcase)
    @sort       = get_key('sort', 'normal').downcase
    @skip       = get_array('skip', TA_PUNCTUATION + ',' + TA_OTHER).map { |s| s.upcase }
    @vectors    = []
    @word_count = 0

    @debug  = get_key('debug', false)
    @prompt = get_key('prompt', 'lex:) ') if @debug
  end

  # Handles control commands.
  #
  # STR_CMD_EOL calls skip_command. STR_CMD_FILE, STR_CMD_RECORD and
  # STR_CMD_EOF flush what has been collected so far: the raw debug lines
  # in debug mode, the formatted vector otherwise.
  #
  # +cmd+:: the command constant
  # +par+:: the command parameter (not used here)
  def control(cmd, par)
    case cmd
    when STR_CMD_EOL
      skip_command
    when STR_CMD_FILE, STR_CMD_RECORD, STR_CMD_EOF
      @debug ? @vectors.each { |vec| forward(vec) } : sendVector

      # Start every document with a fresh word count. Previously the
      # counter was only reset for 'sto_rel' (and not at all when the
      # vector was empty), so relative frequencies of later documents
      # were divided by an accumulated total.
      @word_count = 0
      @vectors.clear
    end
  end

  # Collects the lower-cased forms of all lexicals of +obj+ whose class
  # is selected by obj.get_class(@lexis), and counts the word unless its
  # attribute is listed in @skip. Objects that are not a Word are ignored.
  #
  # In debug mode the inspected object is collected instead.
  def process(obj)
    if @debug
      # NOTE(review): the 'debug' config value is evaluated as Ruby code.
      # Only use it with trusted configuration files.
      @vectors << "#{@prompt} #{obj.inspect}" if eval(@debug)
    elsif obj.is_a?(Word)
      @word_count += 1 unless @skip.include?(obj.attr)
      return if obj.lexicals.nil?

      lexis = obj.get_class(@lexis)
      lexis.each { |lex| @vectors << lex.form.downcase }
      add('Anzahl von Vektor-Wörtern', lexis.size)
    end
  end

  private

  # Formats the collected vector according to @sort and forwards each
  # resulting string. Does nothing when nothing was collected.
  #
  # 'normal' yields every form once, in sorted order. Every other mode
  # yields [form, count] pairs ordered by descending count, ties broken
  # by form.
  def sendVector
    return if @vectors.empty?

    add('Objekte gefiltert', @vectors.size)

    forms = @vectors.compact

    if @sort == 'normal'
      @vectors = forms.sort.uniq
    else
      counts = Hash.new(0)
      forms.each { |form| counts[form] += 1 }
      @vectors = counts.sort_by { |form, count| [-count, form] }
    end

    lines = @vectors.map { |vec|
      # In 'normal' mode vec is a plain String, so count is nil and unused.
      form, count = vec

      case @sort
      when 'term_abs' then sprintf "%d %s", count, form
      when 'term_rel' then sprintf "%6.5f %s", count.to_f / @word_count, form
      when 'sto_abs'  then sprintf "%s {%d}", form, count
      when 'sto_rel'  then sprintf "%s {%6.5f}", form, count.to_f / @word_count
      else sprintf "%s", vec
      end
    }

    lines.each { |str| forward(str) }
  end

end

# For backwards compatibility.
Vectorfilter  = VectorFilter
Vector_filter = VectorFilter
|
|
160
|
+
|
|
156
161
|
end
|
|
157
162
|
|
|
158
163
|
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Attendee
|
|
30
|
+
|
|
31
|
+
# Der WordSearcher ist das Herzstück von Lingo. Er macht die Hauptarbeit und versucht
|
|
32
|
+
# alle Token die nach einem sinnvollen Wort aussehen, in den ihm angegebenen
|
|
33
|
+
# Wörterbüchern zu finden und aufzulösen. Dabei werden die im Wörterbuch gefundenen
|
|
34
|
+
# Grundformen inkl. Wortklassen an das Word-Objekt angehängt.
|
|
35
|
+
#
|
|
36
|
+
# === Mögliche Verlinkung
|
|
37
|
+
# Erwartet:: Daten vom Typ *Token* (andere werden einfach durchgereicht) z.B. von Tokenizer, Abbreviator
|
|
38
|
+
# Erzeugt:: Daten vom Typ *Word* für erkannte Wörter z.B. für Synonymer, Decomposer, Ocr_variator, Multiworder, Sequencer, Noneword_filter, Vector_filter
|
|
39
|
+
#
|
|
40
|
+
# === Parameter
|
|
41
|
+
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
|
42
|
+
# Alle anderen Parameter müssen zwingend angegeben werden.
|
|
43
|
+
# <b>in</b>:: siehe allgemeine Beschreibung des Attendee
|
|
44
|
+
# <b>out</b>:: siehe allgemeine Beschreibung des Attendee
|
|
45
|
+
# <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
|
|
46
|
+
# <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
|
|
47
|
+
#
|
|
48
|
+
# === Beispiele
|
|
49
|
+
# Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
|
|
50
|
+
# meeting:
|
|
51
|
+
# attendees:
|
|
52
|
+
# - text_reader: { out: lines, files: '$(files)' }
|
|
53
|
+
# - tokenizer: { in: lines, out: token }
|
|
54
|
+
# - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
|
|
55
|
+
# - word_searcher: { in: abbrev, out: words, source: 'sys-dic' }
|
|
56
|
+
# - debugger: { in: words, prompt: 'out>' }
|
|
57
|
+
# ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
|
|
58
|
+
# out> *FILE('test.txt')
|
|
59
|
+
# out> <Dies = [(dies/w)]>
|
|
60
|
+
# out> <ist = [(sein/v)]>
|
|
61
|
+
# out> <ggf. = [(gegebenenfalls/w)]>
|
|
62
|
+
# out> <eine = [(einen/v), (ein/w)]>
|
|
63
|
+
# out> <Abkürzung = [(abkürzung/s)]>
|
|
64
|
+
# out> :./PUNC:
|
|
65
|
+
# out> *EOL('test.txt')
|
|
66
|
+
# out> *EOF('test.txt')
|
|
67
|
+
|
|
68
|
+
class WordSearcher < self

  # Delegates all setup to set_dic.
  # NOTE(review): presumably this creates @dic from the 'source' and
  # 'mode' settings -- confirm against Attendee#set_dic.
  def init
    set_dic
  end

  # On STR_CMD_STATUS, copies every entry of the dictionary's report
  # into this attendee via set. All other commands are ignored.
  #
  # +cmd+:: the command constant
  # +par+:: the command parameter (not used here)
  def control(cmd, par)
    return unless cmd == STR_CMD_STATUS

    @dic.report.each_pair { |name, count| set(name, count) }
  end

  # Looks up every Token with attribute TA_WORD in the dictionary and
  # forwards the lookup result in its place. Any other object is
  # forwarded unchanged.
  def process(obj)
    return forward(obj) unless obj.is_a?(Token) && obj.attr == TA_WORD

    inc('Anzahl gesuchter Wörter')

    found = @dic.find_word(obj.form)
    inc('Anzahl gefundener Wörter') unless found.unknown?

    forward(found)
  end

end

# For backwards compatibility.
Wordsearcher  = WordSearcher
Word_searcher = WordSearcher
|
|
95
|
+
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
# An Attendee that collects incoming objects in a buffer and lets a
# subclass decide (process_buffer?) when and how (process_buffer) the
# buffer is handled. Subclasses must implement process_buffer.
class BufferedAttendee < Attendee

  # A pending insertion of +object+ at index +position+ of the buffer.
  BufferInsert = Struct.new(:position, :object)

  # Sets up the empty buffer and insert list before handing +config+ and
  # +lingo+ on to Attendee#initialize, so both already exist if the
  # superclass constructor touches them.
  def initialize(config, lingo)
    @buffer, @inserts = [], []
    super
  end

  protected

  # Appends +obj+ to the buffer and processes the buffer as soon as
  # process_buffer? allows it.
  def process(obj)
    @buffer.push(obj)
    process_buffer if process_buffer?
  end

  private

  # Applies all deferred inserts to the buffer, then forwards every
  # buffered object in order and empties the buffer.
  #
  # Inserts are applied in ascending position. Enumerable#sort_by is not
  # stable, so ties are broken explicitly by registration order to keep
  # the outcome deterministic. Because each insert goes in front of the
  # element currently at that index, of two inserts with the same
  # position the one registered later ends up first.
  def forward_buffer
    ordered = @inserts.each_with_index.sort_by { |ins, idx| [ins.position, idx] }
    ordered.each { |ins, _idx| @buffer.insert(ins.position, ins.object) }
    @inserts.clear

    @buffer.each { |obj| forward(obj) }
    @buffer.clear
  end

  # Whether the buffer should be processed now. Always true here;
  # subclasses may override to wait for more input.
  def process_buffer?
    true
  end

  # Hook for subclasses: handle the current buffer contents.
  #
  # Raises NotImplementedError unless overridden.
  def process_buffer
    raise NotImplementedError, "#{self.class}#process_buffer is not implemented"
  end

  # Registers +obj+ for insertion at buffer index +pos+. The insertion
  # is carried out by forward_buffer.
  def deferred_insert(pos, obj)
    @inserts << BufferInsert.new(pos, obj)
  end

end
|
|
68
|
+
|
|
69
|
+
end
|