lingo 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # Die Klasse LexicalHash ermöglicht den Zugriff auf die Lingodatenbanken. Im Gegensatz zur
32
+ # Klasse Database, welche nur Strings als Ergebnis zurückgibt, wird hier als Ergebnis ein
33
+ # Array von Lexical-Objekten zurückgegeben.
34
+
35
+ class LexicalHash
36
+
37
+ include Cachable
38
+ include Reportable
39
+
40
+ def initialize(id, lingo)
41
+ init_cachable
42
+ init_reportable(id)
43
+
44
+ @wc = lingo.database_config(id).fetch('def-wc', LA_UNKNOWN)
45
+ @src = Database.open(id, lingo)
46
+ end
47
+
48
+ def close
49
+ @src.close
50
+ end
51
+
52
+ def [](key)
53
+ inc('total requests')
54
+ key = key.downcase
55
+
56
+ if hit?(key)
57
+ inc('cache hits')
58
+ return retrieve(key)
59
+ end
60
+
61
+ inc('source reads')
62
+
63
+ if record = @src[key]
64
+ record = record.map { |str|
65
+ case str
66
+ when /^\*\d+$/ then str
67
+ when /^#(.)$/ then Lexical.new(key, $1)
68
+ when /^([^#]+?)\s*#(.)$/ then Lexical.new($1, $2)
69
+ when /^([^#]+)$/ then Lexical.new($1, @wc)
70
+ else str
71
+ end
72
+ }
73
+
74
+ record.compact!
75
+ record.sort!
76
+ record.uniq!
77
+
78
+ inc('data found')
79
+ end
80
+
81
+ store(key, record)
82
+ end
83
+
84
+ end
85
+
86
+ end
87
+
88
+ end
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # Die Klasse Token, abgeleitet von der Klasse WordForm, stellt den Container
32
+ # für ein einzelnes Wort eines Textes dar. Das Wort wird mit einem Attribut versehen,
33
+ # welches der Regel entspricht, die dieses Wort identifiziert hat.
34
+ #
35
+ # Steht z.B. in ruby.cfg eine Regel zur Erkennung einer Zahl, die mit NUM bezeichnet wird,
36
+ # so wird dies dem Token angeheftet, z.B. Token.new('100', 'NUM') -> :100/NUM:
37
+
38
+ class Token < WordForm
39
+
40
+ def to_s
41
+ ":#{super}:"
42
+ end
43
+
44
+ end
45
+
46
+ end
47
+
48
+ end
@@ -0,0 +1,130 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # Die Klasse Word bündelt spezifische Eigenschaften eines Wortes mit den
32
+ # dazu notwendigen Methoden.
33
+
34
+ class Word < WordForm
35
+
36
+ def self.new_lexical(form, attr, lex_attr)
37
+ new(form, attr) << Lexical.new(form, lex_attr)
38
+ end
39
+
40
+ # Exakte Repräsentation der originären Zeichenkette, so wie sie im Satz
41
+ # gefunden wurde, z.B. <tt>form = "RubyLing"</tt>
42
+ #
43
+ # Ergebnis der Wörterbuch-Suche. Sie stellt die Grundform des Wortes dar.
44
+ # Dabei kann es mehrere mögliche Grundformen geben, z.B. kann +abgeschoben+
45
+ # als Grundform das _Adjektiv_ +abgeschoben+ sein, oder aber das _Verb_
46
+ # +abschieben+.
47
+ #
48
+ # <tt>lemma = [['abgeschoben', '#a'], ['abschieben', '#v']]</tt>.
49
+ #
50
+ # <b>Achtung: Lemma wird nicht durch die Word-Klasse bestückt, sondern extern
51
+ # durch die Klasse Dictionary</b>
52
+
53
+ def initialize(form, attr = WA_UNSET)
54
+ super
55
+ @lexicals = []
56
+ end
57
+
58
+ def lexicals(compound_parts = true)
59
+ if !compound_parts && attr == WA_KOMPOSITUM
60
+ @lexicals.select { |lex| lex.attr == LA_KOMPOSITUM }
61
+ else
62
+ @lexicals
63
+ end
64
+ end
65
+
66
+ def lexicals=(lexis)
67
+ if lexis.is_a?(Array)
68
+ @lexicals = lexis.sort.uniq
69
+ else
70
+ raise TypeError, "wrong argument type #{lexis.class} (expected Array)"
71
+ end
72
+ end
73
+
74
+ def attrs(compound_parts = true)
75
+ lexicals(compound_parts).map { |lex| lex.attr }
76
+ end
77
+
78
+ def parts
79
+ 1
80
+ end
81
+
82
+ def min_part_size
83
+ form.length
84
+ end
85
+
86
+ # Gibt genau die Grundform der Wortklasse zurück, die der RegExp des Übergabe-Parameters
87
+ # entspricht, z.B. <tt>word.get_class(/a/) = ['abgeschoben', '#a']</tt>
88
+ def get_class(wc_re)
89
+ wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
90
+
91
+ unless lexicals.empty?
92
+ lexicals.select { |lex| lex.attr =~ wc_re }
93
+ else
94
+ attr =~ wc_re ? [self] : []
95
+ end
96
+ end
97
+
98
+ def norm
99
+ identified? ? lexicals.first.form : form
100
+ end
101
+
102
+ def compo_form
103
+ if attr == WA_KOMPOSITUM
104
+ get_class(LA_KOMPOSITUM).first
105
+ else
106
+ nil
107
+ end
108
+ end
109
+
110
+ def <<(*other)
111
+ lexicals.concat(other.flatten)
112
+ self
113
+ end
114
+
115
+ def <=>(other)
116
+ other.nil? ? 1 : to_a.push(lexicals) <=> other.to_a.push(other.lexicals)
117
+ end
118
+
119
+ def to_s
120
+ s = "<#{form}"
121
+ s << "|#{attr}" unless identified?
122
+ s << " = #{lexicals.inspect}" unless lexicals.empty?
123
+ s << '>'
124
+ end
125
+
126
+ end
127
+
128
+ end
129
+
130
+ end
@@ -0,0 +1,83 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # Die Klasse WordForm ist die Basisklasse für weitere Klassen, die im Rahmen der
32
+ # Objektstruktur eines Wortes benötigt werden. Die Klasse stellt eine Zeichenkette bereit,
33
+ # die mit einem Attribut versehen werden kann.
34
+
35
+ class WordForm
36
+
37
+ include Comparable
38
+
39
+ attr_accessor :form, :attr
40
+
41
+ def initialize(form, attr = '-')
42
+ @form, @attr = form || '', attr || ''
43
+ end
44
+
45
+ def unknown?
46
+ [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
47
+ end
48
+
49
+ def identified?
50
+ attr == WA_IDENTIFIED
51
+ end
52
+
53
+ def <=>(other)
54
+ other.nil? ? 1 : to_a <=> other.to_a
55
+ end
56
+
57
+ def to_a
58
+ [form, attr]
59
+ end
60
+
61
+ def to_s
62
+ to_a.join('/')
63
+ end
64
+
65
+ def inspect
66
+ to_s
67
+ end
68
+
69
+ def hash
70
+ to_s.hash
71
+ end
72
+
73
+ def eql?(other)
74
+ self.class.equal?(other.class) && to_s == other.to_s
75
+ end
76
+
77
+ alias_method :==, :eql?
78
+
79
+ end
80
+
81
+ end
82
+
83
+ end
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ # Provides counters.
30
+
31
+ module Reportable
32
+
33
+ def init_reportable(prefix = nil)
34
+ @counters, @prefix = Hash.new(0), prefix ? "#{prefix}: " : ''
35
+ end
36
+
37
+ def inc(counter)
38
+ @counters[counter] += 1
39
+ end
40
+
41
+ def add(counter, value)
42
+ @counters[counter] += value
43
+ end
44
+
45
+ def set(counter, value)
46
+ @counters[counter] = value
47
+ end
48
+
49
+ def get(counter)
50
+ @counters[counter]
51
+ end
52
+
53
+ def report
54
+ @counters.each_with_object({}) { |(k, v), r| r["#{@prefix}#{k}"] = v }
55
+ end
56
+
57
+ end
58
+
59
+ end
@@ -4,7 +4,7 @@ class Lingo
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 8
7
- TINY = 0
7
+ TINY = 1
8
8
 
9
9
  class << self
10
10
 
@@ -11,7 +11,7 @@ meeting:
11
11
  #
12
12
 
13
13
  # Angegebene Datei zeilenweise einlesen und verarbeiten
14
- - textreader: { files: '$(files)' }
14
+ - text_reader: { files: '$(files)' }
15
15
 
16
16
 
17
17
  ########################################
@@ -25,7 +25,7 @@ meeting:
25
25
  # - abbreviator: { source: 'sys-abk' }
26
26
 
27
27
  # Verbleibende Token im Wörterbuch suchen
28
- - wordsearcher: { source: 'sys-dic', mode: 'first' }
28
+ - word_searcher: { source: 'sys-dic', mode: 'first' }
29
29
 
30
30
  # Schreibweisen variieren und erneut suchen
31
31
  # - variator: { source: 'sys-dic' }
@@ -37,7 +37,7 @@ meeting:
37
37
  # - decomposer: { source: 'sys-dic' }
38
38
 
39
39
  # Mehrwortgruppen im Strom erkennen
40
- # - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
40
+ # - multi_worder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
41
41
 
42
42
  # Wortsequenzen anhand von Regeln identifizieren
43
43
  # - sequencer: { stopper: 'PUNC,OTHR' }
@@ -56,30 +56,34 @@ meeting:
56
56
  # Ergebnisse ausgeben
57
57
  #
58
58
 
59
+ # Erstelle Datei mit Endung .log für Datenstrom
60
+ # - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
61
+ # - text_writer: { ext: log, sep: "\n" }
62
+
59
63
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
60
64
  # - noneword_filter: { in: syn }
61
- # - textwriter: { ext: non, sep: "\n" }
65
+ # - text_writer: { ext: non, sep: "\n" }
62
66
 
63
67
  # Erstelle Datei mit Endung .vec für erkannte Indexterme
64
68
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
65
- # - textwriter: { ext: vec, sep: "\n" }
69
+ # - text_writer: { ext: vec, sep: "\n" }
66
70
 
67
71
  # Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
68
72
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
69
- # - textwriter: { ext: ven, sep: "\n" }
73
+ # - text_writer: { ext: ven, sep: "\n" }
70
74
 
71
75
  # Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
72
76
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
73
- # - textwriter: { ext: ver, sep: "\n" }
77
+ # - text_writer: { ext: ver, sep: "\n" }
74
78
 
75
79
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
76
80
  # - vector_filter: { in: syn, lexicals: m }
77
- # - textwriter: { ext: mul, sep: "\n" }
81
+ # - text_writer: { ext: mul, sep: "\n" }
78
82
 
79
83
  # Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
80
84
  # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
81
- # - textwriter: { ext: seq, sep: "\n" }
85
+ # - text_writer: { ext: seq, sep: "\n" }
82
86
 
83
87
  # Erstelle Datei mit Endung .syn für erkannte Synonyme
84
88
  # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
85
- # - textwriter: { ext: syn, sep: "\n" }
89
+ # - text_writer: { ext: syn, sep: "\n" }