lingo 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # The LexicalHash class provides access to the Lingo databases. Unlike the
32
+ # Database class, which returns only strings as its result, the result here is an
33
+ # array of Lexical objects.
34
+
35
+ class LexicalHash
36
+
37
+ include Cachable
38
+ include Reportable
39
+
40
+ def initialize(id, lingo)
41
+ init_cachable
42
+ init_reportable(id) # counters are reported under the prefix "<id>: "
43
+
44
+ @wc = lingo.database_config(id).fetch('def-wc', LA_UNKNOWN) # word class used for entries that carry none
45
+ @src = Database.open(id, lingo)
46
+ end
47
+
48
+ def close
49
+ @src.close
50
+ end
51
+
52
+ def [](key)
53
+ inc('total requests')
54
+ key = key.downcase # cache and source are both queried with the lower-cased key
55
+
56
+ if hit?(key) # hit?/retrieve/store are not defined here; presumably provided by Cachable -- confirm
57
+ inc('cache hits')
58
+ return retrieve(key)
59
+ end
60
+
61
+ inc('source reads')
62
+
63
+ if record = @src[key] # assignment is intentional: skip the conversion when the source has no entry
64
+ record = record.map { |str|
65
+ case str
66
+ when /^\*\d+$/ then str # "*<digits>" entries are passed through as plain strings
67
+ when /^#(.)$/ then Lexical.new(key, $1) # only a word class given: the key itself is the form
68
+ when /^([^#]+?)\s*#(.)$/ then Lexical.new($1, $2) # form followed by its word class
69
+ when /^([^#]+)$/ then Lexical.new($1, @wc) # form only: fall back to the default word class
70
+ else str
71
+ end
72
+ }
73
+
74
+ record.compact!
75
+ record.sort!
76
+ record.uniq!
77
+
78
+ inc('data found')
79
+ end
80
+
81
+ store(key, record) # record is nil here when the source had no entry for key
82
+ end
83
+
84
+ end
85
+
86
+ end
87
+
88
+ end
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # The Token class, derived from the WordForm class, is the container
32
+ # for a single word of a text. The word is tagged with an attribute
33
+ # that corresponds to the rule which identified this word.
34
+ #
35
+ # If, for example, ruby.cfg contains a rule for recognising a number that is labelled NUM,
36
+ # this label is attached to the token, e.g. Token.new('100', 'NUM') -> :100/NUM:
37
+
38
+ class Token < WordForm
39
+
40
+ def to_s
41
+ ":#{super}:"
42
+ end
43
+
44
+ end
45
+
46
+ end
47
+
48
+ end
@@ -0,0 +1,130 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # The Word class bundles the specific properties of a word with the
32
+ # methods needed to work with them.
33
+
34
+ class Word < WordForm
35
+
36
+ def self.new_lexical(form, attr, lex_attr)
37
+ new(form, attr) << Lexical.new(form, lex_attr) # Word#<< returns self, so the new Word is returned
38
+ end
39
+
40
+ # Exact representation of the original string, just as it was found in the
41
+ # sentence, e.g. <tt>form = "RubyLing"</tt>
42
+ #
43
+ # Result of the dictionary lookup. It represents the base form of the word.
44
+ # There may be several possible base forms, e.g. the base form of +abgeschoben+
45
+ # can be the _adjective_ +abgeschoben+, or else the _verb_
46
+ # +abschieben+.
47
+ #
48
+ # <tt>lemma = [['abgeschoben', '#a'], ['abschieben', '#v']]</tt>.
49
+ #
50
+ # <b>Note: the lemma is not populated by the Word class itself, but externally
51
+ # by the Dictionary class</b>
52
+
53
+ def initialize(form, attr = WA_UNSET)
54
+ super
55
+ @lexicals = []
56
+ end
57
+
58
+ def lexicals(compound_parts = true)
59
+ if !compound_parts && attr == WA_KOMPOSITUM # for a compound word, optionally drop the lexicals of its parts
60
+ @lexicals.select { |lex| lex.attr == LA_KOMPOSITUM }
61
+ else
62
+ @lexicals
63
+ end
64
+ end
65
+
66
+ def lexicals=(lexis)
67
+ if lexis.is_a?(Array)
68
+ @lexicals = lexis.sort.uniq # stored sorted and without duplicates
69
+ else
70
+ raise TypeError, "wrong argument type #{lexis.class} (expected Array)"
71
+ end
72
+ end
73
+
74
+ def attrs(compound_parts = true)
75
+ lexicals(compound_parts).map { |lex| lex.attr }
76
+ end
77
+
78
+ def parts
79
+ 1
80
+ end
81
+
82
+ def min_part_size
83
+ form.length
84
+ end
85
+
86
+ # Returns the lexicals whose word class matches wc_re (compiled to a RegExp if necessary); without lexicals,
87
+ # returns [self] or [] depending on the word's own attr, e.g. <tt>word.get_class(/a/)</tt> selects the +abgeschoben+ adjective
88
+ def get_class(wc_re)
89
+ wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
90
+
91
+ unless lexicals.empty?
92
+ lexicals.select { |lex| lex.attr =~ wc_re }
93
+ else
94
+ attr =~ wc_re ? [self] : []
95
+ end
96
+ end
97
+
98
+ def norm
99
+ identified? ? lexicals.first.form : form # NOTE(review): assumes an identified word has at least one lexical -- confirm
100
+ end
101
+
102
+ def compo_form
103
+ if attr == WA_KOMPOSITUM
104
+ get_class(LA_KOMPOSITUM).first
105
+ else
106
+ nil
107
+ end
108
+ end
109
+
110
+ def <<(*other)
111
+ lexicals.concat(other.flatten) # appends in place; nested arrays of lexicals are flattened first
112
+ self
113
+ end
114
+
115
+ def <=>(other)
116
+ other.nil? ? 1 : to_a.push(lexicals) <=> other.to_a.push(other.lexicals) # compares [form, attr, lexicals]; nil is always smaller
117
+ end
118
+
119
+ def to_s
120
+ s = "<#{form}"
121
+ s << "|#{attr}" unless identified?
122
+ s << " = #{lexicals.inspect}" unless lexicals.empty?
123
+ s << '>'
124
+ end
125
+
126
+ end
127
+
128
+ end
129
+
130
+ end
@@ -0,0 +1,83 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module Language
30
+
31
+ # The WordForm class is the base class for further classes that are needed within the
32
+ # object structure of a word. The class provides a string
33
+ # that can be tagged with an attribute.
34
+
35
+ class WordForm
36
+
37
+ include Comparable
38
+
39
+ attr_accessor :form, :attr
40
+
41
+ def initialize(form, attr = '-')
42
+ @form, @attr = form || '', attr || '' # nil form/attr are stored as empty strings
43
+ end
44
+
45
+ def unknown?
46
+ [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
47
+ end
48
+
49
+ def identified?
50
+ attr == WA_IDENTIFIED
51
+ end
52
+
53
+ def <=>(other)
54
+ other.nil? ? 1 : to_a <=> other.to_a # compares [form, attr]; nil is always smaller
55
+ end
56
+
57
+ def to_a
58
+ [form, attr]
59
+ end
60
+
61
+ def to_s
62
+ to_a.join('/') # "form/attr"
63
+ end
64
+
65
+ def inspect
66
+ to_s
67
+ end
68
+
69
+ def hash
70
+ to_s.hash # derived from to_s, the same string eql? compares
71
+ end
72
+
73
+ def eql?(other)
74
+ self.class.equal?(other.class) && to_s == other.to_s # same class and same string representation
75
+ end
76
+
77
+ alias_method :==, :eql? # == uses the strict eql? check instead of Comparable's <=>-based ==
78
+
79
+ end
80
+
81
+ end
82
+
83
+ end
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ # Provides named counters for the including object; call init_reportable before using the other methods.
30
+
31
+ module Reportable
32
+
33
+ def init_reportable(prefix = nil)
34
+ @counters, @prefix = Hash.new(0), prefix ? "#{prefix}: " : ''
35
+ end
36
+
37
+ def inc(counter)
38
+ @counters[counter] += 1 # unknown counters start at 0 (Hash.new(0))
39
+ end
40
+
41
+ def add(counter, value)
42
+ @counters[counter] += value
43
+ end
44
+
45
+ def set(counter, value)
46
+ @counters[counter] = value
47
+ end
48
+
49
+ def get(counter)
50
+ @counters[counter]
51
+ end
52
+
53
+ def report
54
+ @counters.each_with_object({}) { |(k, v), r| r["#{@prefix}#{k}"] = v }
55
+ end
56
+
57
+ end
58
+
59
+ end
@@ -4,7 +4,7 @@ class Lingo
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 8
7
- TINY = 0
7
+ TINY = 1
8
8
 
9
9
  class << self
10
10
 
@@ -11,7 +11,7 @@ meeting:
11
11
  #
12
12
 
13
13
  # Angegebene Datei zeilenweise einlesen und verarbeiten
14
- - textreader: { files: '$(files)' }
14
+ - text_reader: { files: '$(files)' }
15
15
 
16
16
 
17
17
  ########################################
@@ -25,7 +25,7 @@ meeting:
25
25
  # - abbreviator: { source: 'sys-abk' }
26
26
 
27
27
  # Verbleibende Token im Wörterbuch suchen
28
- - wordsearcher: { source: 'sys-dic', mode: 'first' }
28
+ - word_searcher: { source: 'sys-dic', mode: 'first' }
29
29
 
30
30
  # Schreibweisen variieren und erneut suchen
31
31
  # - variator: { source: 'sys-dic' }
@@ -37,7 +37,7 @@ meeting:
37
37
  # - decomposer: { source: 'sys-dic' }
38
38
 
39
39
  # Mehrwortgruppen im Strom erkennen
40
- # - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
40
+ # - multi_worder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
41
41
 
42
42
  # Wortsequenzen anhand von Regeln identifizieren
43
43
  # - sequencer: { stopper: 'PUNC,OTHR' }
@@ -56,30 +56,34 @@ meeting:
56
56
  # Ergebnisse ausgeben
57
57
  #
58
58
 
59
+ # Erstelle Datei mit Endung .log für Datenstrom
60
+ # - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
61
+ # - text_writer: { ext: log, sep: "\n" }
62
+
59
63
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
60
64
  # - noneword_filter: { in: syn }
61
- # - textwriter: { ext: non, sep: "\n" }
65
+ # - text_writer: { ext: non, sep: "\n" }
62
66
 
63
67
  # Erstelle Datei mit Endung .vec für erkannte Indexterme
64
68
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
65
- # - textwriter: { ext: vec, sep: "\n" }
69
+ # - text_writer: { ext: vec, sep: "\n" }
66
70
 
67
71
  # Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
68
72
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
69
- # - textwriter: { ext: ven, sep: "\n" }
73
+ # - text_writer: { ext: ven, sep: "\n" }
70
74
 
71
75
  # Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
72
76
  # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
73
- # - textwriter: { ext: ver, sep: "\n" }
77
+ # - text_writer: { ext: ver, sep: "\n" }
74
78
 
75
79
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
76
80
  # - vector_filter: { in: syn, lexicals: m }
77
- # - textwriter: { ext: mul, sep: "\n" }
81
+ # - text_writer: { ext: mul, sep: "\n" }
78
82
 
79
83
  # Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
80
84
  # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
81
- # - textwriter: { ext: seq, sep: "\n" }
85
+ # - text_writer: { ext: seq, sep: "\n" }
82
86
 
83
87
  # Erstelle Datei mit Endung .syn für erkannte Synonyme
84
88
  # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
85
- # - textwriter: { ext: syn, sep: "\n" }
89
+ # - text_writer: { ext: syn, sep: "\n" }