lingo 1.8.1 → 1.8.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (99)
  1. data/ChangeLog +23 -5
  2. data/README +1 -1
  3. data/Rakefile +5 -7
  4. data/TODO +2 -0
  5. data/bin/lingo +5 -1
  6. data/de.lang +1 -1
  7. data/en/lingo-syn.txt +0 -0
  8. data/en.lang +2 -1
  9. data/lib/lingo/attendee/abbreviator.rb +8 -9
  10. data/lib/lingo/attendee/debugger.rb +5 -4
  11. data/lib/lingo/attendee/decomposer.rb +8 -3
  12. data/lib/lingo/attendee/dehyphenizer.rb +19 -63
  13. data/lib/lingo/attendee/formatter.rb +1 -1
  14. data/lib/lingo/attendee/multi_worder.rb +67 -155
  15. data/lib/lingo/attendee/noneword_filter.rb +16 -9
  16. data/lib/lingo/attendee/object_filter.rb +1 -1
  17. data/lib/lingo/attendee/sequencer.rb +32 -63
  18. data/lib/lingo/attendee/stemmer/porter.rb +343 -0
  19. data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
  20. data/lib/lingo/attendee/synonymer.rb +10 -9
  21. data/lib/lingo/attendee/text_reader.rb +102 -76
  22. data/lib/lingo/attendee/text_writer.rb +23 -26
  23. data/lib/lingo/attendee/tokenizer.rb +13 -27
  24. data/lib/lingo/attendee/variator.rb +26 -66
  25. data/lib/lingo/attendee/vector_filter.rb +42 -43
  26. data/lib/lingo/attendee/word_searcher.rb +6 -7
  27. data/lib/lingo/attendee.rb +25 -7
  28. data/lib/lingo/buffered_attendee.rb +36 -10
  29. data/lib/lingo/cachable.rb +8 -8
  30. data/lib/lingo/config.rb +5 -6
  31. data/lib/lingo/ctl.rb +2 -3
  32. data/lib/lingo/database/crypter.rb +9 -26
  33. data/lib/lingo/database/gdbm_store.rb +3 -5
  34. data/lib/lingo/database/libcdb_store.rb +4 -6
  35. data/lib/lingo/database/sdbm_store.rb +11 -6
  36. data/lib/lingo/database/show_progress.rb +3 -43
  37. data/lib/lingo/database/source/key_value.rb +2 -6
  38. data/lib/lingo/database/source/multi_key.rb +3 -5
  39. data/lib/lingo/database/source/multi_value.rb +2 -6
  40. data/lib/lingo/database/source/single_word.rb +4 -6
  41. data/lib/lingo/database/source/word_class.rb +4 -10
  42. data/lib/lingo/database/source.rb +20 -18
  43. data/lib/lingo/database.rb +84 -59
  44. data/lib/lingo/error.rb +57 -1
  45. data/lib/lingo/language/dictionary.rb +21 -18
  46. data/lib/lingo/language/grammar.rb +40 -49
  47. data/lib/lingo/language/lexical.rb +6 -6
  48. data/lib/lingo/language/lexical_hash.rb +6 -0
  49. data/lib/lingo/language/word.rb +32 -15
  50. data/lib/lingo/language/word_form.rb +1 -1
  51. data/lib/lingo/language.rb +14 -25
  52. data/lib/lingo/reportable.rb +12 -10
  53. data/lib/lingo/show_progress.rb +81 -0
  54. data/lib/lingo/version.rb +1 -1
  55. data/lib/lingo.rb +63 -24
  56. data/lingo-call.cfg +6 -10
  57. data/lingo.cfg +60 -44
  58. data/lir.cfg +42 -41
  59. data/test/attendee/ts_abbreviator.rb +3 -5
  60. data/test/attendee/ts_decomposer.rb +3 -5
  61. data/test/attendee/ts_multi_worder.rb +87 -145
  62. data/test/attendee/ts_noneword_filter.rb +5 -3
  63. data/test/attendee/ts_object_filter.rb +5 -3
  64. data/test/attendee/ts_sequencer.rb +3 -5
  65. data/test/attendee/ts_stemmer.rb +309 -0
  66. data/test/attendee/ts_synonymer.rb +15 -11
  67. data/test/attendee/ts_text_reader.rb +12 -15
  68. data/test/attendee/ts_text_writer.rb +24 -29
  69. data/test/attendee/ts_tokenizer.rb +9 -7
  70. data/test/attendee/ts_variator.rb +4 -4
  71. data/test/attendee/ts_vector_filter.rb +24 -16
  72. data/test/attendee/ts_word_searcher.rb +20 -36
  73. data/test/{lir.csv → lir.vec} +0 -0
  74. data/test/ref/artikel.vec +943 -943
  75. data/test/ref/artikel.ven +943 -943
  76. data/test/ref/lir.non +201 -201
  77. data/test/ref/lir.seq +178 -178
  78. data/test/ref/lir.syn +49 -49
  79. data/test/ref/lir.vec +329 -0
  80. data/test/test_helper.rb +20 -36
  81. data/test/ts_database.rb +10 -10
  82. data/test/ts_language.rb +279 -319
  83. metadata +93 -104
  84. data/info/Objekte.png +0 -0
  85. data/info/Typen.png +0 -0
  86. data/info/database.png +0 -0
  87. data/info/db_small.png +0 -0
  88. data/info/download.png +0 -0
  89. data/info/kerze.png +0 -0
  90. data/info/language.png +0 -0
  91. data/info/lingo.png +0 -0
  92. data/info/logo.png +0 -0
  93. data/info/meeting.png +0 -0
  94. data/info/types.png +0 -0
  95. data/lingo-all.cfg +0 -89
  96. data/porter/stem.cfg +0 -311
  97. data/porter/stem.rb +0 -150
  98. data/test/ref/lir.csv +0 -329
  99. data/test.cfg +0 -79
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.1
4
+ version: 1.8.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,11 +10,11 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2012-02-19 00:00:00.000000000 Z
13
+ date: 2012-04-19 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ruby-nuggets
17
- requirement: &10045620 !ruby/object:Gem::Requirement
17
+ requirement: &12303700 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ! '>='
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 0.8.5
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *10045620
25
+ version_requirements: *12303700
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: unicode
28
- requirement: &10045140 !ruby/object:Gem::Requirement
28
+ requirement: &12302600 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ! '>='
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *10045140
36
+ version_requirements: *12302600
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: highline
39
- requirement: &10044660 !ruby/object:Gem::Requirement
39
+ requirement: &12317680 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ! '>='
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: '0'
45
45
  type: :runtime
46
46
  prerelease: false
47
- version_requirements: *10044660
47
+ version_requirements: *12317680
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: diff-lcs
50
- requirement: &10044140 !ruby/object:Gem::Requirement
50
+ requirement: &12315360 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ! '>='
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 1.1.3
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *10044140
58
+ version_requirements: *12315360
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: open4
61
- requirement: &10043720 !ruby/object:Gem::Requirement
61
+ requirement: &12314340 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ! '>='
@@ -66,7 +66,7 @@ dependencies:
66
66
  version: '0'
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *10043720
69
+ version_requirements: *12314340
70
70
  description: ! "Lingo is an open source indexing system for research and teachings.\nThe
71
71
  main functions of Lingo are:\n\n* identification of (i.e. reduction to) basic word
72
72
  form by means of\n dictionaries and suffix lists\n* algorithmic decomposition\n*
@@ -76,144 +76,133 @@ email:
76
76
  - lingo@vorhauer.de
77
77
  - jens.wille@uni-koeln.de
78
78
  executables:
79
- - lingo
80
79
  - lingoctl
80
+ - lingo
81
81
  extensions: []
82
82
  extra_rdoc_files:
83
83
  - README
84
84
  - COPYING
85
85
  - ChangeLog
86
86
  files:
87
- - lib/lingo/ctl.rb
87
+ - lib/lingo.rb
88
+ - lib/lingo/show_progress.rb
89
+ - lib/lingo/config.rb
88
90
  - lib/lingo/database.rb
89
- - lib/lingo/error.rb
90
- - lib/lingo/version.rb
91
- - lib/lingo/database/source.rb
92
- - lib/lingo/database/libcdb_store.rb
93
- - lib/lingo/database/sdbm_store.rb
94
- - lib/lingo/database/show_progress.rb
95
- - lib/lingo/database/crypter.rb
96
- - lib/lingo/database/source/multi_key.rb
97
- - lib/lingo/database/source/key_value.rb
98
- - lib/lingo/database/source/single_word.rb
99
- - lib/lingo/database/source/word_class.rb
100
- - lib/lingo/database/source/multi_value.rb
101
- - lib/lingo/database/gdbm_store.rb
102
- - lib/lingo/database/hash_store.rb
103
- - lib/lingo/cli.rb
104
- - lib/lingo/cachable.rb
105
- - lib/lingo/attendee/variator.rb
106
- - lib/lingo/attendee/debugger.rb
91
+ - lib/lingo/language/dictionary.rb
92
+ - lib/lingo/language/word_form.rb
93
+ - lib/lingo/language/lexical.rb
94
+ - lib/lingo/language/grammar.rb
95
+ - lib/lingo/language/lexical_hash.rb
96
+ - lib/lingo/language/token.rb
97
+ - lib/lingo/language/word.rb
98
+ - lib/lingo/attendee/stemmer/porter.rb
99
+ - lib/lingo/attendee/vector_filter.rb
100
+ - lib/lingo/attendee/noneword_filter.rb
107
101
  - lib/lingo/attendee/object_filter.rb
108
- - lib/lingo/attendee/synonymer.rb
109
- - lib/lingo/attendee/text_writer.rb
102
+ - lib/lingo/attendee/variator.rb
110
103
  - lib/lingo/attendee/multi_worder.rb
111
104
  - lib/lingo/attendee/text_reader.rb
105
+ - lib/lingo/attendee/synonymer.rb
106
+ - lib/lingo/attendee/word_searcher.rb
112
107
  - lib/lingo/attendee/dehyphenizer.rb
108
+ - lib/lingo/attendee/sequencer.rb
109
+ - lib/lingo/attendee/debugger.rb
110
+ - lib/lingo/attendee/text_writer.rb
111
+ - lib/lingo/attendee/stemmer.rb
113
112
  - lib/lingo/attendee/tokenizer.rb
114
113
  - lib/lingo/attendee/abbreviator.rb
115
- - lib/lingo/attendee/formatter.rb
116
- - lib/lingo/attendee/noneword_filter.rb
117
- - lib/lingo/attendee/sequencer.rb
118
114
  - lib/lingo/attendee/decomposer.rb
119
- - lib/lingo/attendee/word_searcher.rb
120
- - lib/lingo/attendee/vector_filter.rb
121
- - lib/lingo/config.rb
115
+ - lib/lingo/attendee/formatter.rb
116
+ - lib/lingo/database/hash_store.rb
117
+ - lib/lingo/database/show_progress.rb
118
+ - lib/lingo/database/sdbm_store.rb
119
+ - lib/lingo/database/source.rb
120
+ - lib/lingo/database/crypter.rb
121
+ - lib/lingo/database/source/multi_value.rb
122
+ - lib/lingo/database/source/word_class.rb
123
+ - lib/lingo/database/source/key_value.rb
124
+ - lib/lingo/database/source/multi_key.rb
125
+ - lib/lingo/database/source/single_word.rb
126
+ - lib/lingo/database/gdbm_store.rb
127
+ - lib/lingo/database/libcdb_store.rb
128
+ - lib/lingo/call.rb
129
+ - lib/lingo/attendee.rb
130
+ - lib/lingo/version.rb
131
+ - lib/lingo/ctl.rb
132
+ - lib/lingo/cli.rb
122
133
  - lib/lingo/core_ext.rb
123
- - lib/lingo/agenda_item.rb
124
134
  - lib/lingo/buffered_attendee.rb
125
- - lib/lingo/reportable.rb
135
+ - lib/lingo/agenda_item.rb
136
+ - lib/lingo/cachable.rb
126
137
  - lib/lingo/language.rb
127
- - lib/lingo/language/dictionary.rb
128
- - lib/lingo/language/word.rb
129
- - lib/lingo/language/lexical.rb
130
- - lib/lingo/language/word_form.rb
131
- - lib/lingo/language/token.rb
132
- - lib/lingo/language/grammar.rb
133
- - lib/lingo/language/lexical_hash.rb
134
- - lib/lingo/attendee.rb
135
- - lib/lingo/call.rb
136
- - lib/lingo.rb
137
- - bin/lingo
138
+ - lib/lingo/error.rb
139
+ - lib/lingo/reportable.rb
138
140
  - bin/lingoctl
141
+ - bin/lingo
139
142
  - lingo.rb
140
143
  - lingo.cfg
141
- - lingo-all.cfg
142
144
  - lingo-call.cfg
145
+ - lir.cfg
143
146
  - de.lang
144
147
  - en.lang
145
- - de/lingo-syn.txt
146
- - de/lingo-abk.txt
147
148
  - de/lingo-dic.txt
149
+ - de/lingo-abk.txt
150
+ - de/lingo-syn.txt
148
151
  - de/lingo-mul.txt
149
152
  - de/user-dic.txt
153
+ - de/test_syn.txt
154
+ - de/test_dic.txt
155
+ - de/test_syn2.txt
156
+ - de/test_singleword.txt
157
+ - de/test_mul.txt
158
+ - de/test_mul2.txt
150
159
  - en/lingo-dic.txt
160
+ - en/lingo-syn.txt
151
161
  - en/lingo-mul.txt
152
162
  - en/user-dic.txt
153
163
  - txt/artikel.txt
154
164
  - txt/artikel-en.txt
155
- - info/gpl-hdr.txt
156
- - info/kerze.png
157
- - info/meeting.png
158
- - info/lingo.png
159
- - info/types.png
160
- - info/logo.png
161
- - info/language.png
162
- - info/Typen.png
163
- - info/Objekte.png
164
- - info/download.png
165
- - info/database.png
166
- - info/db_small.png
167
- - lir.cfg
168
165
  - txt/lir.txt
169
- - porter/stem.rb
170
- - porter/stem.cfg
171
- - test.cfg
172
- - de/test_mul.txt
173
- - de/test_singleword.txt
174
- - de/test_mul2.txt
175
- - de/test_syn.txt
176
- - de/test_dic.txt
177
- - de/test_syn2.txt
178
- - TODO
179
- - README
180
166
  - ChangeLog
181
167
  - COPYING
168
+ - README
182
169
  - Rakefile
170
+ - TODO
183
171
  - spec/spec_helper.rb
184
172
  - .rspec
185
- - test/lir.csv
186
- - test/attendee/ts_abbreviator.rb
187
- - test/attendee/ts_noneword_filter.rb
188
- - test/attendee/ts_word_searcher.rb
189
- - test/attendee/ts_object_filter.rb
190
- - test/attendee/ts_vector_filter.rb
191
- - test/attendee/ts_text_writer.rb
192
- - test/attendee/ts_decomposer.rb
193
- - test/attendee/ts_sequencer.rb
194
- - test/attendee/ts_synonymer.rb
195
- - test/attendee/ts_tokenizer.rb
196
- - test/attendee/ts_variator.rb
197
- - test/attendee/ts_text_reader.rb
198
- - test/attendee/ts_multi_worder.rb
199
- - test/mul.txt
200
- - test/test_helper.rb
201
173
  - test/ref/artikel.ven
202
- - test/ref/lir.csv
203
- - test/ref/artikel.vec
204
174
  - test/ref/lir.mul
205
- - test/ref/artikel.syn
175
+ - test/ref/lir.vec
176
+ - test/ref/artikel.vec
206
177
  - test/ref/lir.syn
207
178
  - test/ref/artikel.mul
179
+ - test/ref/artikel.syn
208
180
  - test/ref/artikel.seq
209
- - test/ref/lir.seq
210
181
  - test/ref/artikel.non
211
- - test/ref/artikel.ver
212
182
  - test/ref/lir.non
183
+ - test/ref/lir.seq
184
+ - test/ref/artikel.ver
185
+ - test/ts_language.rb
213
186
  - test/lir2.txt
214
- - test/ts_database.rb
187
+ - test/attendee/ts_noneword_filter.rb
188
+ - test/attendee/ts_text_writer.rb
189
+ - test/attendee/ts_sequencer.rb
190
+ - test/attendee/ts_object_filter.rb
191
+ - test/attendee/ts_text_reader.rb
192
+ - test/attendee/ts_multi_worder.rb
193
+ - test/attendee/ts_variator.rb
194
+ - test/attendee/ts_decomposer.rb
195
+ - test/attendee/ts_abbreviator.rb
196
+ - test/attendee/ts_stemmer.rb
197
+ - test/attendee/ts_tokenizer.rb
198
+ - test/attendee/ts_vector_filter.rb
199
+ - test/attendee/ts_word_searcher.rb
200
+ - test/attendee/ts_synonymer.rb
201
+ - test/lir.vec
202
+ - test/test_helper.rb
215
203
  - test/lir.txt
216
- - test/ts_language.rb
204
+ - test/mul.txt
205
+ - test/ts_database.rb
217
206
  homepage: http://lex-lingo.de
218
207
  licenses: []
219
208
  post_install_message:
@@ -223,7 +212,7 @@ rdoc_options:
223
212
  - --line-numbers
224
213
  - --all
225
214
  - --title
226
- - lingo Application documentation (v1.8.1)
215
+ - lingo Application documentation (v1.8.2)
227
216
  - --main
228
217
  - README
229
218
  require_paths:
data/info/Objekte.png DELETED
Binary file
data/info/Typen.png DELETED
Binary file
data/info/database.png DELETED
Binary file
data/info/db_small.png DELETED
Binary file
data/info/download.png DELETED
Binary file
data/info/kerze.png DELETED
Binary file
data/info/language.png DELETED
Binary file
data/info/lingo.png DELETED
Binary file
data/info/logo.png DELETED
Binary file
data/info/meeting.png DELETED
Binary file
data/info/types.png DELETED
Binary file
data/lingo-all.cfg DELETED
@@ -1,89 +0,0 @@
1
- #
2
- # Lingo-Konfiguration
3
- #
4
- ---
5
- meeting:
6
-
7
- attendees:
8
-
9
- ########################################
10
- # Text bereitstellen
11
- #
12
-
13
- # Angegebene Datei zeilenweise einlesen und verarbeitet
14
- - text_reader: { files: '$(files)' }
15
-
16
-
17
- ########################################
18
- # Inhalte verarbeiten
19
- #
20
-
21
- # Zeile in einzelnen Sinnbestandteile (Token) zerlegen
22
- - tokenizer: { }
23
-
24
- # Abkürzungen erkennen und auflösen
25
- # - abbreviator: { source: 'sys-abk' }
26
-
27
- # Verbleibende Token im Wörterbuch suchen
28
- - word_searcher: { source: 'sys-dic', mode: 'first' }
29
-
30
- # Schreibweisen variieren und erneut suchen
31
- # - variator: { source: 'sys-dic' }
32
-
33
- # Bindestrichergänzungen rekonstruieren
34
- # - dehyphenizer: { source: 'sys-dic' }
35
-
36
- # Nicht erkannte Wörter auf Kompositum testen
37
- # - decomposer: { source: 'sys-dic' }
38
-
39
- # Mehrwortgruppen im Strom erkennen
40
- # - multi_worder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
41
-
42
- # Wortsequenzen anhand von Regeln identifizieren
43
- # - sequencer: { stopper: 'PUNC,OTHR' }
44
-
45
- # Relationierungen einfügen
46
- # - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
47
-
48
-
49
- ########################################
50
- # Datenstrom anzeigen
51
- #
52
- - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
53
-
54
-
55
- ########################################
56
- # Ergebnisse ausgeben
57
- #
58
-
59
- # Erstelle Datei mit Endung .log für Datenstrom
60
- # - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
61
- # - text_writer: { ext: log, sep: "\n" }
62
-
63
- # Erstelle Datei mit Endung .non für nicht erkannte Wörter
64
- # - noneword_filter: { in: syn }
65
- # - text_writer: { ext: non, sep: "\n" }
66
-
67
- # Erstelle Datei mit Endung .vec für erkannte Indexterme
68
- # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
69
- # - text_writer: { ext: vec, sep: "\n" }
70
-
71
- # Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
72
- # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
73
- # - text_writer: { ext: ven, sep: "\n" }
74
-
75
- # Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
76
- # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
77
- # - text_writer: { ext: ver, sep: "\n" }
78
-
79
- # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
80
- # - vector_filter: { in: syn, lexicals: m }
81
- # - text_writer: { ext: mul, sep: "\n" }
82
-
83
- # Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
84
- # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
85
- # - text_writer: { ext: seq, sep: "\n" }
86
-
87
- # Erstelle Datei mit Endung .syn für erkannte Synonyme
88
- # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
89
- # - text_writer: { ext: syn, sep: "\n" }