lingo 1.9.0.pre1 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -206,7 +206,7 @@ class TestAttendeeTextReader < AttendeeTestCase
206
206
  end
207
207
 
208
208
  def test_article_pdf
209
- meet({ 'files' => file = 'test/article.pdf', 'filter' => true }, nil, [
209
+ meet({ 'files' => file = 'test/article.pdf', 'filter' => 'pdf' }, nil, [
210
210
  ai("FILE|#{path = File.expand_path(file)}"),
211
211
  [" Klaus Lepsky: Ist automatische Normierung mögich?\n", 75],
212
212
  [" ──────────────────────────────────────────────────────────────────────\n", 287],
@@ -356,7 +356,7 @@ class TestAttendeeTextReader < AttendeeTestCase
356
356
  end
357
357
 
358
358
  def test_article_xml
359
- meet({ 'files' => file = 'test/article.xml', 'filter' => true }, nil, [
359
+ meet({ 'files' => file = 'test/article.xml', 'filter' => 'xml' }, nil, [
360
360
  ai("FILE|#{path = File.expand_path(file)}"),
361
361
  ["\n", 1],
362
362
  ["\t\n", 3],
@@ -477,7 +477,7 @@ class TestAttendeeTextReader < AttendeeTestCase
477
477
  end
478
478
 
479
479
  def test_article_html
480
- meet({ 'files' => file = 'test/article.html', 'filter' => true }, nil, [
480
+ meet({ 'files' => file = 'test/article.html', 'filter' => 'html' }, nil, [
481
481
  ai("FILE|#{path = File.expand_path(file)}"),
482
482
  ["\n", 1],
483
483
  ["test/article-html.html\n", 24],
@@ -540,6 +540,6 @@ class TestAttendeeTextReader < AttendeeTestCase
540
540
  ai("EOF|#{path}"),
541
541
  ai('EOT|')
542
542
  ])
543
- end
543
+ end unless RUBY_ENGINE == 'jruby'
544
544
 
545
545
  end
@@ -30,7 +30,15 @@ class TestAttendeeTextWriter < AttendeeTestCase
30
30
 
31
31
  assert_equal([
32
32
  "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
33
- ], File.readlines('test/test.tst', encoding: Lingo::ENC))
33
+ ], readlines('test.tst'))
34
+ end
35
+
36
+ def test_format
37
+ meet({ 'ext' => '%c-%l.tst', 'sep' => ',' }, @input)
38
+
39
+ assert_equal([
40
+ "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
41
+ ], readlines('test.lingo-de.tst'))
34
42
  end
35
43
 
36
44
  def test_complex
@@ -38,7 +46,7 @@ class TestAttendeeTextWriter < AttendeeTestCase
38
46
 
39
47
  assert_equal([
40
48
  "Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n"
41
- ], File.readlines('test/test.yip', encoding: Lingo::ENC))
49
+ ], readlines('test.yip'))
42
50
  end
43
51
 
44
52
  def test_crlf
@@ -46,7 +54,7 @@ class TestAttendeeTextWriter < AttendeeTestCase
46
54
 
47
55
  assert_equal([
48
56
  "Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n"
49
- ], File.readlines('test/test.txt2', encoding: Lingo::ENC))
57
+ ], readlines('test.txt2'))
50
58
  end
51
59
 
52
60
  def test_lir_file
@@ -73,7 +81,7 @@ cen. 056: Die intellektuelle Erschließung des Internet befindet sich in einer K
73
81
  FG-Projekt GERHARD.\n",
74
82
  "00239*020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter. 056: \"Das Buch ist ein praxisbezogenes VADEMECUM\
75
83
  für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\n"
76
- ], File.readlines('test/lir.vec', encoding: Lingo::ENC))
84
+ ], readlines('lir.vec'))
77
85
  end
78
86
 
79
87
  def test_nonewords
@@ -83,7 +91,13 @@ FG-Projekt GERHARD.\n",
83
91
 
84
92
  assert_equal([
85
93
  "Nonwörter\n", "Nonsense"
86
- ], File.readlines('test/text.non', encoding: Lingo::ENC))
94
+ ], readlines('text.non'))
95
+ end
96
+
97
+ private
98
+
99
+ def readlines(file)
100
+ File.readlines("test/#{file}", encoding: Lingo::ENCODING)
87
101
  end
88
102
 
89
103
  end
@@ -3,6 +3,8 @@
3
3
  require 'test/unit'
4
4
  require 'lingo'
5
5
 
6
+ warn 'NOTICE: Skipping slow tests...' if ENV['LINGO_DISABLE_SLOW_TESTS']
7
+
6
8
  class LingoTestCase < Test::Unit::TestCase
7
9
 
8
10
  unless const_defined?(:TEST_FILE)
@@ -2,7 +2,7 @@
2
2
 
3
3
  require_relative 'test_helper'
4
4
 
5
- class TestDatabase < LingoTestCase
5
+ class DatabaseTestCase < LingoTestCase
6
6
 
7
7
  def setup
8
8
  @lingo = Lingo.new
@@ -64,6 +64,214 @@ Wort2=
64
64
  EOT
65
65
  end
66
66
 
67
+ def write(config, input)
68
+ FileUtils.mkdir_p(File.dirname(TEST_FILE))
69
+ File.open(TEST_FILE, 'w', encoding: Lingo::ENCODING) { |f| f.write(input) }
70
+
71
+ yield set_config('tst', config.merge('name' => TEST_FILE))
72
+ ensure
73
+ cleanup_store
74
+ end
75
+
76
+ def set_config(id, config)
77
+ "_test_#{id}_".tap { |i| @lingo.config["language/dictionary/databases/#{i}"] = config }
78
+ end
79
+
80
+ end
81
+
82
+ class TestSource < DatabaseTestCase
83
+
84
+ def test_dump_singleword
85
+ compare({
86
+ 'txt-format' => 'SingleWord'
87
+ }, @singleword)
88
+ end
89
+
90
+ def test_dump_singleword_defwc
91
+ compare({
92
+ 'txt-format' => 'SingleWord',
93
+ 'def-wc' => '*'
94
+ }, @singleword)
95
+ end
96
+
97
+ def test_dump_singleword_defmulwc
98
+ compare({
99
+ 'txt-format' => 'SingleWord',
100
+ 'def-mul-wc' => 'm'
101
+ }, @singleword)
102
+ end
103
+
104
+ def test_dump_singleword_uselex
105
+ compare({
106
+ 'txt-format' => 'SingleWord',
107
+ 'use-lex' => set_config('lex',
108
+ 'name' => 'de/lingo-dic.txt',
109
+ 'txt-format' => 'WordClass',
110
+ 'separator' => '='
111
+ )
112
+ }, @singleword)
113
+ end
114
+
115
+ def test_dump_singleword_inflect
116
+ compare({
117
+ 'txt-format' => 'SingleWord',
118
+ 'use-lex' => set_config('lex',
119
+ 'name' => 'de/lingo-dic.txt',
120
+ 'txt-format' => 'WordClass',
121
+ 'separator' => '='
122
+ ),
123
+ 'inflect' => true
124
+ }, @singleword_inflect)
125
+ end
126
+
127
+ def test_dump_singleword_inflect_s
128
+ compare({
129
+ 'txt-format' => 'SingleWord',
130
+ 'use-lex' => set_config('lex',
131
+ 'name' => 'de/lingo-dic.txt',
132
+ 'txt-format' => 'WordClass',
133
+ 'separator' => '='
134
+ ),
135
+ 'inflect' => 's'
136
+ }, @singleword_inflect)
137
+ end
138
+
139
+ def test_dump_singleword_inflect_e
140
+ compare({
141
+ 'txt-format' => 'SingleWord',
142
+ 'use-lex' => set_config('lex',
143
+ 'name' => 'de/lingo-dic.txt',
144
+ 'txt-format' => 'WordClass',
145
+ 'separator' => '='
146
+ ),
147
+ 'inflect' => 'e'
148
+ }, @singleword_inflect)
149
+ end
150
+
151
+ def test_dump_singleword_hyphenate
152
+ compare({
153
+ 'txt-format' => 'SingleWord',
154
+ 'use-lex' => set_config('lex',
155
+ 'name' => 'de/lingo-dic.txt',
156
+ 'txt-format' => 'WordClass',
157
+ 'separator' => '='
158
+ ),
159
+ 'hyphenate' => true
160
+ }, @singleword)
161
+ end
162
+
163
+ def test_dump_singleword_crypt
164
+ compare({
165
+ 'txt-format' => 'SingleWord',
166
+ 'crypt' => true
167
+ }, @singleword)
168
+ end
169
+
170
+ def test_dump_keyvalue
171
+ compare({
172
+ 'txt-format' => 'KeyValue'
173
+ }, @keyvalue.gsub(' * ', '*'))
174
+ end
175
+
176
+ def test_dump_keyvalue_separator
177
+ compare({
178
+ 'txt-format' => 'KeyValue',
179
+ 'separator' => '*'
180
+ }, @keyvalue.gsub(' * ', '*'))
181
+ end
182
+
183
+ def test_dump_keyvalue_defwc
184
+ compare({
185
+ 'txt-format' => 'KeyValue',
186
+ 'separator' => '*',
187
+ 'def-wc' => 's'
188
+ }, @keyvalue.gsub(' * ', '*'))
189
+ end
190
+
191
+ def test_dump_wordclass
192
+ compare({
193
+ 'txt-format' => 'WordClass',
194
+ 'separator' => '='
195
+ }, %q{
196
+ Wort1=Projektion1 #h
197
+ Wort2=Projektion2 #i
198
+ Wort3=Projektion3 #e
199
+ Wort1=Projektion4 #e
200
+ Wort1=#s
201
+ Wort2=
202
+ Wort4.illegal
203
+ Wort4=still illegal
204
+ Wort4=still illegal#s!
205
+ Wort4=now we're talking #s+
206
+ })
207
+ end
208
+
209
+ def test_dump_wordclass_gender
210
+ compare({
211
+ 'txt-format' => 'WordClass'
212
+ }, %q{
213
+ substantiv,substantiv #a|s.n
214
+ mehr,mehr #s|w.n mehren #v
215
+ wort,wort #s.n
216
+ gruppe,gruppe #s.f
217
+ modul,modul #s.m|n
218
+ nocken,nock #s.f|m|n nocke #s.f nocken #s.m
219
+ albern,albern #a|v
220
+ fortuna,fortuna #e|s.f
221
+ })
222
+ end
223
+
224
+ def test_dump_wordclass_gender_noncompact
225
+ compare({
226
+ 'txt-format' => 'WordClass'
227
+ }, %q{
228
+ substantiv,substantiv #a substantiv #s.n
229
+ mehr,mehr #s.n mehr #w mehren #v
230
+ wort,wort #s.n
231
+ gruppe,gruppe #s.f
232
+ modul,modul #s.m modul #s.n
233
+ nocken,nock #s.f nock #s.m nock #s.n nocke #s.f nocken #s.m
234
+ albern,albern #a albern #v
235
+ fortuna,fortuna #e.f fortuna #s.f
236
+ }, nil, nil, false)
237
+ end
238
+
239
+ def test_dump_multivalue
240
+ compare({
241
+ 'txt-format' => 'MultiValue',
242
+ 'separator' => ';'
243
+ }, %q{
244
+ Hasen;Nasen;Vasen;Rasen
245
+ Gold;Edelmetall;Mehrwert
246
+ Rasen;Gras;Grüne Fläche
247
+ Rasen;Rennen;Wettrennen
248
+ })
249
+ end
250
+
251
+ def test_dump_multikey
252
+ compare({
253
+ 'txt-format' => 'MultiKey'
254
+ }, %q{
255
+ Hasen;Nasen;Vasen;Rasen
256
+ Gold;Edelmetall;Mehrwert
257
+ })
258
+ end
259
+
260
+ def compare(config, input, *args)
261
+ write(config, input) { |id|
262
+ src, dump, lines = Lingo::Database::Source.from_id(id, @lingo), [], []
263
+
264
+ src.each_lexical { |key, lex| dump << src.dump_line(key, lex, *args) }
265
+ src.each_line { |line,| lines << line }
266
+
267
+ assert_equal dump, lines
268
+ }
269
+ end
270
+
271
+ end
272
+
273
+ class TestDatabase < DatabaseTestCase
274
+
67
275
  def test_singleword
68
276
  compare({
69
277
  'txt-format' => 'SingleWord'
@@ -482,22 +690,13 @@ Wort2=
482
690
  end
483
691
 
484
692
  def compare(config, input, output = nil)
485
- FileUtils.mkdir_p(File.dirname(TEST_FILE))
486
- File.open(TEST_FILE, 'w', encoding: Lingo::ENC) { |f| f.write(input) }
693
+ err = nil
487
694
 
488
- id, err = set_config('tst', config.merge('name' => TEST_FILE)), nil
489
-
490
- Lingo::Database.open(id, @lingo) { |db| begin
491
- block_given? ? yield(db) : assert_equal(output, db.to_h
492
- .tap { |h| h.delete(Lingo::Database::SYS_KEY) }); rescue => err; end }
695
+ write(config, input) { |id| Lingo::Database.open(id, @lingo) { |db| begin
696
+ block_given? ? yield(db) : assert_equal(output, db.to_h.tap { |h|
697
+ h.delete(Lingo::Database::SYS_KEY) }); rescue => err; end } }
493
698
 
494
699
  raise err if err
495
- ensure
496
- cleanup_store
497
- end
498
-
499
- def set_config(id, config)
500
- "_test_#{id}_".tap { |i| @lingo.config["language/dictionary/databases/#{i}"] = config }
501
700
  end
502
701
 
503
702
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0.pre1
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Vorhauer
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-02 00:00:00.000000000 Z
12
+ date: 2016-09-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: cyclops
@@ -31,42 +31,42 @@ dependencies:
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '1.4'
34
+ version: '1.5'
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '1.4'
41
+ version: '1.5'
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: rubyzip
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
46
  - - "~>"
47
47
  - !ruby/object:Gem::Version
48
- version: '1.1'
48
+ version: '1.2'
49
49
  type: :runtime
50
50
  prerelease: false
51
51
  version_requirements: !ruby/object:Gem::Requirement
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '1.1'
55
+ version: '1.2'
56
56
  - !ruby/object:Gem::Dependency
57
57
  name: sinatra-bells
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '0.3'
62
+ version: '0.4'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '0.3'
69
+ version: '0.4'
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: unicode
72
72
  requirement: !ruby/object:Gem::Requirement
@@ -129,14 +129,14 @@ dependencies:
129
129
  requirements:
130
130
  - - "~>"
131
131
  - !ruby/object:Gem::Version
132
- version: '1.3'
132
+ version: '1.4'
133
133
  type: :development
134
134
  prerelease: false
135
135
  version_requirements: !ruby/object:Gem::Requirement
136
136
  requirements:
137
137
  - - "~>"
138
138
  - !ruby/object:Gem::Version
139
- version: '1.3'
139
+ version: '1.4'
140
140
  - !ruby/object:Gem::Dependency
141
141
  name: hen
142
142
  requirement: !ruby/object:Gem::Requirement
@@ -146,7 +146,7 @@ dependencies:
146
146
  version: '0.8'
147
147
  - - ">="
148
148
  - !ruby/object:Gem::Version
149
- version: 0.8.3
149
+ version: 0.8.5
150
150
  type: :development
151
151
  prerelease: false
152
152
  version_requirements: !ruby/object:Gem::Requirement
@@ -156,7 +156,7 @@ dependencies:
156
156
  version: '0.8'
157
157
  - - ">="
158
158
  - !ruby/object:Gem::Version
159
- version: 0.8.3
159
+ version: 0.8.5
160
160
  - !ruby/object:Gem::Dependency
161
161
  name: rake
162
162
  requirement: !ruby/object:Gem::Requirement
@@ -247,6 +247,7 @@ files:
247
247
  - lang/ru.lang
248
248
  - lib/lingo.rb
249
249
  - lib/lingo/app.rb
250
+ - lib/lingo/array_utils.rb
250
251
  - lib/lingo/attendee.rb
251
252
  - lib/lingo/attendee/abbreviator.rb
252
253
  - lib/lingo/attendee/analysis_filter.rb
@@ -370,20 +371,31 @@ licenses:
370
371
  metadata: {}
371
372
  post_install_message: |2+
372
373
 
373
- lingo-1.9.0 [unreleased]:
374
+ lingo-1.9.0 [2016-09-13]:
374
375
 
376
+ * <b>Dropped support for Ruby 1.9.</b>
375
377
  * Removed support for deprecated options and attendee names (+old+ → +new+):
376
- * Lingo::Language::Grammar : +compositum+ → +compound+
377
- * Lingo::Attendee::TextReader : +lir-record-pattern+ → +records+
378
- * Lingo::Config : +multiworder+ → +multi_worder+, +objectfilter+ →
379
- +object_filter+, +textreader+ → +text_reader+, +textwriter+ →
380
- +text_writer+, +vectorfilter+ → +vector_filter+, +wordsearcher+ →
381
- +word_searcher+
378
+ * Lingo::Language::Grammar<b></b>:
379
+ +compositum+ → +compound+
380
+ * Lingo::Attendee::TextReader<b></b>:
381
+ +lir-record-pattern+ → +records+
382
+ * Lingo::Config<b></b>:
383
+ +multiworder+ → +multi_worder+,
384
+ +objectfilter+ → +object_filter+,
385
+ +textreader+ → +text_reader+,
386
+ +textwriter+ → +text_writer+,
387
+ +vectorfilter+ → +vector_filter+,
388
+ +wordsearcher+ → +word_searcher+
389
+ * Lingo::Attendee::TextWriter learned format directives for +ext+ option
390
+ (currently supported are: <tt>%c</tt> = config name, <tt>%l</tt> = language
391
+ name, <tt>%d</tt> = current date, <tt>%t</tt> = current time).
392
+ * Lingo::Attendee::Sequencer remembers word form of sequences.
393
+ * Updated and extended English system dictionary and suffix list.
382
394
  * Fixed errors with XML input (issue #15 by Thomas Berger).
383
395
 
384
396
  rdoc_options:
385
397
  - "--title"
386
- - lingo Application documentation (v1.9.0.pre1)
398
+ - lingo Application documentation (v1.9.0)
387
399
  - "--charset"
388
400
  - UTF-8
389
401
  - "--line-numbers"
@@ -396,15 +408,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
396
408
  requirements:
397
409
  - - ">="
398
410
  - !ruby/object:Gem::Version
399
- version: 1.9.3
411
+ version: '2.0'
400
412
  required_rubygems_version: !ruby/object:Gem::Requirement
401
413
  requirements:
402
- - - ">"
414
+ - - ">="
403
415
  - !ruby/object:Gem::Version
404
- version: 1.3.1
416
+ version: '0'
405
417
  requirements: []
406
418
  rubyforge_project:
407
- rubygems_version: 2.5.2
419
+ rubygems_version: 2.6.6
408
420
  signing_key:
409
421
  specification_version: 4
410
422
  summary: The full-featured automatic indexing system