lingo 1.9.0.pre1 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -206,7 +206,7 @@ class TestAttendeeTextReader < AttendeeTestCase
206
206
  end
207
207
 
208
208
  def test_article_pdf
209
- meet({ 'files' => file = 'test/article.pdf', 'filter' => true }, nil, [
209
+ meet({ 'files' => file = 'test/article.pdf', 'filter' => 'pdf' }, nil, [
210
210
  ai("FILE|#{path = File.expand_path(file)}"),
211
211
  [" Klaus Lepsky: Ist automatische Normierung mögich?\n", 75],
212
212
  [" ──────────────────────────────────────────────────────────────────────\n", 287],
@@ -356,7 +356,7 @@ class TestAttendeeTextReader < AttendeeTestCase
356
356
  end
357
357
 
358
358
  def test_article_xml
359
- meet({ 'files' => file = 'test/article.xml', 'filter' => true }, nil, [
359
+ meet({ 'files' => file = 'test/article.xml', 'filter' => 'xml' }, nil, [
360
360
  ai("FILE|#{path = File.expand_path(file)}"),
361
361
  ["\n", 1],
362
362
  ["\t\n", 3],
@@ -477,7 +477,7 @@ class TestAttendeeTextReader < AttendeeTestCase
477
477
  end
478
478
 
479
479
  def test_article_html
480
- meet({ 'files' => file = 'test/article.html', 'filter' => true }, nil, [
480
+ meet({ 'files' => file = 'test/article.html', 'filter' => 'html' }, nil, [
481
481
  ai("FILE|#{path = File.expand_path(file)}"),
482
482
  ["\n", 1],
483
483
  ["test/article-html.html\n", 24],
@@ -540,6 +540,6 @@ class TestAttendeeTextReader < AttendeeTestCase
540
540
  ai("EOF|#{path}"),
541
541
  ai('EOT|')
542
542
  ])
543
- end
543
+ end unless RUBY_ENGINE == 'jruby'
544
544
 
545
545
  end
@@ -30,7 +30,15 @@ class TestAttendeeTextWriter < AttendeeTestCase
30
30
 
31
31
  assert_equal([
32
32
  "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
33
- ], File.readlines('test/test.tst', encoding: Lingo::ENC))
33
+ ], readlines('test.tst'))
34
+ end
35
+
36
+ def test_format
37
+ meet({ 'ext' => '%c-%l.tst', 'sep' => ',' }, @input)
38
+
39
+ assert_equal([
40
+ "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
41
+ ], readlines('test.lingo-de.tst'))
34
42
  end
35
43
 
36
44
  def test_complex
@@ -38,7 +46,7 @@ class TestAttendeeTextWriter < AttendeeTestCase
38
46
 
39
47
  assert_equal([
40
48
  "Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n"
41
- ], File.readlines('test/test.yip', encoding: Lingo::ENC))
49
+ ], readlines('test.yip'))
42
50
  end
43
51
 
44
52
  def test_crlf
@@ -46,7 +54,7 @@ class TestAttendeeTextWriter < AttendeeTestCase
46
54
 
47
55
  assert_equal([
48
56
  "Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n"
49
- ], File.readlines('test/test.txt2', encoding: Lingo::ENC))
57
+ ], readlines('test.txt2'))
50
58
  end
51
59
 
52
60
  def test_lir_file
@@ -73,7 +81,7 @@ cen. 056: Die intellektuelle Erschließung des Internet befindet sich in einer K
73
81
  FG-Projekt GERHARD.\n",
74
82
  "00239*020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter. 056: \"Das Buch ist ein praxisbezogenes VADEMECUM\
75
83
  für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\n"
76
- ], File.readlines('test/lir.vec', encoding: Lingo::ENC))
84
+ ], readlines('lir.vec'))
77
85
  end
78
86
 
79
87
  def test_nonewords
@@ -83,7 +91,13 @@ FG-Projekt GERHARD.\n",
83
91
 
84
92
  assert_equal([
85
93
  "Nonwörter\n", "Nonsense"
86
- ], File.readlines('test/text.non', encoding: Lingo::ENC))
94
+ ], readlines('text.non'))
95
+ end
96
+
97
+ private
98
+
99
+ def readlines(file)
100
+ File.readlines("test/#{file}", encoding: Lingo::ENCODING)
87
101
  end
88
102
 
89
103
  end
@@ -3,6 +3,8 @@
3
3
  require 'test/unit'
4
4
  require 'lingo'
5
5
 
6
+ warn 'NOTICE: Skipping slow tests...' if ENV['LINGO_DISABLE_SLOW_TESTS']
7
+
6
8
  class LingoTestCase < Test::Unit::TestCase
7
9
 
8
10
  unless const_defined?(:TEST_FILE)
@@ -2,7 +2,7 @@
2
2
 
3
3
  require_relative 'test_helper'
4
4
 
5
- class TestDatabase < LingoTestCase
5
+ class DatabaseTestCase < LingoTestCase
6
6
 
7
7
  def setup
8
8
  @lingo = Lingo.new
@@ -64,6 +64,214 @@ Wort2=
64
64
  EOT
65
65
  end
66
66
 
67
+ def write(config, input)
68
+ FileUtils.mkdir_p(File.dirname(TEST_FILE))
69
+ File.open(TEST_FILE, 'w', encoding: Lingo::ENCODING) { |f| f.write(input) }
70
+
71
+ yield set_config('tst', config.merge('name' => TEST_FILE))
72
+ ensure
73
+ cleanup_store
74
+ end
75
+
76
+ def set_config(id, config)
77
+ "_test_#{id}_".tap { |i| @lingo.config["language/dictionary/databases/#{i}"] = config }
78
+ end
79
+
80
+ end
81
+
82
+ class TestSource < DatabaseTestCase
83
+
84
+ def test_dump_singleword
85
+ compare({
86
+ 'txt-format' => 'SingleWord'
87
+ }, @singleword)
88
+ end
89
+
90
+ def test_dump_singleword_defwc
91
+ compare({
92
+ 'txt-format' => 'SingleWord',
93
+ 'def-wc' => '*'
94
+ }, @singleword)
95
+ end
96
+
97
+ def test_dump_singleword_defmulwc
98
+ compare({
99
+ 'txt-format' => 'SingleWord',
100
+ 'def-mul-wc' => 'm'
101
+ }, @singleword)
102
+ end
103
+
104
+ def test_dump_singleword_uselex
105
+ compare({
106
+ 'txt-format' => 'SingleWord',
107
+ 'use-lex' => set_config('lex',
108
+ 'name' => 'de/lingo-dic.txt',
109
+ 'txt-format' => 'WordClass',
110
+ 'separator' => '='
111
+ )
112
+ }, @singleword)
113
+ end
114
+
115
+ def test_dump_singleword_inflect
116
+ compare({
117
+ 'txt-format' => 'SingleWord',
118
+ 'use-lex' => set_config('lex',
119
+ 'name' => 'de/lingo-dic.txt',
120
+ 'txt-format' => 'WordClass',
121
+ 'separator' => '='
122
+ ),
123
+ 'inflect' => true
124
+ }, @singleword_inflect)
125
+ end
126
+
127
+ def test_dump_singleword_inflect_s
128
+ compare({
129
+ 'txt-format' => 'SingleWord',
130
+ 'use-lex' => set_config('lex',
131
+ 'name' => 'de/lingo-dic.txt',
132
+ 'txt-format' => 'WordClass',
133
+ 'separator' => '='
134
+ ),
135
+ 'inflect' => 's'
136
+ }, @singleword_inflect)
137
+ end
138
+
139
+ def test_dump_singleword_inflect_e
140
+ compare({
141
+ 'txt-format' => 'SingleWord',
142
+ 'use-lex' => set_config('lex',
143
+ 'name' => 'de/lingo-dic.txt',
144
+ 'txt-format' => 'WordClass',
145
+ 'separator' => '='
146
+ ),
147
+ 'inflect' => 'e'
148
+ }, @singleword_inflect)
149
+ end
150
+
151
+ def test_dump_singleword_hyphenate
152
+ compare({
153
+ 'txt-format' => 'SingleWord',
154
+ 'use-lex' => set_config('lex',
155
+ 'name' => 'de/lingo-dic.txt',
156
+ 'txt-format' => 'WordClass',
157
+ 'separator' => '='
158
+ ),
159
+ 'hyphenate' => true
160
+ }, @singleword)
161
+ end
162
+
163
+ def test_dump_singleword_crypt
164
+ compare({
165
+ 'txt-format' => 'SingleWord',
166
+ 'crypt' => true
167
+ }, @singleword)
168
+ end
169
+
170
+ def test_dump_keyvalue
171
+ compare({
172
+ 'txt-format' => 'KeyValue'
173
+ }, @keyvalue.gsub(' * ', '*'))
174
+ end
175
+
176
+ def test_dump_keyvalue_separator
177
+ compare({
178
+ 'txt-format' => 'KeyValue',
179
+ 'separator' => '*'
180
+ }, @keyvalue.gsub(' * ', '*'))
181
+ end
182
+
183
+ def test_dump_keyvalue_defwc
184
+ compare({
185
+ 'txt-format' => 'KeyValue',
186
+ 'separator' => '*',
187
+ 'def-wc' => 's'
188
+ }, @keyvalue.gsub(' * ', '*'))
189
+ end
190
+
191
+ def test_dump_wordclass
192
+ compare({
193
+ 'txt-format' => 'WordClass',
194
+ 'separator' => '='
195
+ }, %q{
196
+ Wort1=Projektion1 #h
197
+ Wort2=Projektion2 #i
198
+ Wort3=Projektion3 #e
199
+ Wort1=Projektion4 #e
200
+ Wort1=#s
201
+ Wort2=
202
+ Wort4.illegal
203
+ Wort4=still illegal
204
+ Wort4=still illegal#s!
205
+ Wort4=now we're talking #s+
206
+ })
207
+ end
208
+
209
+ def test_dump_wordclass_gender
210
+ compare({
211
+ 'txt-format' => 'WordClass'
212
+ }, %q{
213
+ substantiv,substantiv #a|s.n
214
+ mehr,mehr #s|w.n mehren #v
215
+ wort,wort #s.n
216
+ gruppe,gruppe #s.f
217
+ modul,modul #s.m|n
218
+ nocken,nock #s.f|m|n nocke #s.f nocken #s.m
219
+ albern,albern #a|v
220
+ fortuna,fortuna #e|s.f
221
+ })
222
+ end
223
+
224
+ def test_dump_wordclass_gender_noncompact
225
+ compare({
226
+ 'txt-format' => 'WordClass'
227
+ }, %q{
228
+ substantiv,substantiv #a substantiv #s.n
229
+ mehr,mehr #s.n mehr #w mehren #v
230
+ wort,wort #s.n
231
+ gruppe,gruppe #s.f
232
+ modul,modul #s.m modul #s.n
233
+ nocken,nock #s.f nock #s.m nock #s.n nocke #s.f nocken #s.m
234
+ albern,albern #a albern #v
235
+ fortuna,fortuna #e.f fortuna #s.f
236
+ }, nil, nil, false)
237
+ end
238
+
239
+ def test_dump_multivalue
240
+ compare({
241
+ 'txt-format' => 'MultiValue',
242
+ 'separator' => ';'
243
+ }, %q{
244
+ Hasen;Nasen;Vasen;Rasen
245
+ Gold;Edelmetall;Mehrwert
246
+ Rasen;Gras;Grüne Fläche
247
+ Rasen;Rennen;Wettrennen
248
+ })
249
+ end
250
+
251
+ def test_dump_multikey
252
+ compare({
253
+ 'txt-format' => 'MultiKey'
254
+ }, %q{
255
+ Hasen;Nasen;Vasen;Rasen
256
+ Gold;Edelmetall;Mehrwert
257
+ })
258
+ end
259
+
260
+ def compare(config, input, *args)
261
+ write(config, input) { |id|
262
+ src, dump, lines = Lingo::Database::Source.from_id(id, @lingo), [], []
263
+
264
+ src.each_lexical { |key, lex| dump << src.dump_line(key, lex, *args) }
265
+ src.each_line { |line,| lines << line }
266
+
267
+ assert_equal dump, lines
268
+ }
269
+ end
270
+
271
+ end
272
+
273
+ class TestDatabase < DatabaseTestCase
274
+
67
275
  def test_singleword
68
276
  compare({
69
277
  'txt-format' => 'SingleWord'
@@ -482,22 +690,13 @@ Wort2=
482
690
  end
483
691
 
484
692
  def compare(config, input, output = nil)
485
- FileUtils.mkdir_p(File.dirname(TEST_FILE))
486
- File.open(TEST_FILE, 'w', encoding: Lingo::ENC) { |f| f.write(input) }
693
+ err = nil
487
694
 
488
- id, err = set_config('tst', config.merge('name' => TEST_FILE)), nil
489
-
490
- Lingo::Database.open(id, @lingo) { |db| begin
491
- block_given? ? yield(db) : assert_equal(output, db.to_h
492
- .tap { |h| h.delete(Lingo::Database::SYS_KEY) }); rescue => err; end }
695
+ write(config, input) { |id| Lingo::Database.open(id, @lingo) { |db| begin
696
+ block_given? ? yield(db) : assert_equal(output, db.to_h.tap { |h|
697
+ h.delete(Lingo::Database::SYS_KEY) }); rescue => err; end } }
493
698
 
494
699
  raise err if err
495
- ensure
496
- cleanup_store
497
- end
498
-
499
- def set_config(id, config)
500
- "_test_#{id}_".tap { |i| @lingo.config["language/dictionary/databases/#{i}"] = config }
501
700
  end
502
701
 
503
702
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.0.pre1
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Vorhauer
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-02 00:00:00.000000000 Z
12
+ date: 2016-09-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: cyclops
@@ -31,42 +31,42 @@ dependencies:
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '1.4'
34
+ version: '1.5'
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '1.4'
41
+ version: '1.5'
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: rubyzip
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
46
  - - "~>"
47
47
  - !ruby/object:Gem::Version
48
- version: '1.1'
48
+ version: '1.2'
49
49
  type: :runtime
50
50
  prerelease: false
51
51
  version_requirements: !ruby/object:Gem::Requirement
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '1.1'
55
+ version: '1.2'
56
56
  - !ruby/object:Gem::Dependency
57
57
  name: sinatra-bells
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '0.3'
62
+ version: '0.4'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '0.3'
69
+ version: '0.4'
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: unicode
72
72
  requirement: !ruby/object:Gem::Requirement
@@ -129,14 +129,14 @@ dependencies:
129
129
  requirements:
130
130
  - - "~>"
131
131
  - !ruby/object:Gem::Version
132
- version: '1.3'
132
+ version: '1.4'
133
133
  type: :development
134
134
  prerelease: false
135
135
  version_requirements: !ruby/object:Gem::Requirement
136
136
  requirements:
137
137
  - - "~>"
138
138
  - !ruby/object:Gem::Version
139
- version: '1.3'
139
+ version: '1.4'
140
140
  - !ruby/object:Gem::Dependency
141
141
  name: hen
142
142
  requirement: !ruby/object:Gem::Requirement
@@ -146,7 +146,7 @@ dependencies:
146
146
  version: '0.8'
147
147
  - - ">="
148
148
  - !ruby/object:Gem::Version
149
- version: 0.8.3
149
+ version: 0.8.5
150
150
  type: :development
151
151
  prerelease: false
152
152
  version_requirements: !ruby/object:Gem::Requirement
@@ -156,7 +156,7 @@ dependencies:
156
156
  version: '0.8'
157
157
  - - ">="
158
158
  - !ruby/object:Gem::Version
159
- version: 0.8.3
159
+ version: 0.8.5
160
160
  - !ruby/object:Gem::Dependency
161
161
  name: rake
162
162
  requirement: !ruby/object:Gem::Requirement
@@ -247,6 +247,7 @@ files:
247
247
  - lang/ru.lang
248
248
  - lib/lingo.rb
249
249
  - lib/lingo/app.rb
250
+ - lib/lingo/array_utils.rb
250
251
  - lib/lingo/attendee.rb
251
252
  - lib/lingo/attendee/abbreviator.rb
252
253
  - lib/lingo/attendee/analysis_filter.rb
@@ -370,20 +371,31 @@ licenses:
370
371
  metadata: {}
371
372
  post_install_message: |2+
372
373
 
373
- lingo-1.9.0 [unreleased]:
374
+ lingo-1.9.0 [2016-09-13]:
374
375
 
376
+ * <b>Dropped support for Ruby 1.9.</b>
375
377
  * Removed support for deprecated options and attendee names (+old+ → +new+):
376
- * Lingo::Language::Grammar : +compositum+ → +compound+
377
- * Lingo::Attendee::TextReader : +lir-record-pattern+ → +records+
378
- * Lingo::Config : +multiworder+ → +multi_worder+, +objectfilter+ →
379
- +object_filter+, +textreader+ → +text_reader+, +textwriter+ →
380
- +text_writer+, +vectorfilter+ → +vector_filter+, +wordsearcher+ →
381
- +word_searcher+
378
+ * Lingo::Language::Grammar<b></b>:
379
+ +compositum+ → +compound+
380
+ * Lingo::Attendee::TextReader<b></b>:
381
+ +lir-record-pattern+ → +records+
382
+ * Lingo::Config<b></b>:
383
+ +multiworder+ → +multi_worder+,
384
+ +objectfilter+ → +object_filter+,
385
+ +textreader+ → +text_reader+,
386
+ +textwriter+ → +text_writer+,
387
+ +vectorfilter+ → +vector_filter+,
388
+ +wordsearcher+ → +word_searcher+
389
+ * Lingo::Attendee::TextWriter learned format directives for +ext+ option
390
+ (currently supported are: <tt>%c</tt> = config name, <tt>%l</tt> = language
391
+ name, <tt>%d</tt> = current date, <tt>%t</tt> = current time).
392
+ * Lingo::Attendee::Sequencer remembers word form of sequences.
393
+ * Updated and extended English system dictionary and suffix list.
382
394
  * Fixed errors with XML input (issue #15 by Thomas Berger).
383
395
 
384
396
  rdoc_options:
385
397
  - "--title"
386
- - lingo Application documentation (v1.9.0.pre1)
398
+ - lingo Application documentation (v1.9.0)
387
399
  - "--charset"
388
400
  - UTF-8
389
401
  - "--line-numbers"
@@ -396,15 +408,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
396
408
  requirements:
397
409
  - - ">="
398
410
  - !ruby/object:Gem::Version
399
- version: 1.9.3
411
+ version: '2.0'
400
412
  required_rubygems_version: !ruby/object:Gem::Requirement
401
413
  requirements:
402
- - - ">"
414
+ - - ">="
403
415
  - !ruby/object:Gem::Version
404
- version: 1.3.1
416
+ version: '0'
405
417
  requirements: []
406
418
  rubyforge_project:
407
- rubygems_version: 2.5.2
419
+ rubygems_version: 2.6.6
408
420
  signing_key:
409
421
  specification_version: 4
410
422
  summary: The full-featured automatic indexing system