lingo 1.8.6 → 1.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -31,14 +31,16 @@ class Lingo
31
31
 
32
32
  class Database
33
33
 
34
- class Crypter
34
+ module Crypter
35
35
 
36
- def self.digest(key)
36
+ extend self
37
+
38
+ def digest(key)
37
39
  Digest::SHA1.hexdigest(key)
38
40
  end
39
41
 
40
42
  def encode(key, val)
41
- [self.class.digest(key), crypt(:encrypt, key, val)]
43
+ [digest(key), crypt(:encrypt, key, val)]
42
44
  end
43
45
 
44
46
  def decode(key, val)
@@ -49,7 +51,7 @@ class Lingo
49
51
 
50
52
  def crypt(method, key, val)
51
53
  cipher = OpenSSL::Cipher.new('aes-128-cbc').send(method)
52
- cipher.iv = cipher.key = self.class.digest(key)
54
+ cipher.iv = cipher.key = digest(key)
53
55
  cipher.update(val) + cipher.final
54
56
  end
55
57
 
@@ -68,14 +68,15 @@ class Lingo
68
68
 
69
69
  class BackendNotAvailableError < LingoError
70
70
 
71
- attr_reader :mod, :file
71
+ attr_reader :name, :file, :err
72
72
 
73
- def initialize(mod, file)
74
- @mod, @file = mod, file
73
+ def initialize(name, file, err)
74
+ @name, @file, @err = name, file, err
75
75
  end
76
76
 
77
77
  def to_s
78
- "Backend not available `#{mod}' for `#{file}'."
78
+ msg = "Backend not available `#{name}'"
79
+ error(file ? msg << " for `#{file}'" : msg)
79
80
  end
80
81
 
81
82
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,14 +39,22 @@ class Lingo
39
39
 
40
40
  CHAR_PUNCT = '.'
41
41
 
42
- TA_WORD = 'WORD'
43
- TA_PUNCTUATION = 'PUNC'
44
- TA_OTHER = 'OTHR'
42
+ TA_ABBREVIATION = 'ABRV'
43
+ TA_HELP = 'HELP'
44
+ TA_HTML = 'HTML'
45
+ TA_NUMBER = 'NUMS'
46
+ TA_OTHER = 'OTHR'
47
+ TA_PUNCTUATION = 'PUNC'
48
+ TA_SKIP = 'SKIP'
49
+ TA_SPACE = 'SPAC'
50
+ TA_URL = 'URLS'
51
+ TA_WIKI = 'WIKI'
52
+ TA_WORD = 'WORD'
45
53
 
46
54
  WA_UNSET = '-'
47
55
  WA_IDENTIFIED = 'IDF'
48
56
  WA_UNKNOWN = '?'
49
- WA_COMPOUND = 'KOM'
57
+ WA_COMPOUND = 'COM'
50
58
  WA_MULTIWORD = 'MUL'
51
59
  WA_SEQUENCE = 'SEQ'
52
60
  WA_UNKMULPART = 'MU?'
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,6 +39,10 @@ class Lingo
39
39
 
40
40
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
41
41
 
42
+ DEFAULTS = {
43
+ min_word_size: 8, min_avg_part_size: 4, min_part_size: 1, max_parts: 4
44
+ }
45
+
42
46
  def self.open(*args)
43
47
  yield grammar = new(*args)
44
48
  ensure
@@ -55,11 +59,8 @@ class Lingo
55
59
  cfg = lingo.dictionary_config['compound'] ||
56
60
  lingo.dictionary_config['compositum'] # DEPRECATE compositum
57
61
 
58
- {
59
- min_word_size: 8, min_avg_part_size: 4, min_part_size: 1, max_parts: 4
60
- }.each { |k, v|
61
- instance_variable_set("@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i)
62
- }
62
+ DEFAULTS.each { |k, v| instance_variable_set(
63
+ "@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i) }
63
64
 
64
65
  #--
65
66
  # Die Wortklasse eines Kompositum-Wortteils kann separat gekennzeichnet
@@ -87,6 +88,11 @@ class Lingo
87
88
  permute_compound([[], [], ''], str, level, tail)
88
89
  end
89
90
 
91
+ def find_compound_head(str)
92
+ compound = find_compound(str)
93
+ compound.head || compound if compound && !compound.unknown?
94
+ end
95
+
90
96
  private
91
97
 
92
98
  def permute_compound(ret, str, level, tail)
@@ -113,7 +119,7 @@ class Lingo
113
119
 
114
120
  level > 1 ? ret = res : ret.identify(lex.each { |l|
115
121
  l.attr += @append_wc unless l.attr == LA_COMPOUND
116
- }, WA_COMPOUND) if !lex.empty? &&
122
+ }, WA_COMPOUND, seq) if !lex.empty? &&
117
123
  sta.size <= @max_parts &&
118
124
  sta.min >= @min_part_size &&
119
125
  str.length / sta.size >= @min_avg_part_size &&
@@ -56,6 +56,12 @@ class Lingo
56
56
  attr == TA_WORD
57
57
  end
58
58
 
59
+ alias_method :word_token?, :word?
60
+
61
+ def number?
62
+ attr == TA_NUMBER
63
+ end
64
+
59
65
  def position_and_offset
60
66
  "#{position}#{POSITION_SEP}#{offset}"
61
67
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -81,65 +81,52 @@ class Lingo
81
81
  super
82
82
  end
83
83
 
84
- attr_reader :token
85
-
86
- attr_writer :lexicals
87
-
88
- def lexicals(compound_parts = true)
89
- if !compound_parts && attr == WA_COMPOUND
90
- @lexicals.select { |lex| lex.attr == LA_COMPOUND }
91
- else
92
- @lexicals
93
- end
94
- end
84
+ attr_accessor :lexicals, :pattern
95
85
 
96
86
  def add_lexicals(lex)
97
- @lexicals.concat(lex - @lexicals)
87
+ lexicals.concat(lex - lexicals)
98
88
  end
99
89
 
100
90
  def attr?(*attr)
101
91
  !(attrs & attr).empty?
102
92
  end
103
93
 
104
- def attrs(compound_parts = true)
105
- lexicals(compound_parts).map { |i| i.attr }
94
+ def attrs
95
+ lexicals.map(&:attr)
96
+ end
97
+
98
+ def compound_attrs
99
+ attr == WA_COMPOUND ? attrs.grep(LA_COMPOUND) : attrs
106
100
  end
107
101
 
108
- def genders(compound_parts = true)
109
- lexicals(compound_parts).map { |i| i.gender }
102
+ def genders
103
+ lexicals.map(&:gender)
110
104
  end
111
105
 
112
- def identify(lex, wc = nil)
106
+ def identify(lex, wc = nil, seq = nil)
113
107
  return self if lex.empty?
114
108
 
115
- self.lexicals = lex
109
+ self.lexicals, self.pattern = lex, seq
116
110
  self.attr = wc ||= attr?(LA_COMPOUND) ? WA_COMPOUND : WA_IDENTIFIED
117
111
  self.head = self.class.new_compound_head(lex) if wc == WA_COMPOUND
118
112
 
119
113
  self
120
114
  end
121
115
 
122
- def get_class(wc_re)
123
- wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
124
-
125
- lexicals.empty? ? attr =~ wc_re ? [self] : [] :
126
- lexicals.select { |lex| lex.attr =~ wc_re }
127
- end
116
+ def each_lex(wc_re = //)
117
+ return enum_for(:each_lex, wc_re) unless block_given?
128
118
 
129
- def norm
130
- identified? ? lexicals.first.form : form
131
- end
119
+ wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
132
120
 
133
- def compo_form
134
- get_class(LA_COMPOUND).first if attr == WA_COMPOUND
135
- end
121
+ lexicals.empty? ? attr =~ wc_re ? yield(self) : nil :
122
+ lexicals.each { |lex| yield lex if lex.attr =~ wc_re }
136
123
 
137
- def full_compound?
138
- attr == WA_COMPOUND && get_class('x+').empty?
124
+ nil
139
125
  end
140
126
 
141
- def multiword_size(wc_re = LA_MULTIWORD)
142
- lex = get_class(wc_re).first and lex.form.count(' ') + 1
127
+ def lex_form(wc_re = //)
128
+ each_lex(wc_re) { |lex|
129
+ break block_given? ? yield(lex.form) : lex.form }
143
130
  end
144
131
 
145
132
  def position_and_offset
@@ -148,7 +135,7 @@ class Lingo
148
135
 
149
136
  def <<(*lex)
150
137
  lex.flatten!
151
- @lexicals.concat(lex)
138
+ lexicals.concat(lex)
152
139
  self
153
140
  end
154
141
 
@@ -43,7 +43,7 @@ class Lingo
43
43
  @form, @attr, @src = form || '', attr || '', src
44
44
  end
45
45
 
46
- attr_accessor :form, :attr, :gender, :src, :head
46
+ attr_accessor :form, :attr, :gender, :src, :token, :head
47
47
 
48
48
  def unknown?
49
49
  [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
@@ -53,6 +53,10 @@ class Lingo
53
53
  attr == WA_IDENTIFIED
54
54
  end
55
55
 
56
+ def word_token?
57
+ false
58
+ end
59
+
56
60
  def <=>(other)
57
61
  other.nil? ? 1 : to_a <=> other.to_a
58
62
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -58,7 +58,7 @@ class Lingo
58
58
  r = LINGO.talk(q) unless q.empty?
59
59
  return r unless r && SRC_SEP
60
60
 
61
- s = Hash.nest { [] }
61
+ s = Hash.array
62
62
 
63
63
  r.each { |t|
64
64
  a, b = t.split(SRC_SEP, 2)
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ require 'nuggets/file/ext'
28
+
29
+ class Lingo
30
+
31
+ module TextUtils
32
+
33
+ STDIN_EXT = %w[STDIN -].freeze
34
+
35
+ STDOUT_EXT = %w[STDOUT -].freeze
36
+
37
+ GZIP_RE = %r{\.gz(?:ip)?\z}i
38
+
39
+ private
40
+
41
+ def stdin?(path)
42
+ STDIN_EXT.include?(path)
43
+ end
44
+
45
+ def stdout?(path)
46
+ STDOUT_EXT.include?(path)
47
+ end
48
+
49
+ def open_stdin
50
+ stdin = set_encoding(lingo.config.stdin)
51
+ @progress ? StringIO.new(stdin.read) : stdin
52
+ end
53
+
54
+ def open_stdout
55
+ set_encoding(lingo.config.stdout)
56
+ end
57
+
58
+ def open_path(path, mode = 'rb')
59
+ path =~ GZIP_RE ? open_gzip(path, mode) : open_file(path, mode)
60
+ end
61
+
62
+ def open_file(path, mode)
63
+ File.open(path, mode, encoding: bom_encoding(mode))
64
+ end
65
+
66
+ def open_gzip(path, mode)
67
+ require_lib('zlib')
68
+
69
+ case mode
70
+ when 'r', 'rb'
71
+ @progress = false
72
+ Zlib::GzipReader.open(path, encoding: @encoding)
73
+ when 'w', 'wb'
74
+ Zlib::GzipWriter.open(path, encoding: @encoding)
75
+ else
76
+ raise ArgumentError, 'invalid access mode %s' % mode
77
+ end
78
+ end
79
+
80
+ def set_ext(path, ext)
81
+ File.set_ext(path.sub(GZIP_RE, ''), ".#{ext}")
82
+ end
83
+
84
+ def set_encoding(io, encoding = @encoding)
85
+ io.set_encoding(encoding)
86
+ io
87
+ end
88
+
89
+ def bom_encoding(mode = 'r', encoding = @encoding)
90
+ (mode.include?('r') || mode.include?('+')) &&
91
+ encoding.name.start_with?('UTF-') ? "BOM|#{encoding}" : encoding
92
+ end
93
+
94
+ end
95
+
96
+ end
@@ -4,7 +4,7 @@ class Lingo
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 8
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  class << self
10
10
 
@@ -98,7 +98,7 @@
98
98
  <tr><th>m (=MUL)</th><td><%= t 'Phrase',
99
99
  'Mehrwortbegriff',
100
100
  'составная лексема' %></td></tr>
101
- <tr><th>k (=KOM)</th><td><%= t 'Compound',
101
+ <tr><th>k (=COM)</th><td><%= t 'Compound',
102
102
  'Kompositum',
103
103
  'сложное слово' %></td></tr>
104
104
  <tr><th>+ </th><td><%= t 'Part of a compound',
@@ -12,11 +12,29 @@ class TestAttendeeDecomposer < AttendeeTestCase
12
12
  wd('Kompositumzerlegung|?'),
13
13
  wd('Kompositumzerlegung|?')
14
14
  ], [
15
- wd('Kleinseite|KOM', 'kleinseite|k', 'klein|a+', 'seite|s+'),
16
- wd('Arrafat-Nachfolger|KOM', 'arrafat-nachfolger|k', 'arrafat|x+', 'nachfolger|s+'),
17
- wd('Afganistan-Reisen|KOM', 'afganistan-reisen|k', 'afganistan|x+', 'reisen|v+', 'reise|s+'),
18
- wd('Kompositumzerlegung|KOM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+'),
19
- wd('Kompositumzerlegung|KOM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+')
15
+ wd('Kleinseite|COM', 'kleinseite|k', 'klein|a+', 'seite|s+'),
16
+ wd('Arrafat-Nachfolger|COM', 'arrafat-nachfolger|k', 'arrafat|x+', 'nachfolger|s+'),
17
+ wd('Afganistan-Reisen|COM', 'afganistan-reisen|k', 'afganistan|x+', 'reisen|v+', 'reise|s+'),
18
+ wd('Kompositumzerlegung|COM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+'),
19
+ wd('Kompositumzerlegung|COM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+')
20
+ ])
21
+ end
22
+
23
+ def test_nums
24
+ meet({ 'source' => 'sys-dic' }, [
25
+ wd('123-Reisen|?'),
26
+ wd('abc123-Reisen|?'),
27
+ wd('Reisen-24|?'),
28
+ wd('Reisen-123|?'),
29
+ wd('Reisen-24-Seite|?'),
30
+ wd('Reisen-123-Seite|?')
31
+ ], [
32
+ wd('123-Reisen|COM', '123-reisen|k', '123|x+', 'reisen|v+', 'reise|s+'),
33
+ wd('abc123-Reisen|COM', 'abc123-reisen|k', 'abc123|x+', 'reisen|v+', 'reise|s+'),
34
+ wd('Reisen-24|?'),
35
+ wd('Reisen-123|COM', 'reisen-123|k', 'reisen|v+', 'reise|s+', '123|x+'),
36
+ wd('Reisen-24-Seite|COM', 'reisen-24-seite|k', 'reisen-24|x+', 'seite|s+'),
37
+ wd('Reisen-123-Seite|COM', 'reisen-123-seite|k', 'reisen|v+', 'reise|s+', '123|x+', 'seite|s+')
20
38
  ])
21
39
  end
22
40