lingo 1.8.6 → 1.8.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -31,14 +31,16 @@ class Lingo
31
31
 
32
32
  class Database
33
33
 
34
- class Crypter
34
+ module Crypter
35
35
 
36
- def self.digest(key)
36
+ extend self
37
+
38
+ def digest(key)
37
39
  Digest::SHA1.hexdigest(key)
38
40
  end
39
41
 
40
42
  def encode(key, val)
41
- [self.class.digest(key), crypt(:encrypt, key, val)]
43
+ [digest(key), crypt(:encrypt, key, val)]
42
44
  end
43
45
 
44
46
  def decode(key, val)
@@ -49,7 +51,7 @@ class Lingo
49
51
 
50
52
  def crypt(method, key, val)
51
53
  cipher = OpenSSL::Cipher.new('aes-128-cbc').send(method)
52
- cipher.iv = cipher.key = self.class.digest(key)
54
+ cipher.iv = cipher.key = digest(key)
53
55
  cipher.update(val) + cipher.final
54
56
  end
55
57
 
@@ -68,14 +68,15 @@ class Lingo
68
68
 
69
69
  class BackendNotAvailableError < LingoError
70
70
 
71
- attr_reader :mod, :file
71
+ attr_reader :name, :file, :err
72
72
 
73
- def initialize(mod, file)
74
- @mod, @file = mod, file
73
+ def initialize(name, file, err)
74
+ @name, @file, @err = name, file, err
75
75
  end
76
76
 
77
77
  def to_s
78
- "Backend not available `#{mod}' for `#{file}'."
78
+ msg = "Backend not available `#{name}'"
79
+ error(file ? msg << " for `#{file}'" : msg)
79
80
  end
80
81
 
81
82
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,14 +39,22 @@ class Lingo
39
39
 
40
40
  CHAR_PUNCT = '.'
41
41
 
42
- TA_WORD = 'WORD'
43
- TA_PUNCTUATION = 'PUNC'
44
- TA_OTHER = 'OTHR'
42
+ TA_ABBREVIATION = 'ABRV'
43
+ TA_HELP = 'HELP'
44
+ TA_HTML = 'HTML'
45
+ TA_NUMBER = 'NUMS'
46
+ TA_OTHER = 'OTHR'
47
+ TA_PUNCTUATION = 'PUNC'
48
+ TA_SKIP = 'SKIP'
49
+ TA_SPACE = 'SPAC'
50
+ TA_URL = 'URLS'
51
+ TA_WIKI = 'WIKI'
52
+ TA_WORD = 'WORD'
45
53
 
46
54
  WA_UNSET = '-'
47
55
  WA_IDENTIFIED = 'IDF'
48
56
  WA_UNKNOWN = '?'
49
- WA_COMPOUND = 'KOM'
57
+ WA_COMPOUND = 'COM'
50
58
  WA_MULTIWORD = 'MUL'
51
59
  WA_SEQUENCE = 'SEQ'
52
60
  WA_UNKMULPART = 'MU?'
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,6 +39,10 @@ class Lingo
39
39
 
40
40
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
41
41
 
42
+ DEFAULTS = {
43
+ min_word_size: 8, min_avg_part_size: 4, min_part_size: 1, max_parts: 4
44
+ }
45
+
42
46
  def self.open(*args)
43
47
  yield grammar = new(*args)
44
48
  ensure
@@ -55,11 +59,8 @@ class Lingo
55
59
  cfg = lingo.dictionary_config['compound'] ||
56
60
  lingo.dictionary_config['compositum'] # DEPRECATE compositum
57
61
 
58
- {
59
- min_word_size: 8, min_avg_part_size: 4, min_part_size: 1, max_parts: 4
60
- }.each { |k, v|
61
- instance_variable_set("@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i)
62
- }
62
+ DEFAULTS.each { |k, v| instance_variable_set(
63
+ "@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i) }
63
64
 
64
65
  #--
65
66
  # Die Wortklasse eines Kompositum-Wortteils kann separat gekennzeichnet
@@ -87,6 +88,11 @@ class Lingo
87
88
  permute_compound([[], [], ''], str, level, tail)
88
89
  end
89
90
 
91
+ def find_compound_head(str)
92
+ compound = find_compound(str)
93
+ compound.head || compound if compound && !compound.unknown?
94
+ end
95
+
90
96
  private
91
97
 
92
98
  def permute_compound(ret, str, level, tail)
@@ -113,7 +119,7 @@ class Lingo
113
119
 
114
120
  level > 1 ? ret = res : ret.identify(lex.each { |l|
115
121
  l.attr += @append_wc unless l.attr == LA_COMPOUND
116
- }, WA_COMPOUND) if !lex.empty? &&
122
+ }, WA_COMPOUND, seq) if !lex.empty? &&
117
123
  sta.size <= @max_parts &&
118
124
  sta.min >= @min_part_size &&
119
125
  str.length / sta.size >= @min_avg_part_size &&
@@ -56,6 +56,12 @@ class Lingo
56
56
  attr == TA_WORD
57
57
  end
58
58
 
59
+ alias_method :word_token?, :word?
60
+
61
+ def number?
62
+ attr == TA_NUMBER
63
+ end
64
+
59
65
  def position_and_offset
60
66
  "#{position}#{POSITION_SEP}#{offset}"
61
67
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -81,65 +81,52 @@ class Lingo
81
81
  super
82
82
  end
83
83
 
84
- attr_reader :token
85
-
86
- attr_writer :lexicals
87
-
88
- def lexicals(compound_parts = true)
89
- if !compound_parts && attr == WA_COMPOUND
90
- @lexicals.select { |lex| lex.attr == LA_COMPOUND }
91
- else
92
- @lexicals
93
- end
94
- end
84
+ attr_accessor :lexicals, :pattern
95
85
 
96
86
  def add_lexicals(lex)
97
- @lexicals.concat(lex - @lexicals)
87
+ lexicals.concat(lex - lexicals)
98
88
  end
99
89
 
100
90
  def attr?(*attr)
101
91
  !(attrs & attr).empty?
102
92
  end
103
93
 
104
- def attrs(compound_parts = true)
105
- lexicals(compound_parts).map { |i| i.attr }
94
+ def attrs
95
+ lexicals.map(&:attr)
96
+ end
97
+
98
+ def compound_attrs
99
+ attr == WA_COMPOUND ? attrs.grep(LA_COMPOUND) : attrs
106
100
  end
107
101
 
108
- def genders(compound_parts = true)
109
- lexicals(compound_parts).map { |i| i.gender }
102
+ def genders
103
+ lexicals.map(&:gender)
110
104
  end
111
105
 
112
- def identify(lex, wc = nil)
106
+ def identify(lex, wc = nil, seq = nil)
113
107
  return self if lex.empty?
114
108
 
115
- self.lexicals = lex
109
+ self.lexicals, self.pattern = lex, seq
116
110
  self.attr = wc ||= attr?(LA_COMPOUND) ? WA_COMPOUND : WA_IDENTIFIED
117
111
  self.head = self.class.new_compound_head(lex) if wc == WA_COMPOUND
118
112
 
119
113
  self
120
114
  end
121
115
 
122
- def get_class(wc_re)
123
- wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
124
-
125
- lexicals.empty? ? attr =~ wc_re ? [self] : [] :
126
- lexicals.select { |lex| lex.attr =~ wc_re }
127
- end
116
+ def each_lex(wc_re = //)
117
+ return enum_for(:each_lex, wc_re) unless block_given?
128
118
 
129
- def norm
130
- identified? ? lexicals.first.form : form
131
- end
119
+ wc_re = Regexp.new(wc_re) unless wc_re.is_a?(Regexp)
132
120
 
133
- def compo_form
134
- get_class(LA_COMPOUND).first if attr == WA_COMPOUND
135
- end
121
+ lexicals.empty? ? attr =~ wc_re ? yield(self) : nil :
122
+ lexicals.each { |lex| yield lex if lex.attr =~ wc_re }
136
123
 
137
- def full_compound?
138
- attr == WA_COMPOUND && get_class('x+').empty?
124
+ nil
139
125
  end
140
126
 
141
- def multiword_size(wc_re = LA_MULTIWORD)
142
- lex = get_class(wc_re).first and lex.form.count(' ') + 1
127
+ def lex_form(wc_re = //)
128
+ each_lex(wc_re) { |lex|
129
+ break block_given? ? yield(lex.form) : lex.form }
143
130
  end
144
131
 
145
132
  def position_and_offset
@@ -148,7 +135,7 @@ class Lingo
148
135
 
149
136
  def <<(*lex)
150
137
  lex.flatten!
151
- @lexicals.concat(lex)
138
+ lexicals.concat(lex)
152
139
  self
153
140
  end
154
141
 
@@ -43,7 +43,7 @@ class Lingo
43
43
  @form, @attr, @src = form || '', attr || '', src
44
44
  end
45
45
 
46
- attr_accessor :form, :attr, :gender, :src, :head
46
+ attr_accessor :form, :attr, :gender, :src, :token, :head
47
47
 
48
48
  def unknown?
49
49
  [WA_UNKNOWN, WA_UNKMULPART].include?(attr)
@@ -53,6 +53,10 @@ class Lingo
53
53
  attr == WA_IDENTIFIED
54
54
  end
55
55
 
56
+ def word_token?
57
+ false
58
+ end
59
+
56
60
  def <=>(other)
57
61
  other.nil? ? 1 : to_a <=> other.to_a
58
62
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -58,7 +58,7 @@ class Lingo
58
58
  r = LINGO.talk(q) unless q.empty?
59
59
  return r unless r && SRC_SEP
60
60
 
61
- s = Hash.nest { [] }
61
+ s = Hash.array
62
62
 
63
63
  r.each { |t|
64
64
  a, b = t.split(SRC_SEP, 2)
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ require 'nuggets/file/ext'
28
+
29
+ class Lingo
30
+
31
+ module TextUtils
32
+
33
+ STDIN_EXT = %w[STDIN -].freeze
34
+
35
+ STDOUT_EXT = %w[STDOUT -].freeze
36
+
37
+ GZIP_RE = %r{\.gz(?:ip)?\z}i
38
+
39
+ private
40
+
41
+ def stdin?(path)
42
+ STDIN_EXT.include?(path)
43
+ end
44
+
45
+ def stdout?(path)
46
+ STDOUT_EXT.include?(path)
47
+ end
48
+
49
+ def open_stdin
50
+ stdin = set_encoding(lingo.config.stdin)
51
+ @progress ? StringIO.new(stdin.read) : stdin
52
+ end
53
+
54
+ def open_stdout
55
+ set_encoding(lingo.config.stdout)
56
+ end
57
+
58
+ def open_path(path, mode = 'rb')
59
+ path =~ GZIP_RE ? open_gzip(path, mode) : open_file(path, mode)
60
+ end
61
+
62
+ def open_file(path, mode)
63
+ File.open(path, mode, encoding: bom_encoding(mode))
64
+ end
65
+
66
+ def open_gzip(path, mode)
67
+ require_lib('zlib')
68
+
69
+ case mode
70
+ when 'r', 'rb'
71
+ @progress = false
72
+ Zlib::GzipReader.open(path, encoding: @encoding)
73
+ when 'w', 'wb'
74
+ Zlib::GzipWriter.open(path, encoding: @encoding)
75
+ else
76
+ raise ArgumentError, 'invalid access mode %s' % mode
77
+ end
78
+ end
79
+
80
+ def set_ext(path, ext)
81
+ File.set_ext(path.sub(GZIP_RE, ''), ".#{ext}")
82
+ end
83
+
84
+ def set_encoding(io, encoding = @encoding)
85
+ io.set_encoding(encoding)
86
+ io
87
+ end
88
+
89
+ def bom_encoding(mode = 'r', encoding = @encoding)
90
+ (mode.include?('r') || mode.include?('+')) &&
91
+ encoding.name.start_with?('UTF-') ? "BOM|#{encoding}" : encoding
92
+ end
93
+
94
+ end
95
+
96
+ end
@@ -4,7 +4,7 @@ class Lingo
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 8
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  class << self
10
10
 
@@ -98,7 +98,7 @@
98
98
  <tr><th>m (=MUL)</th><td><%= t 'Phrase',
99
99
  'Mehrwortbegriff',
100
100
  'составная лексема' %></td></tr>
101
- <tr><th>k (=KOM)</th><td><%= t 'Compound',
101
+ <tr><th>k (=COM)</th><td><%= t 'Compound',
102
102
  'Kompositum',
103
103
  'сложное слово' %></td></tr>
104
104
  <tr><th>+ </th><td><%= t 'Part of a compound',
@@ -12,11 +12,29 @@ class TestAttendeeDecomposer < AttendeeTestCase
12
12
  wd('Kompositumzerlegung|?'),
13
13
  wd('Kompositumzerlegung|?')
14
14
  ], [
15
- wd('Kleinseite|KOM', 'kleinseite|k', 'klein|a+', 'seite|s+'),
16
- wd('Arrafat-Nachfolger|KOM', 'arrafat-nachfolger|k', 'arrafat|x+', 'nachfolger|s+'),
17
- wd('Afganistan-Reisen|KOM', 'afganistan-reisen|k', 'afganistan|x+', 'reisen|v+', 'reise|s+'),
18
- wd('Kompositumzerlegung|KOM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+'),
19
- wd('Kompositumzerlegung|KOM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+')
15
+ wd('Kleinseite|COM', 'kleinseite|k', 'klein|a+', 'seite|s+'),
16
+ wd('Arrafat-Nachfolger|COM', 'arrafat-nachfolger|k', 'arrafat|x+', 'nachfolger|s+'),
17
+ wd('Afganistan-Reisen|COM', 'afganistan-reisen|k', 'afganistan|x+', 'reisen|v+', 'reise|s+'),
18
+ wd('Kompositumzerlegung|COM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+'),
19
+ wd('Kompositumzerlegung|COM', 'kompositumzerlegung|k', 'kompositum|s+', 'zerlegung|s+')
20
+ ])
21
+ end
22
+
23
+ def test_nums
24
+ meet({ 'source' => 'sys-dic' }, [
25
+ wd('123-Reisen|?'),
26
+ wd('abc123-Reisen|?'),
27
+ wd('Reisen-24|?'),
28
+ wd('Reisen-123|?'),
29
+ wd('Reisen-24-Seite|?'),
30
+ wd('Reisen-123-Seite|?')
31
+ ], [
32
+ wd('123-Reisen|COM', '123-reisen|k', '123|x+', 'reisen|v+', 'reise|s+'),
33
+ wd('abc123-Reisen|COM', 'abc123-reisen|k', 'abc123|x+', 'reisen|v+', 'reise|s+'),
34
+ wd('Reisen-24|?'),
35
+ wd('Reisen-123|COM', 'reisen-123|k', 'reisen|v+', 'reise|s+', '123|x+'),
36
+ wd('Reisen-24-Seite|COM', 'reisen-24-seite|k', 'reisen-24|x+', 'seite|s+'),
37
+ wd('Reisen-123-Seite|COM', 'reisen-123-seite|k', 'reisen|v+', 'reise|s+', '123|x+', 'seite|s+')
20
38
  ])
21
39
  end
22
40