lingo 1.8.7 → 1.9.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +11 -0
- data/README +2 -2
- data/Rakefile +7 -5
- data/lib/lingo.rb +2 -1
- data/lib/lingo/attendee/multi_worder.rb +0 -4
- data/lib/lingo/attendee/object_filter.rb +0 -4
- data/lib/lingo/attendee/text_reader.rb +13 -28
- data/lib/lingo/attendee/text_writer.rb +0 -4
- data/lib/lingo/attendee/vector_filter.rb +0 -4
- data/lib/lingo/attendee/word_searcher.rb +0 -4
- data/lib/lingo/config.rb +3 -8
- data/lib/lingo/filter.rb +48 -0
- data/lib/lingo/filter/pdf.rb +48 -0
- data/lib/lingo/filter/xml.rb +56 -0
- data/lib/lingo/language/grammar.rb +1 -6
- data/lib/lingo/language/word.rb +1 -1
- data/lib/lingo/version.rb +7 -3
- data/test/article.html +63 -0
- data/test/article.pdf +0 -0
- data/test/article.txt +44 -0
- data/test/article.xml +120 -0
- data/test/attendee/ts_text_reader.rb +405 -16
- metadata +58 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d863ede7d1dda418b36230165f9f635a8977d73a
|
4
|
+
data.tar.gz: 9d7caed0a6d27898605b97429b48965617c96960
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62044e35ee507ff911b0aa7b656d6247f7f34a168d23e0da2d7f64dd8010e73b0258cbdda3ea1d61a010c3ca1204c442a02e04866bac8afb92a07d9309ed5c9c
|
7
|
+
data.tar.gz: eb4df8fa604f9f36cbb8094041519ca9198d5a59b624ef70afb937fc9120b8afa2b1d48f95732a2ffb0a253bf2aba5dc16bdb61543f9217926b94b5f80f3e1f8
|
data/ChangeLog
CHANGED
@@ -2,6 +2,17 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.9.0 [unreleased]
|
6
|
+
|
7
|
+
* Removed support for deprecated options and attendee names (+old+ → +new+):
|
8
|
+
* Lingo::Language::Grammar : +compositum+ → +compound+
|
9
|
+
* Lingo::Attendee::TextReader : +lir-record-pattern+ → +records+
|
10
|
+
* Lingo::Config : +multiworder+ → +multi_worder+, +objectfilter+ →
|
11
|
+
+object_filter+, +textreader+ → +text_reader+, +textwriter+ →
|
12
|
+
+text_writer+, +vectorfilter+ → +vector_filter+, +wordsearcher+ →
|
13
|
+
+word_searcher+
|
14
|
+
* Fixed errors with XML input (issue #15 by Thomas Berger).
|
15
|
+
|
5
16
|
== 1.8.7 [2015-08-07]
|
6
17
|
|
7
18
|
* Added Lingo::Attendee::LsiFilter to correlate semantically related terms
|
data/README
CHANGED
@@ -34,7 +34,7 @@
|
|
34
34
|
|
35
35
|
== VERSION
|
36
36
|
|
37
|
-
This documentation refers to Lingo version 1.8.7
|
37
|
+
This documentation refers to Lingo version 1.9.0
|
38
38
|
|
39
39
|
|
40
40
|
== DESCRIPTION
|
@@ -555,7 +555,7 @@ Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
|
|
555
555
|
== LICENSE AND COPYRIGHT
|
556
556
|
|
557
557
|
Copyright (C) 2005-2007 John Vorhauer
|
558
|
-
Copyright (C) 2007-
|
558
|
+
Copyright (C) 2007-2016 John Vorhauer, Jens Wille
|
559
559
|
|
560
560
|
Lingo is free software: you can redistribute it and/or modify it under the
|
561
561
|
terms of the GNU Affero General Public License as published by the Free
|
data/Rakefile
CHANGED
@@ -36,16 +36,18 @@ The main functions of Lingo are:
|
|
36
36
|
].to_a,
|
37
37
|
|
38
38
|
dependencies: {
|
39
|
-
'cyclops' => '~> 0.
|
40
|
-
'nuggets' => '~> 1.
|
39
|
+
'cyclops' => '~> 0.2',
|
40
|
+
'nuggets' => '~> 1.4',
|
41
41
|
'rubyzip' => '~> 1.1',
|
42
|
-
'sinatra-bells' => '~> 0.
|
42
|
+
'sinatra-bells' => '~> 0.3',
|
43
43
|
'unicode' => '~> 0.4'
|
44
44
|
},
|
45
45
|
|
46
46
|
development_dependencies: {
|
47
|
-
'diff-lcs'
|
48
|
-
'
|
47
|
+
'diff-lcs' => '~> 1.2',
|
48
|
+
'nokogiri' => '~> 1.6',
|
49
|
+
'open4' => '~> 1.3',
|
50
|
+
'pdf-reader' => '~> 1.3'
|
49
51
|
},
|
50
52
|
|
51
53
|
required_ruby_version: '>= 1.9.3'
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -318,6 +318,7 @@ require_relative 'lingo/call'
|
|
318
318
|
require_relative 'lingo/error'
|
319
319
|
require_relative 'lingo/debug'
|
320
320
|
require_relative 'lingo/config'
|
321
|
+
require_relative 'lingo/filter'
|
321
322
|
require_relative 'lingo/progress'
|
322
323
|
require_relative 'lingo/database'
|
323
324
|
require_relative 'lingo/language'
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -26,15 +26,12 @@
|
|
26
26
|
|
27
27
|
require 'find'
|
28
28
|
|
29
|
-
%w[filemagic mime/types nokogiri nuggets/file/which pdf-reader].each { |lib|
|
30
|
-
begin
|
31
|
-
require lib
|
32
|
-
rescue LoadError
|
33
|
-
end
|
34
|
-
}
|
35
|
-
|
36
29
|
class Lingo
|
37
30
|
|
31
|
+
require_optional 'filemagic'
|
32
|
+
require_optional 'mime/types'
|
33
|
+
require_optional 'nuggets/file/which'
|
34
|
+
|
38
35
|
class Attendee
|
39
36
|
|
40
37
|
#--
|
@@ -118,11 +115,7 @@ class Lingo
|
|
118
115
|
@filter = get_key('filter', false)
|
119
116
|
@progress = get_key('progress', false)
|
120
117
|
|
121
|
-
|
122
|
-
lingo.config.deprecate('lir-record-pattern', :records, self)
|
123
|
-
end
|
124
|
-
|
125
|
-
@lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
|
118
|
+
@lir = get_re('records', nil, %r{^\[(\d+)\.\]})
|
126
119
|
@cut = get_re('fields', !!@lir, %r{^.+?:\s*})
|
127
120
|
@skip = get_re('skip', nil)
|
128
121
|
end
|
@@ -165,8 +158,8 @@ class Lingo
|
|
165
158
|
def filter(io, path, progress)
|
166
159
|
case @filter == true ? file_type(io, path) : @filter.to_s
|
167
160
|
when 'pdftotext' then filter_pdftotext(io, path, progress)
|
168
|
-
when /html/i then
|
169
|
-
when /xml/i then
|
161
|
+
when /html/i then filter_xml(io, :HTML)
|
162
|
+
when /xml/i then filter_xml(io)
|
170
163
|
when /pdf/i then filter_pdf(io)
|
171
164
|
else io
|
172
165
|
end
|
@@ -185,13 +178,13 @@ class Lingo
|
|
185
178
|
end
|
186
179
|
|
187
180
|
def filter_pdf(io)
|
188
|
-
|
189
|
-
|
181
|
+
cancel_filter(:PDF, 'pdf-reader') unless Object.const_defined?(:PDF)
|
182
|
+
Filter::PDF.new(io, @encoding)
|
190
183
|
end
|
191
184
|
|
192
|
-
def
|
193
|
-
Object.const_defined?(:Nokogiri)
|
194
|
-
|
185
|
+
def filter_xml(io, type = :XML)
|
186
|
+
cancel_filter(type, :nokogiri) unless Object.const_defined?(:Nokogiri)
|
187
|
+
Filter.const_get(type).new(io, @encoding)
|
195
188
|
end
|
196
189
|
|
197
190
|
def file_type(io, path)
|
@@ -239,10 +232,6 @@ class Lingo
|
|
239
232
|
tempfiles.each(&:unlink)
|
240
233
|
end
|
241
234
|
|
242
|
-
def text_enum(collection)
|
243
|
-
Enumerator.new { |y| collection.each { |x| y << x.text } }
|
244
|
-
end
|
245
|
-
|
246
235
|
def get_files
|
247
236
|
args = [get_key('glob', '*.txt'), get_key('recursive', false)]
|
248
237
|
|
@@ -263,10 +252,6 @@ class Lingo
|
|
263
252
|
|
264
253
|
end
|
265
254
|
|
266
|
-
# For backwards compatibility.
|
267
|
-
Textreader = TextReader
|
268
|
-
Text_reader = TextReader
|
269
|
-
|
270
255
|
end
|
271
256
|
|
272
257
|
end
|
data/lib/lingo/config.rb
CHANGED
@@ -42,12 +42,7 @@ class Lingo
|
|
42
42
|
load_config('language', :lang)
|
43
43
|
load_config('config')
|
44
44
|
|
45
|
-
if
|
46
|
-
deprecate(:textreader, :text_reader)
|
47
|
-
end
|
48
|
-
|
49
|
-
if r = get('meeting/attendees', 'text_reader') ||
|
50
|
-
get('meeting/attendees', 'textreader') # DEPRECATE textreader
|
45
|
+
if r = get('meeting/attendees', 'text_reader')
|
51
46
|
f = @cli.files
|
52
47
|
|
53
48
|
if i = r['files']
|
@@ -110,11 +105,11 @@ class Lingo
|
|
110
105
|
@cli.send(:quit, *args)
|
111
106
|
end
|
112
107
|
|
113
|
-
def deprecate(old, new, obj = self, what = :option)
|
108
|
+
def deprecate(old, new, obj = self, what = :option, ver = Version.next_minor)
|
114
109
|
unless @deprecated[[source = obj.class.name.sub(/\ALingo::/, ''), old]]
|
115
110
|
warn(
|
116
111
|
"DEPRECATION WARNING: #{source} #{what} `#{old}' is deprecated " <<
|
117
|
-
"and will be removed in Lingo
|
112
|
+
"and will be removed in Lingo #{ver}. Please use `#{new}' instead."
|
118
113
|
)
|
119
114
|
end
|
120
115
|
end
|
data/lib/lingo/filter.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Filter
|
30
|
+
|
31
|
+
def initialize(io, encoding = ENC)
|
32
|
+
@io, @encoding = io, encoding
|
33
|
+
end
|
34
|
+
|
35
|
+
def each
|
36
|
+
raise NotImplementedError, 'must be implemented by subclass'
|
37
|
+
end
|
38
|
+
|
39
|
+
def close
|
40
|
+
@io.close
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
require_relative 'filter/pdf'
|
48
|
+
require_relative 'filter/xml'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
require_optional 'pdf-reader'
|
30
|
+
|
31
|
+
class Filter
|
32
|
+
|
33
|
+
class PDF < self
|
34
|
+
|
35
|
+
def initialize(*args)
|
36
|
+
super
|
37
|
+
@obj = ::PDF::Reader.new(@io)
|
38
|
+
end
|
39
|
+
|
40
|
+
def each(&block)
|
41
|
+
@obj.pages.each { |x| x.text.each_line(&block) }
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
require_optional 'nokogiri'
|
30
|
+
|
31
|
+
class Filter
|
32
|
+
|
33
|
+
class XML < self
|
34
|
+
|
35
|
+
TYPE = :XML
|
36
|
+
|
37
|
+
def initialize(*args)
|
38
|
+
super
|
39
|
+
@obj = Nokogiri.send(self.class::TYPE, @io, nil, @encoding.to_s)
|
40
|
+
end
|
41
|
+
|
42
|
+
def each(&block)
|
43
|
+
@obj.root.element_children.each { |n| n.content.each_line(&block) }
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
class HTML < XML
|
49
|
+
|
50
|
+
TYPE = :HTML
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
@@ -52,12 +52,7 @@ class Lingo
|
|
52
52
|
def initialize(config, lingo)
|
53
53
|
@dic, @suggestions = Dictionary.new(config, lingo), []
|
54
54
|
|
55
|
-
|
56
|
-
lingo.config.deprecate(:compositum, :compound, self)
|
57
|
-
end
|
58
|
-
|
59
|
-
cfg = lingo.dictionary_config['compound'] ||
|
60
|
-
lingo.dictionary_config['compositum'] # DEPRECATE compositum
|
55
|
+
cfg = lingo.dictionary_config['compound']
|
61
56
|
|
62
57
|
DEFAULTS.each { |k, v| instance_variable_set(
|
63
58
|
"@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i) }
|
data/lib/lingo/language/word.rb
CHANGED