lingo 1.8.7 → 1.9.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +11 -0
- data/README +2 -2
- data/Rakefile +7 -5
- data/lib/lingo.rb +2 -1
- data/lib/lingo/attendee/multi_worder.rb +0 -4
- data/lib/lingo/attendee/object_filter.rb +0 -4
- data/lib/lingo/attendee/text_reader.rb +13 -28
- data/lib/lingo/attendee/text_writer.rb +0 -4
- data/lib/lingo/attendee/vector_filter.rb +0 -4
- data/lib/lingo/attendee/word_searcher.rb +0 -4
- data/lib/lingo/config.rb +3 -8
- data/lib/lingo/filter.rb +48 -0
- data/lib/lingo/filter/pdf.rb +48 -0
- data/lib/lingo/filter/xml.rb +56 -0
- data/lib/lingo/language/grammar.rb +1 -6
- data/lib/lingo/language/word.rb +1 -1
- data/lib/lingo/version.rb +7 -3
- data/test/article.html +63 -0
- data/test/article.pdf +0 -0
- data/test/article.txt +44 -0
- data/test/article.xml +120 -0
- data/test/attendee/ts_text_reader.rb +405 -16
- metadata +58 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d863ede7d1dda418b36230165f9f635a8977d73a
|
4
|
+
data.tar.gz: 9d7caed0a6d27898605b97429b48965617c96960
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62044e35ee507ff911b0aa7b656d6247f7f34a168d23e0da2d7f64dd8010e73b0258cbdda3ea1d61a010c3ca1204c442a02e04866bac8afb92a07d9309ed5c9c
|
7
|
+
data.tar.gz: eb4df8fa604f9f36cbb8094041519ca9198d5a59b624ef70afb937fc9120b8afa2b1d48f95732a2ffb0a253bf2aba5dc16bdb61543f9217926b94b5f80f3e1f8
|
data/ChangeLog
CHANGED
@@ -2,6 +2,17 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.9.0 [unreleased]
|
6
|
+
|
7
|
+
* Removed support for deprecated options and attendee names (+old+ → +new+):
|
8
|
+
* Lingo::Language::Grammar : +compositum+ → +compound+
|
9
|
+
* Lingo::Attendee::TextReader : +lir-record-pattern+ → +records+
|
10
|
+
* Lingo::Config : +multiworder+ → +multi_worder+, +objectfilter+ →
|
11
|
+
+object_filter+, +textreader+ → +text_reader+, +textwriter+ →
|
12
|
+
+text_writer+, +vectorfilter+ → +vector_filter+, +wordsearcher+ →
|
13
|
+
+word_searcher+
|
14
|
+
* Fixed errors with XML input (issue #15 by Thomas Berger).
|
15
|
+
|
5
16
|
== 1.8.7 [2015-08-07]
|
6
17
|
|
7
18
|
* Added Lingo::Attendee::LsiFilter to correlate semantically related terms
|
data/README
CHANGED
@@ -34,7 +34,7 @@
|
|
34
34
|
|
35
35
|
== VERSION
|
36
36
|
|
37
|
-
This documentation refers to Lingo version 1.
|
37
|
+
This documentation refers to Lingo version 1.9.0
|
38
38
|
|
39
39
|
|
40
40
|
== DESCRIPTION
|
@@ -555,7 +555,7 @@ Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
|
|
555
555
|
== LICENSE AND COPYRIGHT
|
556
556
|
|
557
557
|
Copyright (C) 2005-2007 John Vorhauer
|
558
|
-
Copyright (C) 2007-
|
558
|
+
Copyright (C) 2007-2016 John Vorhauer, Jens Wille
|
559
559
|
|
560
560
|
Lingo is free software: you can redistribute it and/or modify it under the
|
561
561
|
terms of the GNU Affero General Public License as published by the Free
|
data/Rakefile
CHANGED
@@ -36,16 +36,18 @@ The main functions of Lingo are:
|
|
36
36
|
].to_a,
|
37
37
|
|
38
38
|
dependencies: {
|
39
|
-
'cyclops' => '~> 0.
|
40
|
-
'nuggets' => '~> 1.
|
39
|
+
'cyclops' => '~> 0.2',
|
40
|
+
'nuggets' => '~> 1.4',
|
41
41
|
'rubyzip' => '~> 1.1',
|
42
|
-
'sinatra-bells' => '~> 0.
|
42
|
+
'sinatra-bells' => '~> 0.3',
|
43
43
|
'unicode' => '~> 0.4'
|
44
44
|
},
|
45
45
|
|
46
46
|
development_dependencies: {
|
47
|
-
'diff-lcs'
|
48
|
-
'
|
47
|
+
'diff-lcs' => '~> 1.2',
|
48
|
+
'nokogiri' => '~> 1.6',
|
49
|
+
'open4' => '~> 1.3',
|
50
|
+
'pdf-reader' => '~> 1.3'
|
49
51
|
},
|
50
52
|
|
51
53
|
required_ruby_version: '>= 1.9.3'
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -318,6 +318,7 @@ require_relative 'lingo/call'
|
|
318
318
|
require_relative 'lingo/error'
|
319
319
|
require_relative 'lingo/debug'
|
320
320
|
require_relative 'lingo/config'
|
321
|
+
require_relative 'lingo/filter'
|
321
322
|
require_relative 'lingo/progress'
|
322
323
|
require_relative 'lingo/database'
|
323
324
|
require_relative 'lingo/language'
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -26,15 +26,12 @@
|
|
26
26
|
|
27
27
|
require 'find'
|
28
28
|
|
29
|
-
%w[filemagic mime/types nokogiri nuggets/file/which pdf-reader].each { |lib|
|
30
|
-
begin
|
31
|
-
require lib
|
32
|
-
rescue LoadError
|
33
|
-
end
|
34
|
-
}
|
35
|
-
|
36
29
|
class Lingo
|
37
30
|
|
31
|
+
require_optional 'filemagic'
|
32
|
+
require_optional 'mime/types'
|
33
|
+
require_optional 'nuggets/file/which'
|
34
|
+
|
38
35
|
class Attendee
|
39
36
|
|
40
37
|
#--
|
@@ -118,11 +115,7 @@ class Lingo
|
|
118
115
|
@filter = get_key('filter', false)
|
119
116
|
@progress = get_key('progress', false)
|
120
117
|
|
121
|
-
|
122
|
-
lingo.config.deprecate('lir-record-pattern', :records, self)
|
123
|
-
end
|
124
|
-
|
125
|
-
@lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
|
118
|
+
@lir = get_re('records', nil, %r{^\[(\d+)\.\]})
|
126
119
|
@cut = get_re('fields', !!@lir, %r{^.+?:\s*})
|
127
120
|
@skip = get_re('skip', nil)
|
128
121
|
end
|
@@ -165,8 +158,8 @@ class Lingo
|
|
165
158
|
def filter(io, path, progress)
|
166
159
|
case @filter == true ? file_type(io, path) : @filter.to_s
|
167
160
|
when 'pdftotext' then filter_pdftotext(io, path, progress)
|
168
|
-
when /html/i then
|
169
|
-
when /xml/i then
|
161
|
+
when /html/i then filter_xml(io, :HTML)
|
162
|
+
when /xml/i then filter_xml(io)
|
170
163
|
when /pdf/i then filter_pdf(io)
|
171
164
|
else io
|
172
165
|
end
|
@@ -185,13 +178,13 @@ class Lingo
|
|
185
178
|
end
|
186
179
|
|
187
180
|
def filter_pdf(io)
|
188
|
-
|
189
|
-
|
181
|
+
cancel_filter(:PDF, 'pdf-reader') unless Object.const_defined?(:PDF)
|
182
|
+
Filter::PDF.new(io, @encoding)
|
190
183
|
end
|
191
184
|
|
192
|
-
def
|
193
|
-
Object.const_defined?(:Nokogiri)
|
194
|
-
|
185
|
+
def filter_xml(io, type = :XML)
|
186
|
+
cancel_filter(type, :nokogiri) unless Object.const_defined?(:Nokogiri)
|
187
|
+
Filter.const_get(type).new(io, @encoding)
|
195
188
|
end
|
196
189
|
|
197
190
|
def file_type(io, path)
|
@@ -239,10 +232,6 @@ class Lingo
|
|
239
232
|
tempfiles.each(&:unlink)
|
240
233
|
end
|
241
234
|
|
242
|
-
def text_enum(collection)
|
243
|
-
Enumerator.new { |y| collection.each { |x| y << x.text } }
|
244
|
-
end
|
245
|
-
|
246
235
|
def get_files
|
247
236
|
args = [get_key('glob', '*.txt'), get_key('recursive', false)]
|
248
237
|
|
@@ -263,10 +252,6 @@ class Lingo
|
|
263
252
|
|
264
253
|
end
|
265
254
|
|
266
|
-
# For backwards compatibility.
|
267
|
-
Textreader = TextReader
|
268
|
-
Text_reader = TextReader
|
269
|
-
|
270
255
|
end
|
271
256
|
|
272
257
|
end
|
data/lib/lingo/config.rb
CHANGED
@@ -42,12 +42,7 @@ class Lingo
|
|
42
42
|
load_config('language', :lang)
|
43
43
|
load_config('config')
|
44
44
|
|
45
|
-
if
|
46
|
-
deprecate(:textreader, :text_reader)
|
47
|
-
end
|
48
|
-
|
49
|
-
if r = get('meeting/attendees', 'text_reader') ||
|
50
|
-
get('meeting/attendees', 'textreader') # DEPRECATE textreader
|
45
|
+
if r = get('meeting/attendees', 'text_reader')
|
51
46
|
f = @cli.files
|
52
47
|
|
53
48
|
if i = r['files']
|
@@ -110,11 +105,11 @@ class Lingo
|
|
110
105
|
@cli.send(:quit, *args)
|
111
106
|
end
|
112
107
|
|
113
|
-
def deprecate(old, new, obj = self, what = :option)
|
108
|
+
def deprecate(old, new, obj = self, what = :option, ver = Version.next_minor)
|
114
109
|
unless @deprecated[[source = obj.class.name.sub(/\ALingo::/, ''), old]]
|
115
110
|
warn(
|
116
111
|
"DEPRECATION WARNING: #{source} #{what} `#{old}' is deprecated " <<
|
117
|
-
"and will be removed in Lingo
|
112
|
+
"and will be removed in Lingo #{ver}. Please use `#{new}' instead."
|
118
113
|
)
|
119
114
|
end
|
120
115
|
end
|
data/lib/lingo/filter.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Filter
|
30
|
+
|
31
|
+
def initialize(io, encoding = ENC)
|
32
|
+
@io, @encoding = io, encoding
|
33
|
+
end
|
34
|
+
|
35
|
+
def each
|
36
|
+
raise NotImplementedError, 'must be implemented by subclass'
|
37
|
+
end
|
38
|
+
|
39
|
+
def close
|
40
|
+
@io.close
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
require_relative 'filter/pdf'
|
48
|
+
require_relative 'filter/xml'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
require_optional 'pdf-reader'
|
30
|
+
|
31
|
+
class Filter
|
32
|
+
|
33
|
+
class PDF < self
|
34
|
+
|
35
|
+
def initialize(*args)
|
36
|
+
super
|
37
|
+
@obj = ::PDF::Reader.new(@io)
|
38
|
+
end
|
39
|
+
|
40
|
+
def each(&block)
|
41
|
+
@obj.pages.each { |x| x.text.each_line(&block) }
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
require_optional 'nokogiri'
|
30
|
+
|
31
|
+
class Filter
|
32
|
+
|
33
|
+
class XML < self
|
34
|
+
|
35
|
+
TYPE = :XML
|
36
|
+
|
37
|
+
def initialize(*args)
|
38
|
+
super
|
39
|
+
@obj = Nokogiri.send(self.class::TYPE, @io, nil, @encoding.to_s)
|
40
|
+
end
|
41
|
+
|
42
|
+
def each(&block)
|
43
|
+
@obj.root.element_children.each { |n| n.content.each_line(&block) }
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
class HTML < XML
|
49
|
+
|
50
|
+
TYPE = :HTML
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
@@ -52,12 +52,7 @@ class Lingo
|
|
52
52
|
def initialize(config, lingo)
|
53
53
|
@dic, @suggestions = Dictionary.new(config, lingo), []
|
54
54
|
|
55
|
-
|
56
|
-
lingo.config.deprecate(:compositum, :compound, self)
|
57
|
-
end
|
58
|
-
|
59
|
-
cfg = lingo.dictionary_config['compound'] ||
|
60
|
-
lingo.dictionary_config['compositum'] # DEPRECATE compositum
|
55
|
+
cfg = lingo.dictionary_config['compound']
|
61
56
|
|
62
57
|
DEFAULTS.each { |k, v| instance_variable_set(
|
63
58
|
"@#{k}", cfg.fetch(k.to_s.tr('_', '-'), v).to_i) }
|
data/lib/lingo/language/word.rb
CHANGED