lingo 1.8.5 → 1.8.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +25 -0
- data/README +7 -5
- data/Rakefile +58 -55
- data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
- data/{lingo.cfg → config/lingo.cfg} +10 -2
- data/{lir.cfg → config/lir.cfg} +10 -2
- data/{de → dict/de}/lingo-abk.txt +0 -0
- data/{de → dict/de}/lingo-dic.txt +0 -0
- data/{de → dict/de}/lingo-mul.txt +0 -0
- data/{de → dict/de}/lingo-syn.txt +0 -0
- data/{de → dict/de}/test_dic.txt +0 -0
- data/{de → dict/de}/test_gen.txt +0 -0
- data/{de → dict/de}/test_mu2.txt +0 -0
- data/{de → dict/de}/test_mul.txt +0 -0
- data/{de → dict/de}/test_sgw.txt +0 -0
- data/{de → dict/de}/test_syn.txt +0 -0
- data/{de → dict/de}/user-dic.txt +0 -0
- data/{en → dict/en}/lingo-dic.txt +0 -0
- data/{en → dict/en}/lingo-irr.txt +0 -0
- data/{en → dict/en}/lingo-mul.txt +0 -0
- data/{en → dict/en}/lingo-syn.txt +0 -0
- data/{en → dict/en}/lingo-wdn.txt +0 -0
- data/{en → dict/en}/user-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-mul.txt +0 -0
- data/{ru → dict/ru}/lingo-syn.txt +0 -0
- data/{ru → dict/ru}/user-dic.txt +0 -0
- data/{de.lang → lang/de.lang} +1 -1
- data/{en.lang → lang/en.lang} +0 -0
- data/{ru.lang → lang/ru.lang} +0 -0
- data/lib/lingo.rb +14 -15
- data/lib/lingo/app.rb +4 -2
- data/lib/lingo/attendee.rb +23 -43
- data/lib/lingo/attendee/abbreviator.rb +5 -5
- data/lib/lingo/attendee/debugger.rb +39 -12
- data/lib/lingo/attendee/decomposer.rb +3 -4
- data/lib/lingo/attendee/dehyphenizer.rb +4 -4
- data/lib/lingo/attendee/formatter.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +3 -4
- data/lib/lingo/attendee/noneword_filter.rb +8 -12
- data/lib/lingo/attendee/object_filter.rb +6 -3
- data/lib/lingo/attendee/sequencer.rb +5 -5
- data/lib/lingo/attendee/stemmer.rb +3 -2
- data/lib/lingo/attendee/synonymer.rb +3 -4
- data/lib/lingo/attendee/text_reader.rb +39 -38
- data/lib/lingo/attendee/text_writer.rb +10 -10
- data/lib/lingo/attendee/tokenizer.rb +63 -33
- data/lib/lingo/attendee/variator.rb +3 -7
- data/lib/lingo/attendee/vector_filter.rb +132 -65
- data/lib/lingo/attendee/word_searcher.rb +5 -3
- data/lib/lingo/buffered_attendee.rb +1 -3
- data/lib/lingo/call.rb +4 -3
- data/lib/lingo/cli.rb +5 -1
- data/lib/lingo/config.rb +11 -5
- data/lib/lingo/ctl.rb +3 -3
- data/lib/lingo/database.rb +3 -1
- data/lib/lingo/database/crypter.rb +1 -3
- data/lib/lingo/database/source.rb +3 -1
- data/lib/lingo/database/source/key_value.rb +3 -1
- data/lib/lingo/database/source/multi_key.rb +3 -1
- data/lib/lingo/database/source/multi_value.rb +3 -1
- data/lib/lingo/database/source/single_word.rb +3 -1
- data/lib/lingo/database/source/word_class.rb +3 -1
- data/lib/lingo/debug.rb +5 -5
- data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
- data/lib/lingo/error.rb +1 -1
- data/lib/lingo/language.rb +1 -9
- data/lib/lingo/language/dictionary.rb +2 -17
- data/lib/lingo/language/grammar.rb +10 -10
- data/lib/lingo/language/lexical.rb +2 -0
- data/lib/lingo/language/lexical_hash.rb +2 -0
- data/lib/lingo/language/token.rb +17 -3
- data/lib/lingo/language/word.rb +13 -5
- data/lib/lingo/language/word_form.rb +5 -3
- data/lib/lingo/progress.rb +2 -2
- data/lib/lingo/srv.rb +1 -1
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web.rb +1 -1
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/test/attendee/ts_abbreviator.rb +4 -2
- data/test/attendee/ts_multi_worder.rb +81 -88
- data/test/attendee/ts_noneword_filter.rb +2 -2
- data/test/attendee/ts_object_filter.rb +2 -2
- data/test/attendee/ts_sequencer.rb +40 -20
- data/test/attendee/ts_stemmer.rb +52 -26
- data/test/attendee/ts_text_reader.rb +75 -56
- data/test/attendee/ts_text_writer.rb +6 -4
- data/test/attendee/ts_tokenizer.rb +304 -193
- data/test/attendee/ts_vector_filter.rb +242 -9
- data/test/ref/artikel.non +3 -0
- data/test/ref/artikel.vec +1 -4
- data/test/ref/artikel.vef +940 -0
- data/test/ref/artikel.ven +0 -3
- data/test/ref/artikel.ver +0 -3
- data/test/ref/artikel.vet +2580 -0
- data/test/ref/lir.non +34 -31
- data/test/ref/lir.seq +14 -15
- data/test/ref/lir.vec +37 -37
- data/test/ref/lir.vef +329 -0
- data/test/ref/lir.ven +329 -0
- data/test/ref/lir.ver +329 -0
- data/test/ref/lir.vet +329 -0
- data/test/test_helper.rb +29 -16
- data/test/ts_language.rb +6 -47
- metadata +74 -87
- data/lingo.rb +0 -29
- data/spec/spec_helper.rb +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4cc870c8c1b49c580841a934b5906ed6ddf75e4
|
4
|
+
data.tar.gz: 1ecb26c708daa4bfa09f4aa76f6d7e17f1a72683
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2f0abed6198a7fcf0ff4f44aa442266f38c44646c7f4e8ef894886c453ce1654edd217c675f12e6b7d828c43ac461abb64d92aef20015249dbdf6f9efc03a3f
|
7
|
+
data.tar.gz: cb0be6e46a16639a384bab3507dc3b2bd4465736d1d7e0189d3930d1252e247fff4421364d860bd2cdd12f26b4f4445192a87998bea017bb1f285c8e0bda7639
|
data/ChangeLog
CHANGED
@@ -2,6 +2,31 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.8.6 [2015-02-09]
|
6
|
+
|
7
|
+
* Lingo::Attendee::VectorFilter learned +pos+ option to print position and
|
8
|
+
byte offset with each word.
|
9
|
+
* Lingo::Attendee::VectorFilter learned +tfidf+ option to sort results based
|
10
|
+
on their tf–idf[https://en.wikipedia.org/wiki/Tf–idf] score; the document
|
11
|
+
frequencies are calculated over the "corpus" of all files processed during
|
12
|
+
a single program invocation.
|
13
|
+
* Lingo::Attendee::VectorFilter learned +tokens+ option to filter on
|
14
|
+
Lingo::Language::Token in addition to Lingo::Language::Word.
|
15
|
+
* Lingo::Attendee::VectorFilter no longer supports +debug+ (as well as
|
16
|
+
+prompt+ and +preamble+); use Lingo::Attendee::DebugFilter instead.
|
17
|
+
* Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
|
18
|
+
is obsolete.
|
19
|
+
* Lingo::Attendee::TextReader passes byte offset to the following attendee.
|
20
|
+
* Lingo::Attendee::Tokenizer records token's byte offset.
|
21
|
+
* Lingo::Attendee::Tokenizer records token's sequence position.
|
22
|
+
* Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
|
23
|
+
specified tags' contents.
|
24
|
+
* Lingo::Attendee subclasses warn when invalid or obsolete options or names
|
25
|
+
are used.
|
26
|
+
* Changed German infix substitution +/en+ to +ch/chen+ in order to prevent
|
27
|
+
overly aggressive identifications.
|
28
|
+
* Internal refactoring and API changes.
|
29
|
+
|
5
30
|
== 1.8.5 [2014-10-02]
|
6
31
|
|
7
32
|
* Dictionary values (projections) are no longer sorted; hence, order of
|
data/README
CHANGED
@@ -35,7 +35,7 @@
|
|
35
35
|
|
36
36
|
== VERSION
|
37
37
|
|
38
|
-
This documentation refers to Lingo version 1.8.5
|
38
|
+
This documentation refers to Lingo version 1.8.6
|
39
39
|
|
40
40
|
|
41
41
|
== DESCRIPTION
|
@@ -58,7 +58,7 @@ is a minimal configuration example to analyse this README file:
|
|
58
58
|
meeting:
|
59
59
|
attendees:
|
60
60
|
- text_reader: { files: 'README' }
|
61
|
-
- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>:
|
61
|
+
- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
|
62
62
|
|
63
63
|
Lingo is told to invite two attendees and wants them to talk to each other,
|
64
64
|
hence the name Lingo (= the technical language).
|
@@ -187,7 +187,7 @@ of context to external files.
|
|
187
187
|
_Example_:
|
188
188
|
|
189
189
|
# keep line endings
|
190
|
-
- text_reader: { files: $(files), chomp: false }
|
190
|
+
- text_reader: { files: $(files) }
|
191
191
|
# keep whitespace
|
192
192
|
- tokenizer: { space: true }
|
193
193
|
# do processing...
|
@@ -545,7 +545,7 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
545
545
|
=== Background and Theory
|
546
546
|
|
547
547
|
* Gödert, W.; Lepsky, K.; Nagelschmidt, M.: <em>{Informationserschließung und Automatisches Indexieren: ein Lehr- und Arbeitsbuch}[http://dx.doi.org/10.1007/978-3-642-23513-9]</em>. (German) Berlin etc.: Springer, 2012.
|
548
|
-
* Lepsky, K.; Vorhauer, J.: <em>{Lingo
|
548
|
+
* Lepsky, K.; Vorhauer, J.: <em>{Lingo – ein open source System für die automatische Indexierung deutschsprachiger Dokumente}[http://dx.doi.org/10.1515/ABITECH.2006.26.1.18]</em>. (German) In: ABI Technik 26 (1), 2006. pp 18-29.
|
549
549
|
* Nohr, H.: <em>{Grundlagen der automatischen Indexierung: ein Lehrbuch}[http://logos-verlag.de/cgi-bin/buch/isbn/0121]</em>. (German) Berlin: Logos, 2005.
|
550
550
|
* Hausser, R.: <em>{Grundlagen der Computerlinguistik. Mensch-Maschine-Kommunikation in natürlicher Sprache}[http://zbmath.org/?q=an:0956.68141]</em>. (German) Berlin etc.: Springer, 2000.
|
551
551
|
* Allen, J.: <em>{Natural language understanding}[http://zbmath.org/?q=an:0851.68106]</em>. (English) Redwood City, CA: Benjamin/Cummings, 1995.
|
@@ -559,6 +559,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
559
559
|
* Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
|
560
560
|
* Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
|
561
561
|
* Gödert, W.: <em>{Detecting multiword phrases in mathematical text corpora}[http://arxiv.org/abs/1210.0852]</em>. (English) arXiv:1210.0852 [cs.CL], 2012.
|
562
|
+
* Jersek, T.: <em>{Automatische DDC-Klassifizierung mit Lingo: Vorgehensweise und Ergebnisse}[http://www.citeulike.org/user/klaus-lepsky/article/12476139]</em>. (German) Köln: Fachhochschule Köln, 2012.
|
563
|
+
* Glaesener, L.: <em>{Automatisches Indexieren einer informationswissenschaftlichen Datenbank mit Mehrwortgruppen}[http://www.citeulike.org/user/klaus-lepsky/article/12476133]</em>. (German) Köln: Fachhochschule Köln, 2012.
|
562
564
|
* Schiffer, R.: <em>{Automatisches Indexieren technischer Kongressschriften}[http://ixtrieve.fh-koeln.de/lehre/schiffer-2007.pdf]</em>. (German) Köln: Fachhochschule Köln, 2007.
|
563
565
|
|
564
566
|
|
@@ -582,7 +584,7 @@ Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
|
|
582
584
|
== LICENSE AND COPYRIGHT
|
583
585
|
|
584
586
|
Copyright (C) 2005-2007 John Vorhauer
|
585
|
-
Copyright (C) 2007-
|
587
|
+
Copyright (C) 2007-2015 John Vorhauer, Jens Wille
|
586
588
|
|
587
589
|
Lingo is free software: you can redistribute it and/or modify it under the
|
588
590
|
terms of the GNU Affero General Public License as published by the Free
|
data/Rakefile
CHANGED
@@ -1,33 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
require 'rake/clean'
|
6
|
-
require 'nuggets/ruby'
|
7
|
-
require File.join(__DIR__, %w[lib lingo version])
|
8
|
-
|
9
|
-
PACKAGE_NAME = 'lingo'
|
10
|
-
PACKAGE_PATH = File.join(__DIR__, 'pkg', "#{PACKAGE_NAME}-#{Lingo::VERSION}")
|
11
|
-
|
12
|
-
if RUBY_PLATFORM =~ /msdos|mswin|djgpp|mingw|windows/i
|
13
|
-
ZIP_COMMANDS = ['zip', '7z a'] # for hen's gem task
|
14
|
-
end
|
15
|
-
|
16
|
-
task default: :spec
|
17
|
-
task package: [:checkdoc, 'test:all', :clean]
|
3
|
+
require_relative 'lib/lingo/version'
|
18
4
|
|
19
5
|
begin
|
20
6
|
require 'hen'
|
21
7
|
|
22
8
|
Hen.lay! {{
|
23
9
|
gem: {
|
24
|
-
name: PACKAGE_NAME,
|
10
|
+
name: 'lingo',
|
25
11
|
version: Lingo::VERSION,
|
26
12
|
summary: 'The full-featured automatic indexing system',
|
27
13
|
authors: ['John Vorhauer', 'Jens Wille'],
|
28
14
|
email: ['lingo@vorhauer.de', 'jens.wille@gmail.com'],
|
29
15
|
license: 'AGPL-3.0',
|
30
16
|
homepage: 'http://lex-lingo.de',
|
17
|
+
|
31
18
|
description: <<-EOT,
|
32
19
|
Lingo is an open source indexing system for research and teachings.
|
33
20
|
The main functions of Lingo are:
|
@@ -39,23 +26,32 @@ The main functions of Lingo are:
|
|
39
26
|
* generic identification of phrases/word sequences based on patterns
|
40
27
|
of word classes
|
41
28
|
EOT
|
29
|
+
|
42
30
|
extra_files: FileList[
|
43
|
-
'lingo
|
44
|
-
'
|
45
|
-
'txt
|
31
|
+
'lib/lingo/{srv,web}/**/{,.}*',
|
32
|
+
'config/*.cfg',
|
33
|
+
'dict/*/*.txt',
|
34
|
+
'lang/*.lang',
|
35
|
+
'txt/*.txt'
|
46
36
|
].to_a,
|
47
|
-
|
37
|
+
|
48
38
|
dependencies: {
|
49
|
-
'cyclops' =>
|
50
|
-
'nuggets' => '~> 1.
|
39
|
+
'cyclops' => '~> 0.1',
|
40
|
+
'nuggets' => '~> 1.1',
|
51
41
|
'rubyzip' => '~> 1.1',
|
52
42
|
'sinatra-bells' => '~> 0.0',
|
53
43
|
'unicode' => '~> 0.4'
|
54
44
|
},
|
45
|
+
|
55
46
|
development_dependencies: {
|
56
47
|
'diff-lcs' => '~> 1.2',
|
57
48
|
'open4' => '~> 1.3'
|
58
|
-
}
|
49
|
+
},
|
50
|
+
|
51
|
+
required_ruby_version: '>= 1.9.3'
|
52
|
+
},
|
53
|
+
test: {
|
54
|
+
pattern: %w[test/ts_*.rb test/attendee/ts_*.rb]
|
59
55
|
}
|
60
56
|
}}
|
61
57
|
rescue LoadError => err
|
@@ -71,40 +67,22 @@ CLEAN.include(
|
|
71
67
|
|
72
68
|
CLOBBER.include('store')
|
73
69
|
|
74
|
-
task :checkdoc do
|
75
|
-
docfile = File.join(__DIR__, 'doc', 'index.html')
|
76
|
-
abort "Please run `rake doc' first." unless File.exists?(docfile)
|
77
|
-
end
|
78
|
-
|
79
70
|
desc 'Run ALL tests'
|
80
|
-
task 'test:all' => [
|
81
|
-
|
82
|
-
Rake::TestTask.new(:test) do |t|
|
83
|
-
t.test_files = FileList.new('test/ts_*.rb', 'test/attendee/ts_*.rb')
|
84
|
-
end
|
71
|
+
task 'test:all' => %w[test test:txt test:lir]
|
85
72
|
|
86
73
|
desc 'Test against reference file (TXT)'
|
87
|
-
task
|
88
|
-
test_ref('artikel', 'lingo')
|
89
|
-
end
|
74
|
+
task('test:txt') { test_ref('artikel', 'lingo') }
|
90
75
|
|
91
76
|
desc 'Test against reference file (LIR)'
|
92
|
-
task
|
93
|
-
test_ref('lir')
|
94
|
-
end
|
95
|
-
|
96
|
-
desc 'Run all tests on packaged distribution'
|
97
|
-
task 'test:remote' => [:package] do
|
98
|
-
chdir(PACKAGE_PATH) { system('rake test:all') } || abort
|
99
|
-
end
|
77
|
+
task('test:lir') { test_ref('lir') }
|
100
78
|
|
101
|
-
unless (benchmarks = Dir[File.
|
79
|
+
unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
|
102
80
|
desc 'Run all benchmarks'
|
103
81
|
task :bench
|
104
82
|
|
105
83
|
benchmarks.each { |benchmark|
|
106
84
|
bench = File.basename(benchmark, '_bench.rb')
|
107
|
-
task :
|
85
|
+
task bench: benchtask = "bench:#{bench}"
|
108
86
|
|
109
87
|
desc "Run #{bench} benchmark"
|
110
88
|
task(benchtask) { system(File.ruby, benchmark) }
|
@@ -113,18 +91,43 @@ end
|
|
113
91
|
|
114
92
|
def test_ref(name, cfg = name)
|
115
93
|
require 'diff/lcs'
|
116
|
-
require 'diff/lcs/
|
94
|
+
require 'diff/lcs/hunk'
|
95
|
+
require 'nuggets/ruby'
|
96
|
+
|
97
|
+
jruby = RUBY_ENGINE == 'jruby'
|
98
|
+
jruby_lir = jruby && name == 'lir'
|
99
|
+
|
100
|
+
cmd = %W[bin/lingo -c #{cfg} txt/#{name}.txt]
|
101
|
+
buf, diff = ["Command failed: #{cmd.join(' ')}"], 0
|
102
|
+
|
103
|
+
Process.ruby(*cmd, I: :lib, &jruby ?
|
104
|
+
lambda { |_, _, o, e| buf << e.read; buf << o.read } :
|
105
|
+
lambda { |_, _, o, e| IO.interact({}, { o => buf, e => buf }) }
|
106
|
+
).success? or abort buf.join("\n\n")
|
107
|
+
|
108
|
+
Dir["test/ref/#{name}.*"].sort.each { |ref|
|
109
|
+
unless File.exist?(txt = ref.sub(/test\/ref/, 'txt'))
|
110
|
+
puts "?? #{txt}"
|
111
|
+
else
|
112
|
+
puts "## #{txt}"
|
113
|
+
|
114
|
+
data = [ref, txt].map { |file|
|
115
|
+
File.readlines(file).each { |line|
|
116
|
+
line.chomp!
|
117
|
+
line.gsub!(/(\d+\.\d+)\d/, '\1') if jruby_lir
|
118
|
+
}
|
119
|
+
}
|
117
120
|
|
118
|
-
|
119
|
-
diff, msg = 0, ["Command failed: #{cmd.join(' ')}"]
|
121
|
+
diffs, fld = Diff::LCS.diff(*data), 0
|
120
122
|
|
121
|
-
|
122
|
-
|
123
|
-
|
123
|
+
diffs.empty? ? next : diffs.each { |piece|
|
124
|
+
dlh = Diff::LCS::Hunk.new(*data, piece, 0, fld)
|
125
|
+
fld = dlh.file_length_difference
|
126
|
+
puts dlh.diff(:old)
|
127
|
+
}
|
128
|
+
end
|
124
129
|
|
125
|
-
|
126
|
-
puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
|
127
|
-
diff += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
|
130
|
+
diff += 1
|
128
131
|
}
|
129
132
|
|
130
133
|
exit diff + 1 unless diff.zero?
|
@@ -52,7 +52,7 @@ meeting:
|
|
52
52
|
########################################
|
53
53
|
# Datenstrom anzeigen
|
54
54
|
#
|
55
|
-
# - debugger: { eval: 'true', ceval: '
|
55
|
+
# - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
|
56
56
|
|
57
57
|
|
58
58
|
########################################
|
@@ -60,7 +60,7 @@ meeting:
|
|
60
60
|
#
|
61
61
|
|
62
62
|
# Erstelle Datei mit Endung .log für Datenstrom
|
63
|
-
-
|
63
|
+
- debug_filter: { in: syn, prompt: 'lex:) ' }
|
64
64
|
- text_writer: { ext: log, sep: "\n" }
|
65
65
|
|
66
66
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
@@ -83,6 +83,14 @@ meeting:
|
|
83
83
|
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
|
84
84
|
- text_writer: { ext: ver, sep: "\n" }
|
85
85
|
|
86
|
+
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
87
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
88
|
+
- text_writer: { ext: vef, sep: "\n" }
|
89
|
+
|
90
|
+
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
91
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
|
92
|
+
- text_writer: { ext: vet, sep: "\n" }
|
93
|
+
|
86
94
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
87
95
|
- vector_filter: { in: syn, lexicals: m }
|
88
96
|
- text_writer: { ext: mul, sep: "\n" }
|
data/{lir.cfg → config/lir.cfg}
RENAMED
@@ -57,7 +57,7 @@ meeting:
|
|
57
57
|
########################################
|
58
58
|
# Datenstrom anzeigen
|
59
59
|
#
|
60
|
-
# - debugger: { eval: 'true', ceval: '
|
60
|
+
# - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
|
61
61
|
|
62
62
|
|
63
63
|
########################################
|
@@ -65,7 +65,7 @@ meeting:
|
|
65
65
|
#
|
66
66
|
|
67
67
|
# Erstelle Datei mit Endung .log für Datenstrom
|
68
|
-
-
|
68
|
+
- debug_filter: { in: syn, prompt: 'lex:) ' }
|
69
69
|
- text_writer: { ext: log, sep: "\n" }
|
70
70
|
|
71
71
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
@@ -88,6 +88,14 @@ meeting:
|
|
88
88
|
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
|
89
89
|
- text_writer: { ext: ver, sep: '|' }
|
90
90
|
|
91
|
+
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
92
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
93
|
+
- text_writer: { ext: vef, sep: '|' }
|
94
|
+
|
95
|
+
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
96
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
|
97
|
+
- text_writer: { ext: vet, sep: '|' }
|
98
|
+
|
91
99
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
92
100
|
- vector_filter: { in: syn, lexicals: m }
|
93
101
|
- text_writer: { ext: mul, sep: '|' }
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{de → dict/de}/test_dic.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_gen.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_mu2.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_mul.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_sgw.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_syn.txt
RENAMED
File without changes
|
data/{de → dict/de}/user-dic.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{en → dict/en}/user-dic.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{ru → dict/ru}/user-dic.txt
RENAMED
File without changes
|
data/{de.lang → lang/de.lang}
RENAMED
@@ -79,7 +79,7 @@ language:
|
|
79
79
|
- [a, 'este ste ster sten stes ester estes esten e em en er ere eren erer eres es erem']
|
80
80
|
- [v, 'e/en en/en est/en et/en st/en t/en te/en ten/en eten/en ete/en etest/en s']
|
81
81
|
- [e, 's']
|
82
|
-
- [f, 's n e en es er ch/che /en']
|
82
|
+
- [f, 's n e en es er ch/che ch/chen']
|
83
83
|
|
84
84
|
inflect:
|
85
85
|
a: # adjectives
|
data/{en.lang → lang/en.lang}
RENAMED
File without changes
|
data/{ru.lang → lang/ru.lang}
RENAMED
File without changes
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -29,6 +29,8 @@ require 'stringio'
|
|
29
29
|
require 'pathname'
|
30
30
|
require 'fileutils'
|
31
31
|
require 'nuggets/file/ext'
|
32
|
+
require 'nuggets/hash/nest'
|
33
|
+
require 'nuggets/hash/seen'
|
32
34
|
require 'nuggets/env/user_home'
|
33
35
|
require 'nuggets/string/camelscore'
|
34
36
|
|
@@ -200,12 +202,10 @@ class Lingo
|
|
200
202
|
end
|
201
203
|
|
202
204
|
def walk(path, options, legacy = true)
|
203
|
-
dirs = [options[:dir].to_s]
|
205
|
+
dirs, seen = [options[:dir].to_s], Hash.seen
|
204
206
|
dirs << '' if legacy
|
205
207
|
dirs.uniq!
|
206
208
|
|
207
|
-
seen = Hash.new { |h, k| h[k] = true; false }
|
208
|
-
|
209
209
|
path.each { |d|
|
210
210
|
next if seen[d = File.expand_path(d)]
|
211
211
|
dirs.each { |i| yield File.join(d, i) } or break
|
@@ -265,13 +265,13 @@ class Lingo
|
|
265
265
|
end
|
266
266
|
|
267
267
|
def invite(list = config['meeting/attendees'])
|
268
|
-
supplier = Hash.
|
269
|
-
subscriber = Hash.
|
268
|
+
supplier = Hash.nest { [] }
|
269
|
+
subscriber = Hash.nest { [] }
|
270
270
|
|
271
271
|
last_link, auto_link = '', 0
|
272
272
|
|
273
273
|
list.each { |hash|
|
274
|
-
name = hash.keys.first.camelcase
|
274
|
+
name = (name_key = hash.keys.first).camelcase
|
275
275
|
|
276
276
|
cfg = (config["language/attendees/#{name.downcase}"] || {})
|
277
277
|
.merge(hash.values.first).update('name' => name)
|
@@ -284,41 +284,40 @@ class Lingo
|
|
284
284
|
|
285
285
|
@attendees << attendee = Attendee.const_get(name).new(cfg, self)
|
286
286
|
|
287
|
+
unless name == (real = attendee.class.name.split('::').last)
|
288
|
+
config.deprecate(name_key, real.underscore, attendee, :name)
|
289
|
+
end
|
290
|
+
|
287
291
|
{ 'in' => subscriber, 'out' => supplier }.each { |key, target|
|
288
292
|
cfg[key].split(SEP_RE).each { |ch| target[ch] << attendee }
|
289
293
|
}
|
290
294
|
}
|
291
295
|
|
292
296
|
supplier.each { |ch, attendees| attendees.each { |att|
|
293
|
-
att.
|
297
|
+
att.subscribers.concat(subscriber[ch])
|
294
298
|
} }
|
295
299
|
end
|
296
300
|
|
297
301
|
def start
|
298
|
-
@attendees.first.
|
302
|
+
@attendees.first.control(:TALK)
|
299
303
|
end
|
300
304
|
|
301
305
|
def reset(close = true)
|
302
306
|
dictionaries.each { |i| i.close } if close
|
303
307
|
@dictionaries, @attendees = [], []
|
304
|
-
@lexical_hash = Hash.
|
308
|
+
@lexical_hash = Hash.nest { |k| Language::LexicalHash.new(k, self) }
|
305
309
|
end
|
306
310
|
|
307
311
|
def warn(*msg)
|
308
312
|
config.warn(*msg)
|
309
313
|
end
|
310
314
|
|
311
|
-
def deprecate(old, new, obj = self)
|
312
|
-
config.deprecate(old, new, obj)
|
313
|
-
end
|
314
|
-
|
315
315
|
end
|
316
316
|
|
317
317
|
require_relative 'lingo/call'
|
318
318
|
require_relative 'lingo/error'
|
319
319
|
require_relative 'lingo/debug'
|
320
320
|
require_relative 'lingo/config'
|
321
|
-
require_relative 'lingo/agenda_item'
|
322
321
|
require_relative 'lingo/progress'
|
323
322
|
require_relative 'lingo/database'
|
324
323
|
require_relative 'lingo/language'
|