lingo 1.8.5 → 1.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +25 -0
- data/README +7 -5
- data/Rakefile +58 -55
- data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
- data/{lingo.cfg → config/lingo.cfg} +10 -2
- data/{lir.cfg → config/lir.cfg} +10 -2
- data/{de → dict/de}/lingo-abk.txt +0 -0
- data/{de → dict/de}/lingo-dic.txt +0 -0
- data/{de → dict/de}/lingo-mul.txt +0 -0
- data/{de → dict/de}/lingo-syn.txt +0 -0
- data/{de → dict/de}/test_dic.txt +0 -0
- data/{de → dict/de}/test_gen.txt +0 -0
- data/{de → dict/de}/test_mu2.txt +0 -0
- data/{de → dict/de}/test_mul.txt +0 -0
- data/{de → dict/de}/test_sgw.txt +0 -0
- data/{de → dict/de}/test_syn.txt +0 -0
- data/{de → dict/de}/user-dic.txt +0 -0
- data/{en → dict/en}/lingo-dic.txt +0 -0
- data/{en → dict/en}/lingo-irr.txt +0 -0
- data/{en → dict/en}/lingo-mul.txt +0 -0
- data/{en → dict/en}/lingo-syn.txt +0 -0
- data/{en → dict/en}/lingo-wdn.txt +0 -0
- data/{en → dict/en}/user-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-mul.txt +0 -0
- data/{ru → dict/ru}/lingo-syn.txt +0 -0
- data/{ru → dict/ru}/user-dic.txt +0 -0
- data/{de.lang → lang/de.lang} +1 -1
- data/{en.lang → lang/en.lang} +0 -0
- data/{ru.lang → lang/ru.lang} +0 -0
- data/lib/lingo.rb +14 -15
- data/lib/lingo/app.rb +4 -2
- data/lib/lingo/attendee.rb +23 -43
- data/lib/lingo/attendee/abbreviator.rb +5 -5
- data/lib/lingo/attendee/debugger.rb +39 -12
- data/lib/lingo/attendee/decomposer.rb +3 -4
- data/lib/lingo/attendee/dehyphenizer.rb +4 -4
- data/lib/lingo/attendee/formatter.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +3 -4
- data/lib/lingo/attendee/noneword_filter.rb +8 -12
- data/lib/lingo/attendee/object_filter.rb +6 -3
- data/lib/lingo/attendee/sequencer.rb +5 -5
- data/lib/lingo/attendee/stemmer.rb +3 -2
- data/lib/lingo/attendee/synonymer.rb +3 -4
- data/lib/lingo/attendee/text_reader.rb +39 -38
- data/lib/lingo/attendee/text_writer.rb +10 -10
- data/lib/lingo/attendee/tokenizer.rb +63 -33
- data/lib/lingo/attendee/variator.rb +3 -7
- data/lib/lingo/attendee/vector_filter.rb +132 -65
- data/lib/lingo/attendee/word_searcher.rb +5 -3
- data/lib/lingo/buffered_attendee.rb +1 -3
- data/lib/lingo/call.rb +4 -3
- data/lib/lingo/cli.rb +5 -1
- data/lib/lingo/config.rb +11 -5
- data/lib/lingo/ctl.rb +3 -3
- data/lib/lingo/database.rb +3 -1
- data/lib/lingo/database/crypter.rb +1 -3
- data/lib/lingo/database/source.rb +3 -1
- data/lib/lingo/database/source/key_value.rb +3 -1
- data/lib/lingo/database/source/multi_key.rb +3 -1
- data/lib/lingo/database/source/multi_value.rb +3 -1
- data/lib/lingo/database/source/single_word.rb +3 -1
- data/lib/lingo/database/source/word_class.rb +3 -1
- data/lib/lingo/debug.rb +5 -5
- data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
- data/lib/lingo/error.rb +1 -1
- data/lib/lingo/language.rb +1 -9
- data/lib/lingo/language/dictionary.rb +2 -17
- data/lib/lingo/language/grammar.rb +10 -10
- data/lib/lingo/language/lexical.rb +2 -0
- data/lib/lingo/language/lexical_hash.rb +2 -0
- data/lib/lingo/language/token.rb +17 -3
- data/lib/lingo/language/word.rb +13 -5
- data/lib/lingo/language/word_form.rb +5 -3
- data/lib/lingo/progress.rb +2 -2
- data/lib/lingo/srv.rb +1 -1
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web.rb +1 -1
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/test/attendee/ts_abbreviator.rb +4 -2
- data/test/attendee/ts_multi_worder.rb +81 -88
- data/test/attendee/ts_noneword_filter.rb +2 -2
- data/test/attendee/ts_object_filter.rb +2 -2
- data/test/attendee/ts_sequencer.rb +40 -20
- data/test/attendee/ts_stemmer.rb +52 -26
- data/test/attendee/ts_text_reader.rb +75 -56
- data/test/attendee/ts_text_writer.rb +6 -4
- data/test/attendee/ts_tokenizer.rb +304 -193
- data/test/attendee/ts_vector_filter.rb +242 -9
- data/test/ref/artikel.non +3 -0
- data/test/ref/artikel.vec +1 -4
- data/test/ref/artikel.vef +940 -0
- data/test/ref/artikel.ven +0 -3
- data/test/ref/artikel.ver +0 -3
- data/test/ref/artikel.vet +2580 -0
- data/test/ref/lir.non +34 -31
- data/test/ref/lir.seq +14 -15
- data/test/ref/lir.vec +37 -37
- data/test/ref/lir.vef +329 -0
- data/test/ref/lir.ven +329 -0
- data/test/ref/lir.ver +329 -0
- data/test/ref/lir.vet +329 -0
- data/test/test_helper.rb +29 -16
- data/test/ts_language.rb +6 -47
- metadata +74 -87
- data/lingo.rb +0 -29
- data/spec/spec_helper.rb +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4cc870c8c1b49c580841a934b5906ed6ddf75e4
|
4
|
+
data.tar.gz: 1ecb26c708daa4bfa09f4aa76f6d7e17f1a72683
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2f0abed6198a7fcf0ff4f44aa442266f38c44646c7f4e8ef894886c453ce1654edd217c675f12e6b7d828c43ac461abb64d92aef20015249dbdf6f9efc03a3f
|
7
|
+
data.tar.gz: cb0be6e46a16639a384bab3507dc3b2bd4465736d1d7e0189d3930d1252e247fff4421364d860bd2cdd12f26b4f4445192a87998bea017bb1f285c8e0bda7639
|
data/ChangeLog
CHANGED
@@ -2,6 +2,31 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.8.6 [2015-02-09]
|
6
|
+
|
7
|
+
* Lingo::Attendee::VectorFilter learned +pos+ option to print position and
|
8
|
+
byte offset with each word.
|
9
|
+
* Lingo::Attendee::VectorFilter learned +tfidf+ option to sort results based
|
10
|
+
on their tf–idf[https://en.wikipedia.org/wiki/Tf–idf] score; the document
|
11
|
+
frequencies are calculated over the "corpus" of all files processed during
|
12
|
+
a single program invocation.
|
13
|
+
* Lingo::Attendee::VectorFilter learned +tokens+ option to filter on
|
14
|
+
Lingo::Language::Token in addition to Lingo::Language::Word.
|
15
|
+
* Lingo::Attendee::VectorFilter no longer supports +debug+ (as well as
|
16
|
+
+prompt+ and +preamble+); use Lingo::Attendee::DebugFilter instead.
|
17
|
+
* Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
|
18
|
+
is obsolete.
|
19
|
+
* Lingo::Attendee::TextReader passes byte offset to the following attendee.
|
20
|
+
* Lingo::Attendee::Tokenizer records token's byte offset.
|
21
|
+
* Lingo::Attendee::Tokenizer records token's sequence position.
|
22
|
+
* Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
|
23
|
+
specified tags' contents.
|
24
|
+
* Lingo::Attendee subclasses warn when invalid or obsolete options or names
|
25
|
+
are used.
|
26
|
+
* Changed German infix substitution +/en+ to +ch/chen+ in order to prevent
|
27
|
+
overly aggressive identifications.
|
28
|
+
* Internal refactoring and API changes.
|
29
|
+
|
5
30
|
== 1.8.5 [2014-10-02]
|
6
31
|
|
7
32
|
* Dictionary values (projections) are no longer sorted; hence, order of
|
data/README
CHANGED
@@ -35,7 +35,7 @@
|
|
35
35
|
|
36
36
|
== VERSION
|
37
37
|
|
38
|
-
This documentation refers to Lingo version 1.8.5
|
38
|
+
This documentation refers to Lingo version 1.8.6
|
39
39
|
|
40
40
|
|
41
41
|
== DESCRIPTION
|
@@ -58,7 +58,7 @@ is a minimal configuration example to analyse this README file:
|
|
58
58
|
meeting:
|
59
59
|
attendees:
|
60
60
|
- text_reader: { files: 'README' }
|
61
|
-
- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>:
|
61
|
+
- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
|
62
62
|
|
63
63
|
Lingo is told to invite two attendees and wants them to talk to each other,
|
64
64
|
hence the name Lingo (= the technical language).
|
@@ -187,7 +187,7 @@ of context to external files.
|
|
187
187
|
_Example_:
|
188
188
|
|
189
189
|
# keep line endings
|
190
|
-
- text_reader: { files: $(files)
|
190
|
+
- text_reader: { files: $(files) }
|
191
191
|
# keep whitespace
|
192
192
|
- tokenizer: { space: true }
|
193
193
|
# do processing...
|
@@ -545,7 +545,7 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
545
545
|
=== Background and Theory
|
546
546
|
|
547
547
|
* Gödert, W.; Lepsky, K.; Nagelschmidt, M.: <em>{Informationserschließung und Automatisches Indexieren: ein Lehr- und Arbeitsbuch}[http://dx.doi.org/10.1007/978-3-642-23513-9]</em>. (German) Berlin etc.: Springer, 2012.
|
548
|
-
* Lepsky, K.; Vorhauer, J.: <em>{Lingo
|
548
|
+
* Lepsky, K.; Vorhauer, J.: <em>{Lingo – ein open source System für die automatische Indexierung deutschsprachiger Dokumente}[http://dx.doi.org/10.1515/ABITECH.2006.26.1.18]</em>. (German) In: ABI Technik 26 (1), 2006. pp 18-29.
|
549
549
|
* Nohr, H.: <em>{Grundlagen der automatischen Indexierung: ein Lehrbuch}[http://logos-verlag.de/cgi-bin/buch/isbn/0121]</em>. (German) Berlin: Logos, 2005.
|
550
550
|
* Hausser, R.: <em>{Grundlagen der Computerlinguistik. Mensch-Maschine-Kommunikation in natürlicher Sprache}[http://zbmath.org/?q=an:0956.68141]</em>. (German) Berlin etc.: Springer, 2000.
|
551
551
|
* Allen, J.: <em>{Natural language understanding}[http://zbmath.org/?q=an:0851.68106]</em>. (English) Redwood City, CA: Benjamin/Cummings, 1995.
|
@@ -559,6 +559,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
559
559
|
* Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
|
560
560
|
* Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
|
561
561
|
* Gödert, W.: <em>{Detecting multiword phrases in mathematical text corpora}[http://arxiv.org/abs/1210.0852]</em>. (English) arXiv:1210.0852 [cs.CL], 2012.
|
562
|
+
* Jersek, T.: <em>{Automatische DDC-Klassifizierung mit Lingo: Vorgehensweise und Ergebnisse}[http://www.citeulike.org/user/klaus-lepsky/article/12476139]</em>. (German) Köln: Fachhochschule Köln, 2012.
|
563
|
+
* Glaesener, L.: <em>{Automatisches Indexieren einer informationswissenschaftlichen Datenbank mit Mehrwortgruppen}[http://www.citeulike.org/user/klaus-lepsky/article/12476133]</em>. (German) Köln: Fachhochschule Köln, 2012.
|
562
564
|
* Schiffer, R.: <em>{Automatisches Indexieren technischer Kongressschriften}[http://ixtrieve.fh-koeln.de/lehre/schiffer-2007.pdf]</em>. (German) Köln: Fachhochschule Köln, 2007.
|
563
565
|
|
564
566
|
|
@@ -582,7 +584,7 @@ Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
|
|
582
584
|
== LICENSE AND COPYRIGHT
|
583
585
|
|
584
586
|
Copyright (C) 2005-2007 John Vorhauer
|
585
|
-
Copyright (C) 2007-
|
587
|
+
Copyright (C) 2007-2015 John Vorhauer, Jens Wille
|
586
588
|
|
587
589
|
Lingo is free software: you can redistribute it and/or modify it under the
|
588
590
|
terms of the GNU Affero General Public License as published by the Free
|
data/Rakefile
CHANGED
@@ -1,33 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
require 'rake/clean'
|
6
|
-
require 'nuggets/ruby'
|
7
|
-
require File.join(__DIR__, %w[lib lingo version])
|
8
|
-
|
9
|
-
PACKAGE_NAME = 'lingo'
|
10
|
-
PACKAGE_PATH = File.join(__DIR__, 'pkg', "#{PACKAGE_NAME}-#{Lingo::VERSION}")
|
11
|
-
|
12
|
-
if RUBY_PLATFORM =~ /msdos|mswin|djgpp|mingw|windows/i
|
13
|
-
ZIP_COMMANDS = ['zip', '7z a'] # for hen's gem task
|
14
|
-
end
|
15
|
-
|
16
|
-
task default: :spec
|
17
|
-
task package: [:checkdoc, 'test:all', :clean]
|
3
|
+
require_relative 'lib/lingo/version'
|
18
4
|
|
19
5
|
begin
|
20
6
|
require 'hen'
|
21
7
|
|
22
8
|
Hen.lay! {{
|
23
9
|
gem: {
|
24
|
-
name:
|
10
|
+
name: 'lingo',
|
25
11
|
version: Lingo::VERSION,
|
26
12
|
summary: 'The full-featured automatic indexing system',
|
27
13
|
authors: ['John Vorhauer', 'Jens Wille'],
|
28
14
|
email: ['lingo@vorhauer.de', 'jens.wille@gmail.com'],
|
29
15
|
license: 'AGPL-3.0',
|
30
16
|
homepage: 'http://lex-lingo.de',
|
17
|
+
|
31
18
|
description: <<-EOT,
|
32
19
|
Lingo is an open source indexing system for research and teachings.
|
33
20
|
The main functions of Lingo are:
|
@@ -39,23 +26,32 @@ The main functions of Lingo are:
|
|
39
26
|
* generic identification of phrases/word sequences based on patterns
|
40
27
|
of word classes
|
41
28
|
EOT
|
29
|
+
|
42
30
|
extra_files: FileList[
|
43
|
-
'lingo
|
44
|
-
'
|
45
|
-
'txt
|
31
|
+
'lib/lingo/{srv,web}/**/{,.}*',
|
32
|
+
'config/*.cfg',
|
33
|
+
'dict/*/*.txt',
|
34
|
+
'lang/*.lang',
|
35
|
+
'txt/*.txt'
|
46
36
|
].to_a,
|
47
|
-
|
37
|
+
|
48
38
|
dependencies: {
|
49
|
-
'cyclops' =>
|
50
|
-
'nuggets' => '~> 1.
|
39
|
+
'cyclops' => '~> 0.1',
|
40
|
+
'nuggets' => '~> 1.1',
|
51
41
|
'rubyzip' => '~> 1.1',
|
52
42
|
'sinatra-bells' => '~> 0.0',
|
53
43
|
'unicode' => '~> 0.4'
|
54
44
|
},
|
45
|
+
|
55
46
|
development_dependencies: {
|
56
47
|
'diff-lcs' => '~> 1.2',
|
57
48
|
'open4' => '~> 1.3'
|
58
|
-
}
|
49
|
+
},
|
50
|
+
|
51
|
+
required_ruby_version: '>= 1.9.3'
|
52
|
+
},
|
53
|
+
test: {
|
54
|
+
pattern: %w[test/ts_*.rb test/attendee/ts_*.rb]
|
59
55
|
}
|
60
56
|
}}
|
61
57
|
rescue LoadError => err
|
@@ -71,40 +67,22 @@ CLEAN.include(
|
|
71
67
|
|
72
68
|
CLOBBER.include('store')
|
73
69
|
|
74
|
-
task :checkdoc do
|
75
|
-
docfile = File.join(__DIR__, 'doc', 'index.html')
|
76
|
-
abort "Please run `rake doc' first." unless File.exists?(docfile)
|
77
|
-
end
|
78
|
-
|
79
70
|
desc 'Run ALL tests'
|
80
|
-
task 'test:all' => [
|
81
|
-
|
82
|
-
Rake::TestTask.new(:test) do |t|
|
83
|
-
t.test_files = FileList.new('test/ts_*.rb', 'test/attendee/ts_*.rb')
|
84
|
-
end
|
71
|
+
task 'test:all' => %w[test test:txt test:lir]
|
85
72
|
|
86
73
|
desc 'Test against reference file (TXT)'
|
87
|
-
task
|
88
|
-
test_ref('artikel', 'lingo')
|
89
|
-
end
|
74
|
+
task('test:txt') { test_ref('artikel', 'lingo') }
|
90
75
|
|
91
76
|
desc 'Test against reference file (LIR)'
|
92
|
-
task
|
93
|
-
test_ref('lir')
|
94
|
-
end
|
95
|
-
|
96
|
-
desc 'Run all tests on packaged distribution'
|
97
|
-
task 'test:remote' => [:package] do
|
98
|
-
chdir(PACKAGE_PATH) { system('rake test:all') } || abort
|
99
|
-
end
|
77
|
+
task('test:lir') { test_ref('lir') }
|
100
78
|
|
101
|
-
unless (benchmarks = Dir[File.
|
79
|
+
unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
|
102
80
|
desc 'Run all benchmarks'
|
103
81
|
task :bench
|
104
82
|
|
105
83
|
benchmarks.each { |benchmark|
|
106
84
|
bench = File.basename(benchmark, '_bench.rb')
|
107
|
-
task :
|
85
|
+
task bench: benchtask = "bench:#{bench}"
|
108
86
|
|
109
87
|
desc "Run #{bench} benchmark"
|
110
88
|
task(benchtask) { system(File.ruby, benchmark) }
|
@@ -113,18 +91,43 @@ end
|
|
113
91
|
|
114
92
|
def test_ref(name, cfg = name)
|
115
93
|
require 'diff/lcs'
|
116
|
-
require 'diff/lcs/
|
94
|
+
require 'diff/lcs/hunk'
|
95
|
+
require 'nuggets/ruby'
|
96
|
+
|
97
|
+
jruby = RUBY_ENGINE == 'jruby'
|
98
|
+
jruby_lir = jruby && name == 'lir'
|
99
|
+
|
100
|
+
cmd = %W[bin/lingo -c #{cfg} txt/#{name}.txt]
|
101
|
+
buf, diff = ["Command failed: #{cmd.join(' ')}"], 0
|
102
|
+
|
103
|
+
Process.ruby(*cmd, I: :lib, &jruby ?
|
104
|
+
lambda { |_, _, o, e| buf << e.read; buf << o.read } :
|
105
|
+
lambda { |_, _, o, e| IO.interact({}, { o => buf, e => buf }) }
|
106
|
+
).success? or abort buf.join("\n\n")
|
107
|
+
|
108
|
+
Dir["test/ref/#{name}.*"].sort.each { |ref|
|
109
|
+
unless File.exist?(txt = ref.sub(/test\/ref/, 'txt'))
|
110
|
+
puts "?? #{txt}"
|
111
|
+
else
|
112
|
+
puts "## #{txt}"
|
113
|
+
|
114
|
+
data = [ref, txt].map { |file|
|
115
|
+
File.readlines(file).each { |line|
|
116
|
+
line.chomp!
|
117
|
+
line.gsub!(/(\d+\.\d+)\d/, '\1') if jruby_lir
|
118
|
+
}
|
119
|
+
}
|
117
120
|
|
118
|
-
|
119
|
-
diff, msg = 0, ["Command failed: #{cmd.join(' ')}"]
|
121
|
+
diffs, fld = Diff::LCS.diff(*data), 0
|
120
122
|
|
121
|
-
|
122
|
-
|
123
|
-
|
123
|
+
diffs.empty? ? next : diffs.each { |piece|
|
124
|
+
dlh = Diff::LCS::Hunk.new(*data, piece, 0, fld)
|
125
|
+
fld = dlh.file_length_difference
|
126
|
+
puts dlh.diff(:old)
|
127
|
+
}
|
128
|
+
end
|
124
129
|
|
125
|
-
|
126
|
-
puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
|
127
|
-
diff += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
|
130
|
+
diff += 1
|
128
131
|
}
|
129
132
|
|
130
133
|
exit diff + 1 unless diff.zero?
|
@@ -52,7 +52,7 @@ meeting:
|
|
52
52
|
########################################
|
53
53
|
# Datenstrom anzeigen
|
54
54
|
#
|
55
|
-
# - debugger: { eval: 'true', ceval: '
|
55
|
+
# - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
|
56
56
|
|
57
57
|
|
58
58
|
########################################
|
@@ -60,7 +60,7 @@ meeting:
|
|
60
60
|
#
|
61
61
|
|
62
62
|
# Erstelle Datei mit Endung .log für Datenstrom
|
63
|
-
-
|
63
|
+
- debug_filter: { in: syn, prompt: 'lex:) ' }
|
64
64
|
- text_writer: { ext: log, sep: "\n" }
|
65
65
|
|
66
66
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
@@ -83,6 +83,14 @@ meeting:
|
|
83
83
|
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
|
84
84
|
- text_writer: { ext: ver, sep: "\n" }
|
85
85
|
|
86
|
+
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
87
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
88
|
+
- text_writer: { ext: vef, sep: "\n" }
|
89
|
+
|
90
|
+
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
91
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
|
92
|
+
- text_writer: { ext: vet, sep: "\n" }
|
93
|
+
|
86
94
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
87
95
|
- vector_filter: { in: syn, lexicals: m }
|
88
96
|
- text_writer: { ext: mul, sep: "\n" }
|
data/{lir.cfg → config/lir.cfg}
RENAMED
@@ -57,7 +57,7 @@ meeting:
|
|
57
57
|
########################################
|
58
58
|
# Datenstrom anzeigen
|
59
59
|
#
|
60
|
-
# - debugger: { eval: 'true', ceval: '
|
60
|
+
# - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
|
61
61
|
|
62
62
|
|
63
63
|
########################################
|
@@ -65,7 +65,7 @@ meeting:
|
|
65
65
|
#
|
66
66
|
|
67
67
|
# Erstelle Datei mit Endung .log für Datenstrom
|
68
|
-
-
|
68
|
+
- debug_filter: { in: syn, prompt: 'lex:) ' }
|
69
69
|
- text_writer: { ext: log, sep: "\n" }
|
70
70
|
|
71
71
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
@@ -88,6 +88,14 @@ meeting:
|
|
88
88
|
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
|
89
89
|
- text_writer: { ext: ver, sep: '|' }
|
90
90
|
|
91
|
+
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
92
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
93
|
+
- text_writer: { ext: vef, sep: '|' }
|
94
|
+
|
95
|
+
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
96
|
+
- vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
|
97
|
+
- text_writer: { ext: vet, sep: '|' }
|
98
|
+
|
91
99
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
92
100
|
- vector_filter: { in: syn, lexicals: m }
|
93
101
|
- text_writer: { ext: mul, sep: '|' }
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{de → dict/de}/test_dic.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_gen.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_mu2.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_mul.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_sgw.txt
RENAMED
File without changes
|
data/{de → dict/de}/test_syn.txt
RENAMED
File without changes
|
data/{de → dict/de}/user-dic.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{en → dict/en}/user-dic.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{ru → dict/ru}/user-dic.txt
RENAMED
File without changes
|
data/{de.lang → lang/de.lang}
RENAMED
@@ -79,7 +79,7 @@ language:
|
|
79
79
|
- [a, 'este ste ster sten stes ester estes esten e em en er ere eren erer eres es erem']
|
80
80
|
- [v, 'e/en en/en est/en et/en st/en t/en te/en ten/en eten/en ete/en etest/en s']
|
81
81
|
- [e, 's']
|
82
|
-
- [f, 's n e en es er ch/che /
|
82
|
+
- [f, 's n e en es er ch/che ch/chen']
|
83
83
|
|
84
84
|
inflect:
|
85
85
|
a: # adjectives
|
data/{en.lang → lang/en.lang}
RENAMED
File without changes
|
data/{ru.lang → lang/ru.lang}
RENAMED
File without changes
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -29,6 +29,8 @@ require 'stringio'
|
|
29
29
|
require 'pathname'
|
30
30
|
require 'fileutils'
|
31
31
|
require 'nuggets/file/ext'
|
32
|
+
require 'nuggets/hash/nest'
|
33
|
+
require 'nuggets/hash/seen'
|
32
34
|
require 'nuggets/env/user_home'
|
33
35
|
require 'nuggets/string/camelscore'
|
34
36
|
|
@@ -200,12 +202,10 @@ class Lingo
|
|
200
202
|
end
|
201
203
|
|
202
204
|
def walk(path, options, legacy = true)
|
203
|
-
dirs = [options[:dir].to_s]
|
205
|
+
dirs, seen = [options[:dir].to_s], Hash.seen
|
204
206
|
dirs << '' if legacy
|
205
207
|
dirs.uniq!
|
206
208
|
|
207
|
-
seen = Hash.new { |h, k| h[k] = true; false }
|
208
|
-
|
209
209
|
path.each { |d|
|
210
210
|
next if seen[d = File.expand_path(d)]
|
211
211
|
dirs.each { |i| yield File.join(d, i) } or break
|
@@ -265,13 +265,13 @@ class Lingo
|
|
265
265
|
end
|
266
266
|
|
267
267
|
def invite(list = config['meeting/attendees'])
|
268
|
-
supplier = Hash.
|
269
|
-
subscriber = Hash.
|
268
|
+
supplier = Hash.nest { [] }
|
269
|
+
subscriber = Hash.nest { [] }
|
270
270
|
|
271
271
|
last_link, auto_link = '', 0
|
272
272
|
|
273
273
|
list.each { |hash|
|
274
|
-
name = hash.keys.first.camelcase
|
274
|
+
name = (name_key = hash.keys.first).camelcase
|
275
275
|
|
276
276
|
cfg = (config["language/attendees/#{name.downcase}"] || {})
|
277
277
|
.merge(hash.values.first).update('name' => name)
|
@@ -284,41 +284,40 @@ class Lingo
|
|
284
284
|
|
285
285
|
@attendees << attendee = Attendee.const_get(name).new(cfg, self)
|
286
286
|
|
287
|
+
unless name == (real = attendee.class.name.split('::').last)
|
288
|
+
config.deprecate(name_key, real.underscore, attendee, :name)
|
289
|
+
end
|
290
|
+
|
287
291
|
{ 'in' => subscriber, 'out' => supplier }.each { |key, target|
|
288
292
|
cfg[key].split(SEP_RE).each { |ch| target[ch] << attendee }
|
289
293
|
}
|
290
294
|
}
|
291
295
|
|
292
296
|
supplier.each { |ch, attendees| attendees.each { |att|
|
293
|
-
att.
|
297
|
+
att.subscribers.concat(subscriber[ch])
|
294
298
|
} }
|
295
299
|
end
|
296
300
|
|
297
301
|
def start
|
298
|
-
@attendees.first.
|
302
|
+
@attendees.first.control(:TALK)
|
299
303
|
end
|
300
304
|
|
301
305
|
def reset(close = true)
|
302
306
|
dictionaries.each { |i| i.close } if close
|
303
307
|
@dictionaries, @attendees = [], []
|
304
|
-
@lexical_hash = Hash.
|
308
|
+
@lexical_hash = Hash.nest { |k| Language::LexicalHash.new(k, self) }
|
305
309
|
end
|
306
310
|
|
307
311
|
def warn(*msg)
|
308
312
|
config.warn(*msg)
|
309
313
|
end
|
310
314
|
|
311
|
-
def deprecate(old, new, obj = self)
|
312
|
-
config.deprecate(old, new, obj)
|
313
|
-
end
|
314
|
-
|
315
315
|
end
|
316
316
|
|
317
317
|
require_relative 'lingo/call'
|
318
318
|
require_relative 'lingo/error'
|
319
319
|
require_relative 'lingo/debug'
|
320
320
|
require_relative 'lingo/config'
|
321
|
-
require_relative 'lingo/agenda_item'
|
322
321
|
require_relative 'lingo/progress'
|
323
322
|
require_relative 'lingo/database'
|
324
323
|
require_relative 'lingo/language'
|