lingo 1.8.5 → 1.8.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (109)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +25 -0
  3. data/README +7 -5
  4. data/Rakefile +58 -55
  5. data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
  6. data/{lingo.cfg → config/lingo.cfg} +10 -2
  7. data/{lir.cfg → config/lir.cfg} +10 -2
  8. data/{de → dict/de}/lingo-abk.txt +0 -0
  9. data/{de → dict/de}/lingo-dic.txt +0 -0
  10. data/{de → dict/de}/lingo-mul.txt +0 -0
  11. data/{de → dict/de}/lingo-syn.txt +0 -0
  12. data/{de → dict/de}/test_dic.txt +0 -0
  13. data/{de → dict/de}/test_gen.txt +0 -0
  14. data/{de → dict/de}/test_mu2.txt +0 -0
  15. data/{de → dict/de}/test_mul.txt +0 -0
  16. data/{de → dict/de}/test_sgw.txt +0 -0
  17. data/{de → dict/de}/test_syn.txt +0 -0
  18. data/{de → dict/de}/user-dic.txt +0 -0
  19. data/{en → dict/en}/lingo-dic.txt +0 -0
  20. data/{en → dict/en}/lingo-irr.txt +0 -0
  21. data/{en → dict/en}/lingo-mul.txt +0 -0
  22. data/{en → dict/en}/lingo-syn.txt +0 -0
  23. data/{en → dict/en}/lingo-wdn.txt +0 -0
  24. data/{en → dict/en}/user-dic.txt +0 -0
  25. data/{ru → dict/ru}/lingo-dic.txt +0 -0
  26. data/{ru → dict/ru}/lingo-mul.txt +0 -0
  27. data/{ru → dict/ru}/lingo-syn.txt +0 -0
  28. data/{ru → dict/ru}/user-dic.txt +0 -0
  29. data/{de.lang → lang/de.lang} +1 -1
  30. data/{en.lang → lang/en.lang} +0 -0
  31. data/{ru.lang → lang/ru.lang} +0 -0
  32. data/lib/lingo.rb +14 -15
  33. data/lib/lingo/app.rb +4 -2
  34. data/lib/lingo/attendee.rb +23 -43
  35. data/lib/lingo/attendee/abbreviator.rb +5 -5
  36. data/lib/lingo/attendee/debugger.rb +39 -12
  37. data/lib/lingo/attendee/decomposer.rb +3 -4
  38. data/lib/lingo/attendee/dehyphenizer.rb +4 -4
  39. data/lib/lingo/attendee/formatter.rb +1 -3
  40. data/lib/lingo/attendee/multi_worder.rb +3 -4
  41. data/lib/lingo/attendee/noneword_filter.rb +8 -12
  42. data/lib/lingo/attendee/object_filter.rb +6 -3
  43. data/lib/lingo/attendee/sequencer.rb +5 -5
  44. data/lib/lingo/attendee/stemmer.rb +3 -2
  45. data/lib/lingo/attendee/synonymer.rb +3 -4
  46. data/lib/lingo/attendee/text_reader.rb +39 -38
  47. data/lib/lingo/attendee/text_writer.rb +10 -10
  48. data/lib/lingo/attendee/tokenizer.rb +63 -33
  49. data/lib/lingo/attendee/variator.rb +3 -7
  50. data/lib/lingo/attendee/vector_filter.rb +132 -65
  51. data/lib/lingo/attendee/word_searcher.rb +5 -3
  52. data/lib/lingo/buffered_attendee.rb +1 -3
  53. data/lib/lingo/call.rb +4 -3
  54. data/lib/lingo/cli.rb +5 -1
  55. data/lib/lingo/config.rb +11 -5
  56. data/lib/lingo/ctl.rb +3 -3
  57. data/lib/lingo/database.rb +3 -1
  58. data/lib/lingo/database/crypter.rb +1 -3
  59. data/lib/lingo/database/source.rb +3 -1
  60. data/lib/lingo/database/source/key_value.rb +3 -1
  61. data/lib/lingo/database/source/multi_key.rb +3 -1
  62. data/lib/lingo/database/source/multi_value.rb +3 -1
  63. data/lib/lingo/database/source/single_word.rb +3 -1
  64. data/lib/lingo/database/source/word_class.rb +3 -1
  65. data/lib/lingo/debug.rb +5 -5
  66. data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
  67. data/lib/lingo/error.rb +1 -1
  68. data/lib/lingo/language.rb +1 -9
  69. data/lib/lingo/language/dictionary.rb +2 -17
  70. data/lib/lingo/language/grammar.rb +10 -10
  71. data/lib/lingo/language/lexical.rb +2 -0
  72. data/lib/lingo/language/lexical_hash.rb +2 -0
  73. data/lib/lingo/language/token.rb +17 -3
  74. data/lib/lingo/language/word.rb +13 -5
  75. data/lib/lingo/language/word_form.rb +5 -3
  76. data/lib/lingo/progress.rb +2 -2
  77. data/lib/lingo/srv.rb +1 -1
  78. data/lib/lingo/srv/lingosrv.cfg +1 -1
  79. data/lib/lingo/version.rb +1 -1
  80. data/lib/lingo/web.rb +1 -1
  81. data/lib/lingo/web/lingoweb.cfg +1 -1
  82. data/test/attendee/ts_abbreviator.rb +4 -2
  83. data/test/attendee/ts_multi_worder.rb +81 -88
  84. data/test/attendee/ts_noneword_filter.rb +2 -2
  85. data/test/attendee/ts_object_filter.rb +2 -2
  86. data/test/attendee/ts_sequencer.rb +40 -20
  87. data/test/attendee/ts_stemmer.rb +52 -26
  88. data/test/attendee/ts_text_reader.rb +75 -56
  89. data/test/attendee/ts_text_writer.rb +6 -4
  90. data/test/attendee/ts_tokenizer.rb +304 -193
  91. data/test/attendee/ts_vector_filter.rb +242 -9
  92. data/test/ref/artikel.non +3 -0
  93. data/test/ref/artikel.vec +1 -4
  94. data/test/ref/artikel.vef +940 -0
  95. data/test/ref/artikel.ven +0 -3
  96. data/test/ref/artikel.ver +0 -3
  97. data/test/ref/artikel.vet +2580 -0
  98. data/test/ref/lir.non +34 -31
  99. data/test/ref/lir.seq +14 -15
  100. data/test/ref/lir.vec +37 -37
  101. data/test/ref/lir.vef +329 -0
  102. data/test/ref/lir.ven +329 -0
  103. data/test/ref/lir.ver +329 -0
  104. data/test/ref/lir.vet +329 -0
  105. data/test/test_helper.rb +29 -16
  106. data/test/ts_language.rb +6 -47
  107. metadata +74 -87
  108. data/lingo.rb +0 -29
  109. data/spec/spec_helper.rb +0 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 43119723e15432c990503ec61938ee96b15a1e53
4
- data.tar.gz: 6674993f107c30acf34ea43eff24272356663b5a
3
+ metadata.gz: e4cc870c8c1b49c580841a934b5906ed6ddf75e4
4
+ data.tar.gz: 1ecb26c708daa4bfa09f4aa76f6d7e17f1a72683
5
5
  SHA512:
6
- metadata.gz: 95bff3ff2eadfc3267da83a1d3a5398cc3b9f4eff13ae1f6682c24054fd5baf7e20ad03b241a2d5731f8c3636dd87d7f937bdb1a4f022ab883e40c5ad4407ec1
7
- data.tar.gz: b7c7a505f4f61f70a59e8154d204a9ac87fe84fda453e1449fd66ca70e82bda8378070f4b0f7d763f7c8191f19e789c382b65135f8951c9917d7e7ca204d504d
6
+ metadata.gz: f2f0abed6198a7fcf0ff4f44aa442266f38c44646c7f4e8ef894886c453ce1654edd217c675f12e6b7d828c43ac461abb64d92aef20015249dbdf6f9efc03a3f
7
+ data.tar.gz: cb0be6e46a16639a384bab3507dc3b2bd4465736d1d7e0189d3930d1252e247fff4421364d860bd2cdd12f26b4f4445192a87998bea017bb1f285c8e0bda7639
data/ChangeLog CHANGED
@@ -2,6 +2,31 @@
2
2
 
3
3
  = Revision history for Lingo
4
4
 
5
+ == 1.8.6 [2015-02-09]
6
+
7
+ * Lingo::Attendee::VectorFilter learned +pos+ option to print position and
8
+ byte offset with each word.
9
+ * Lingo::Attendee::VectorFilter learned +tfidf+ option to sort results based
10
+ on their tf–idf[https://en.wikipedia.org/wiki/Tf–idf] score; the document
11
+ frequencies are calculated over the "corpus" of all files processed during
12
+ a single program invocation.
13
+ * Lingo::Attendee::VectorFilter learned +tokens+ option to filter on
14
+ Lingo::Language::Token in addition to Lingo::Language::Word.
15
+ * Lingo::Attendee::VectorFilter no longer supports +debug+ (as well as
16
+ +prompt+ and +preamble+); use Lingo::Attendee::DebugFilter instead.
17
+ * Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
18
+ is obsolete.
19
+ * Lingo::Attendee::TextReader passes byte offset to the following attendee.
20
+ * Lingo::Attendee::Tokenizer records token's byte offset.
21
+ * Lingo::Attendee::Tokenizer records token's sequence position.
22
+ * Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
23
+ specified tags' contents.
24
+ * Lingo::Attendee subclasses warn when invalid or obsolete options or names
25
+ are used.
26
+ * Changed German infix substitution +/en+ to +ch/chen+ in order to prevent
27
+ overly aggressive identifications.
28
+ * Internal refactoring and API changes.
29
+
5
30
  == 1.8.5 [2014-10-02]
6
31
 
7
32
  * Dictionary values (projections) are no longer sorted; hence, order of
data/README CHANGED
@@ -35,7 +35,7 @@
35
35
 
36
36
  == VERSION
37
37
 
38
- This documentation refers to Lingo version 1.8.5
38
+ This documentation refers to Lingo version 1.8.6
39
39
 
40
40
 
41
41
  == DESCRIPTION
@@ -58,7 +58,7 @@ is a minimal configuration example to analyse this README file:
58
58
  meeting:
59
59
  attendees:
60
60
  - text_reader: { files: 'README' }
61
- - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
61
+ - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
62
62
 
63
63
  Lingo is told to invite two attendees and wants them to talk to each other,
64
64
  hence the name Lingo (= the technical language).
@@ -187,7 +187,7 @@ of context to external files.
187
187
  _Example_:
188
188
 
189
189
  # keep line endings
190
- - text_reader: { files: $(files), chomp: false }
190
+ - text_reader: { files: $(files) }
191
191
  # keep whitespace
192
192
  - tokenizer: { space: true }
193
193
  # do processing...
@@ -545,7 +545,7 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
545
545
  === Background and Theory
546
546
 
547
547
  * Gödert, W.; Lepsky, K.; Nagelschmidt, M.: <em>{Informationserschließung und Automatisches Indexieren: ein Lehr- und Arbeitsbuch}[http://dx.doi.org/10.1007/978-3-642-23513-9]</em>. (German) Berlin etc.: Springer, 2012.
548
- * Lepsky, K.; Vorhauer, J.: <em>{Lingo: ein open source System für die automatische Indexierung deutschsprachiger Dokumente}[http://dx.doi.org/10.1515/ABITECH.2006.26.1.18]</em>. (German) In: ABI Technik 26 (1), 2006. pp 18-29.
548
+ * Lepsky, K.; Vorhauer, J.: <em>{Lingo – ein open source System für die automatische Indexierung deutschsprachiger Dokumente}[http://dx.doi.org/10.1515/ABITECH.2006.26.1.18]</em>. (German) In: ABI Technik 26 (1), 2006. pp 18-29.
549
549
  * Nohr, H.: <em>{Grundlagen der automatischen Indexierung: ein Lehrbuch}[http://logos-verlag.de/cgi-bin/buch/isbn/0121]</em>. (German) Berlin: Logos, 2005.
550
550
  * Hausser, R.: <em>{Grundlagen der Computerlinguistik. Mensch-Maschine-Kommunikation in natürlicher Sprache}[http://zbmath.org/?q=an:0956.68141]</em>. (German) Berlin etc.: Springer, 2000.
551
551
  * Allen, J.: <em>{Natural language understanding}[http://zbmath.org/?q=an:0851.68106]</em>. (English) Redwood City, CA: Benjamin/Cummings, 1995.
@@ -559,6 +559,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
559
559
  * Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
560
560
  * Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
561
561
  * Gödert, W.: <em>{Detecting multiword phrases in mathematical text corpora}[http://arxiv.org/abs/1210.0852]</em>. (English) arXiv:1210.0852 [cs.CL], 2012.
562
+ * Jersek, T.: <em>{Automatische DDC-Klassifizierung mit Lingo: Vorgehensweise und Ergebnisse}[http://www.citeulike.org/user/klaus-lepsky/article/12476139]</em>. (German) Köln: Fachhochschule Köln, 2012.
563
+ * Glaesener, L.: <em>{Automatisches Indexieren einer informationswissenschaftlichen Datenbank mit Mehrwortgruppen}[http://www.citeulike.org/user/klaus-lepsky/article/12476133]</em>. (German) Köln: Fachhochschule Köln, 2012.
562
564
  * Schiffer, R.: <em>{Automatisches Indexieren technischer Kongressschriften}[http://ixtrieve.fh-koeln.de/lehre/schiffer-2007.pdf]</em>. (German) Köln: Fachhochschule Köln, 2007.
563
565
 
564
566
 
@@ -582,7 +584,7 @@ Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
582
584
  == LICENSE AND COPYRIGHT
583
585
 
584
586
  Copyright (C) 2005-2007 John Vorhauer
585
- Copyright (C) 2007-2014 John Vorhauer, Jens Wille
587
+ Copyright (C) 2007-2015 John Vorhauer, Jens Wille
586
588
 
587
589
  Lingo is free software: you can redistribute it and/or modify it under the
588
590
  terms of the GNU Affero General Public License as published by the Free
data/Rakefile CHANGED
@@ -1,33 +1,20 @@
1
1
  # encoding: utf-8
2
2
 
3
- __DIR__ = File.expand_path('..', __FILE__)
4
-
5
- require 'rake/clean'
6
- require 'nuggets/ruby'
7
- require File.join(__DIR__, %w[lib lingo version])
8
-
9
- PACKAGE_NAME = 'lingo'
10
- PACKAGE_PATH = File.join(__DIR__, 'pkg', "#{PACKAGE_NAME}-#{Lingo::VERSION}")
11
-
12
- if RUBY_PLATFORM =~ /msdos|mswin|djgpp|mingw|windows/i
13
- ZIP_COMMANDS = ['zip', '7z a'] # for hen's gem task
14
- end
15
-
16
- task default: :spec
17
- task package: [:checkdoc, 'test:all', :clean]
3
+ require_relative 'lib/lingo/version'
18
4
 
19
5
  begin
20
6
  require 'hen'
21
7
 
22
8
  Hen.lay! {{
23
9
  gem: {
24
- name: PACKAGE_NAME,
10
+ name: 'lingo',
25
11
  version: Lingo::VERSION,
26
12
  summary: 'The full-featured automatic indexing system',
27
13
  authors: ['John Vorhauer', 'Jens Wille'],
28
14
  email: ['lingo@vorhauer.de', 'jens.wille@gmail.com'],
29
15
  license: 'AGPL-3.0',
30
16
  homepage: 'http://lex-lingo.de',
17
+
31
18
  description: <<-EOT,
32
19
  Lingo is an open source indexing system for research and teachings.
33
20
  The main functions of Lingo are:
@@ -39,23 +26,32 @@ The main functions of Lingo are:
39
26
  * generic identification of phrases/word sequences based on patterns
40
27
  of word classes
41
28
  EOT
29
+
42
30
  extra_files: FileList[
43
- 'lingo.rb', 'lingo{,-call}.cfg', 'lir.cfg',
44
- '{de,en,ru}.lang', '{de,en,ru}/{lingo-*,user-dic,test_*}.txt',
45
- 'txt/{artikel{,-en,-ru},lir}.txt', 'lib/lingo/{srv,web}/**/{,.}*'
31
+ 'lib/lingo/{srv,web}/**/{,.}*',
32
+ 'config/*.cfg',
33
+ 'dict/*/*.txt',
34
+ 'lang/*.lang',
35
+ 'txt/*.txt'
46
36
  ].to_a,
47
- required_ruby_version: '>= 1.9.3',
37
+
48
38
  dependencies: {
49
- 'cyclops' => ['~> 0.0', '>= 0.0.4'],
50
- 'nuggets' => '~> 1.0',
39
+ 'cyclops' => '~> 0.1',
40
+ 'nuggets' => '~> 1.1',
51
41
  'rubyzip' => '~> 1.1',
52
42
  'sinatra-bells' => '~> 0.0',
53
43
  'unicode' => '~> 0.4'
54
44
  },
45
+
55
46
  development_dependencies: {
56
47
  'diff-lcs' => '~> 1.2',
57
48
  'open4' => '~> 1.3'
58
- }
49
+ },
50
+
51
+ required_ruby_version: '>= 1.9.3'
52
+ },
53
+ test: {
54
+ pattern: %w[test/ts_*.rb test/attendee/ts_*.rb]
59
55
  }
60
56
  }}
61
57
  rescue LoadError => err
@@ -71,40 +67,22 @@ CLEAN.include(
71
67
 
72
68
  CLOBBER.include('store')
73
69
 
74
- task :checkdoc do
75
- docfile = File.join(__DIR__, 'doc', 'index.html')
76
- abort "Please run `rake doc' first." unless File.exists?(docfile)
77
- end
78
-
79
70
  desc 'Run ALL tests'
80
- task 'test:all' => [:test, 'test:txt', 'test:lir']
81
-
82
- Rake::TestTask.new(:test) do |t|
83
- t.test_files = FileList.new('test/ts_*.rb', 'test/attendee/ts_*.rb')
84
- end
71
+ task 'test:all' => %w[test test:txt test:lir]
85
72
 
86
73
  desc 'Test against reference file (TXT)'
87
- task 'test:txt' do
88
- test_ref('artikel', 'lingo')
89
- end
74
+ task('test:txt') { test_ref('artikel', 'lingo') }
90
75
 
91
76
  desc 'Test against reference file (LIR)'
92
- task 'test:lir' do
93
- test_ref('lir')
94
- end
95
-
96
- desc 'Run all tests on packaged distribution'
97
- task 'test:remote' => [:package] do
98
- chdir(PACKAGE_PATH) { system('rake test:all') } || abort
99
- end
77
+ task('test:lir') { test_ref('lir') }
100
78
 
101
- unless (benchmarks = Dir[File.join(__DIR__, 'bench', '*_bench.rb')]).empty?
79
+ unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
102
80
  desc 'Run all benchmarks'
103
81
  task :bench
104
82
 
105
83
  benchmarks.each { |benchmark|
106
84
  bench = File.basename(benchmark, '_bench.rb')
107
- task :bench => benchtask = "bench:#{bench}"
85
+ task bench: benchtask = "bench:#{bench}"
108
86
 
109
87
  desc "Run #{bench} benchmark"
110
88
  task(benchtask) { system(File.ruby, benchmark) }
@@ -113,18 +91,43 @@ end
113
91
 
114
92
  def test_ref(name, cfg = name)
115
93
  require 'diff/lcs'
116
- require 'diff/lcs/ldiff'
94
+ require 'diff/lcs/hunk'
95
+ require 'nuggets/ruby'
96
+
97
+ jruby = RUBY_ENGINE == 'jruby'
98
+ jruby_lir = jruby && name == 'lir'
99
+
100
+ cmd = %W[bin/lingo -c #{cfg} txt/#{name}.txt]
101
+ buf, diff = ["Command failed: #{cmd.join(' ')}"], 0
102
+
103
+ Process.ruby(*cmd, I: :lib, &jruby ?
104
+ lambda { |_, _, o, e| buf << e.read; buf << o.read } :
105
+ lambda { |_, _, o, e| IO.interact({}, { o => buf, e => buf }) }
106
+ ).success? or abort buf.join("\n\n")
107
+
108
+ Dir["test/ref/#{name}.*"].sort.each { |ref|
109
+ unless File.exist?(txt = ref.sub(/test\/ref/, 'txt'))
110
+ puts "?? #{txt}"
111
+ else
112
+ puts "## #{txt}"
113
+
114
+ data = [ref, txt].map { |file|
115
+ File.readlines(file).each { |line|
116
+ line.chomp!
117
+ line.gsub!(/(\d+\.\d+)\d/, '\1') if jruby_lir
118
+ }
119
+ }
117
120
 
118
- cmd = %W[lingo.rb -c #{cfg} txt/#{name}.txt]
119
- diff, msg = 0, ["Command failed: #{cmd.join(' ')}"]
121
+ diffs, fld = Diff::LCS.diff(*data), 0
120
122
 
121
- Process.ruby(*cmd) { |_, _, o, e|
122
- IO.interact({}, { o => msg, e => msg })
123
- }.success? or abort msg.join("\n\n")
123
+ diffs.empty? ? next : diffs.each { |piece|
124
+ dlh = Diff::LCS::Hunk.new(*data, piece, 0, fld)
125
+ fld = dlh.file_length_difference
126
+ puts dlh.diff(:old)
127
+ }
128
+ end
124
129
 
125
- Dir["test/ref/#{name}.*"].each { |ref|
126
- puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
127
- diff += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
130
+ diff += 1
128
131
  }
129
132
 
130
133
  exit diff + 1 unless diff.zero?
@@ -8,4 +8,4 @@ meeting:
8
8
  - multi_worder: { source: sys-mul }
9
9
  - sequencer: { }
10
10
  - synonymer: { source: sys-syn, skip: '?,t' }
11
- - debugger: { prompt: '', ceval: 'false' }
11
+ - debugger: { prompt: '', ceval: 'false', preamble: false }
@@ -52,7 +52,7 @@ meeting:
52
52
  ########################################
53
53
  # Datenstrom anzeigen
54
54
  #
55
- # - debugger: { eval: 'true', ceval: 'obj.cmd!="EOL"', prompt: 'lex:) ' }
55
+ # - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
56
56
 
57
57
 
58
58
  ########################################
@@ -60,7 +60,7 @@ meeting:
60
60
  #
61
61
 
62
62
  # Erstelle Datei mit Endung .log für Datenstrom
63
- - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
63
+ - debug_filter: { in: syn, prompt: 'lex:) ' }
64
64
  - text_writer: { ext: log, sep: "\n" }
65
65
 
66
66
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
@@ -83,6 +83,14 @@ meeting:
83
83
  - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
84
84
  - text_writer: { ext: ver, sep: "\n" }
85
85
 
86
+ # Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
87
+ - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
88
+ - text_writer: { ext: vef, sep: "\n" }
89
+
90
+ # Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
91
+ - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
92
+ - text_writer: { ext: vet, sep: "\n" }
93
+
86
94
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
87
95
  - vector_filter: { in: syn, lexicals: m }
88
96
  - text_writer: { ext: mul, sep: "\n" }
@@ -57,7 +57,7 @@ meeting:
57
57
  ########################################
58
58
  # Datenstrom anzeigen
59
59
  #
60
- # - debugger: { eval: 'true', ceval: 'obj.cmd!="EOL"', prompt: 'lex:) ' }
60
+ # - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: 'lex:) ' }
61
61
 
62
62
 
63
63
  ########################################
@@ -65,7 +65,7 @@ meeting:
65
65
  #
66
66
 
67
67
  # Erstelle Datei mit Endung .log für Datenstrom
68
- - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
68
+ - debug_filter: { in: syn, prompt: 'lex:) ' }
69
69
  - text_writer: { ext: log, sep: "\n" }
70
70
 
71
71
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
@@ -88,6 +88,14 @@ meeting:
88
88
  - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
89
89
  - text_writer: { ext: ver, sep: '|' }
90
90
 
91
+ # Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
92
+ - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
93
+ - text_writer: { ext: vef, sep: '|' }
94
+
95
+ # Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
96
+ - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
97
+ - text_writer: { ext: vet, sep: '|' }
98
+
91
99
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
92
100
  - vector_filter: { in: syn, lexicals: m }
93
101
  - text_writer: { ext: mul, sep: '|' }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -79,7 +79,7 @@ language:
79
79
  - [a, 'este ste ster sten stes ester estes esten e em en er ere eren erer eres es erem']
80
80
  - [v, 'e/en en/en est/en et/en st/en t/en te/en ten/en eten/en ete/en etest/en s']
81
81
  - [e, 's']
82
- - [f, 's n e en es er ch/che /en']
82
+ - [f, 's n e en es er ch/che ch/chen']
83
83
 
84
84
  inflect:
85
85
  a: # adjectives
File without changes
File without changes
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -29,6 +29,8 @@ require 'stringio'
29
29
  require 'pathname'
30
30
  require 'fileutils'
31
31
  require 'nuggets/file/ext'
32
+ require 'nuggets/hash/nest'
33
+ require 'nuggets/hash/seen'
32
34
  require 'nuggets/env/user_home'
33
35
  require 'nuggets/string/camelscore'
34
36
 
@@ -200,12 +202,10 @@ class Lingo
200
202
  end
201
203
 
202
204
  def walk(path, options, legacy = true)
203
- dirs = [options[:dir].to_s]
205
+ dirs, seen = [options[:dir].to_s], Hash.seen
204
206
  dirs << '' if legacy
205
207
  dirs.uniq!
206
208
 
207
- seen = Hash.new { |h, k| h[k] = true; false }
208
-
209
209
  path.each { |d|
210
210
  next if seen[d = File.expand_path(d)]
211
211
  dirs.each { |i| yield File.join(d, i) } or break
@@ -265,13 +265,13 @@ class Lingo
265
265
  end
266
266
 
267
267
  def invite(list = config['meeting/attendees'])
268
- supplier = Hash.new { |h, k| h[k] = [] }
269
- subscriber = Hash.new { |h, k| h[k] = [] }
268
+ supplier = Hash.nest { [] }
269
+ subscriber = Hash.nest { [] }
270
270
 
271
271
  last_link, auto_link = '', 0
272
272
 
273
273
  list.each { |hash|
274
- name = hash.keys.first.camelcase
274
+ name = (name_key = hash.keys.first).camelcase
275
275
 
276
276
  cfg = (config["language/attendees/#{name.downcase}"] || {})
277
277
  .merge(hash.values.first).update('name' => name)
@@ -284,41 +284,40 @@ class Lingo
284
284
 
285
285
  @attendees << attendee = Attendee.const_get(name).new(cfg, self)
286
286
 
287
+ unless name == (real = attendee.class.name.split('::').last)
288
+ config.deprecate(name_key, real.underscore, attendee, :name)
289
+ end
290
+
287
291
  { 'in' => subscriber, 'out' => supplier }.each { |key, target|
288
292
  cfg[key].split(SEP_RE).each { |ch| target[ch] << attendee }
289
293
  }
290
294
  }
291
295
 
292
296
  supplier.each { |ch, attendees| attendees.each { |att|
293
- att.add_subscriber(subscriber[ch])
297
+ att.subscribers.concat(subscriber[ch])
294
298
  } }
295
299
  end
296
300
 
297
301
  def start
298
- @attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_TALK))
302
+ @attendees.first.control(:TALK)
299
303
  end
300
304
 
301
305
  def reset(close = true)
302
306
  dictionaries.each { |i| i.close } if close
303
307
  @dictionaries, @attendees = [], []
304
- @lexical_hash = Hash.new { |h, k| h[k] = Language::LexicalHash.new(k, self) }
308
+ @lexical_hash = Hash.nest { |k| Language::LexicalHash.new(k, self) }
305
309
  end
306
310
 
307
311
  def warn(*msg)
308
312
  config.warn(*msg)
309
313
  end
310
314
 
311
- def deprecate(old, new, obj = self)
312
- config.deprecate(old, new, obj)
313
- end
314
-
315
315
  end
316
316
 
317
317
  require_relative 'lingo/call'
318
318
  require_relative 'lingo/error'
319
319
  require_relative 'lingo/debug'
320
320
  require_relative 'lingo/config'
321
- require_relative 'lingo/agenda_item'
322
321
  require_relative 'lingo/progress'
323
322
  require_relative 'lingo/database'
324
323
  require_relative 'lingo/language'