lingo 1.8.6 → 1.8.7

Sign up to get free protection for your applications and access to all of the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e4cc870c8c1b49c580841a934b5906ed6ddf75e4
4
- data.tar.gz: 1ecb26c708daa4bfa09f4aa76f6d7e17f1a72683
3
+ metadata.gz: 1130ec52467314ba95af17e635888f60046c5b42
4
+ data.tar.gz: 6a882ea4f88b1fbcf1a66b1d5fafe8fa05458b89
5
5
  SHA512:
6
- metadata.gz: f2f0abed6198a7fcf0ff4f44aa442266f38c44646c7f4e8ef894886c453ce1654edd217c675f12e6b7d828c43ac461abb64d92aef20015249dbdf6f9efc03a3f
7
- data.tar.gz: cb0be6e46a16639a384bab3507dc3b2bd4465736d1d7e0189d3930d1252e247fff4421364d860bd2cdd12f26b4f4445192a87998bea017bb1f285c8e0bda7639
6
+ metadata.gz: 3e0b384a822c28961c99d411bbdd399d9a49b29fdc40688d3534a9897cef984a7447cdd21b05fda45dda9a0fad25d99f60d524064803edb218ff95bbf9fc4fe6
7
+ data.tar.gz: 5cfa5c7f235113238d0e8568e9948f27a7ec864af63ac696a5efdfceed6eb678724f3ca3df24d3ac296068aceb1bf20d1369f439edf7f670265550ace1ce1cd0
data/ChangeLog CHANGED
@@ -2,6 +2,43 @@
2
2
 
3
3
  = Revision history for Lingo
4
4
 
5
+ == 1.8.7 [2015-08-07]
6
+
7
+ * Added Lingo::Attendee::LsiFilter to correlate semantically related terms
8
+ (LSI[https://en.wikipedia.org/wiki/Latent_semantic_indexing]) over the
9
+ "corpus" of all files processed during a single program invocation; requires
10
+ lsi4r[https://blackwinter.github.com/lsi4r] which in turn requires
11
+ rb-gsl[https://blackwinter.github.com/rb-gsl]. [EXPERIMENTAL: Interface may
12
+ be changed or removed in next release.]
13
+ * Added Lingo::Attendee::HalFilter to correlate semantically related terms
14
+ (HAL[https://en.wikipedia.org/wiki/Hyperspace_Analogue_to_Language]) over
15
+ individual documents; requires hal4r[https://blackwinter.github.com/hal4r]
16
+ which in turn requires rb-gsl[https://blackwinter.github.com/rb-gsl].
17
+ [EXPERIMENTAL: Interface may be changed or removed in next release.]
18
+ * Added Lingo::Attendee::AnalysisFilter and associated +lingoctl+ tooling.
19
+ * Multiword dictionaries can now identify hyphenated variants (e.g.
20
+ <tt>automatic data-processing</tt>); set <tt>hyphenate: true</tt> in the
21
+ dictionary config.
22
+ * Lingo::Attendee::Tokenizer no longer considers hyphens at word edges as part
23
+ of the word. As a consequence, Lingo::Attendee::Dehyphenizer has been
24
+ dropped.
25
+ * Dropped Lingo::Attendee::NonewordFilter; use Lingo::Attendee::VectorFilter
26
+ with option <tt>lexicals: '\?'</tt> instead.
27
+ * Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned
28
+ +encoding+ option to read/write text that is not UTF-8 encoded;
29
+ configuration files and dictionaries still need to be UTF-8, though.
30
+ * Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned to
31
+ read/write Gzip-compressed files (file extension +.gz+ or +.gzip+).
32
+ * Lingo::Attendee::Sequencer learned to recognize +0+ in the pattern to match
33
+ number tokens.
34
+ * Fixed Lingo::Attendee::TextReader to recognize BOM in input files; does not
35
+ apply to input read from +STDIN+.
36
+ * Fixed regression introduced in 1.8.6 where Lingo::Attendee::Debugger would
37
+ no longer work immediately behind Lingo::Attendee::TextReader.
38
+ * Fixed +lingoctl+ copy commands when overwriting existing files.
39
+ * Refactored Lingo::Database::Crypter into a module.
40
+ * JRuby 9000 compatibility.
41
+
5
42
  == 1.8.6 [2015-02-09]
6
43
 
7
44
  * Lingo::Attendee::VectorFilter learned +pos+ option to print position and
@@ -17,8 +54,7 @@
17
54
  * Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
18
55
  is obsolete.
19
56
  * Lingo::Attendee::TextReader passes byte offset to the following attendee.
20
- * Lingo::Attendee::Tokenizer records token's byte offset.
21
- * Lingo::Attendee::Tokenizer records token's sequence position.
57
+ * Lingo::Attendee::Tokenizer records token's position and byte offset.
22
58
  * Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
23
59
  specified tags' contents.
24
60
  * Lingo::Attendee subclasses warn when invalid or obsolete options or names
@@ -184,9 +220,9 @@
184
220
  the word class for multiword entries (defaults to <tt>def-wc</tt>). Use
185
221
  <tt>def-mul-wc: 'm'</tt> in your <tt>.lang</tt> to restore the previous
186
222
  behaviour.
187
- * New Lingo::Attendee::Formatter for configurable output formatting as an
223
+ * Added Lingo::Attendee::Formatter for configurable output formatting as an
188
224
  alternative to Lingo::Attendee::TextWriter.
189
- * New basic input filters to enable indexing of HTML/XML (and PDF) files.
225
+ * Added basic input filters to enable indexing of HTML/XML (and PDF) files.
190
226
  * Updated the system dictionary.
191
227
  * Switched license to Affero GPL.
192
228
 
data/README CHANGED
@@ -15,7 +15,6 @@
15
15
  * {Example}[rdoc-label:label-EXAMPLE]
16
16
  * {Installation and Usage}[rdoc-label:label-INSTALLATION+AND+USAGE]
17
17
  * {Dictionary and configuration file lookup}[rdoc-label:label-Dictionary+and+configuration+file+lookup]
18
- * {Legacy version}[rdoc-label:label-Legacy+version]
19
18
  * {File formats}[rdoc-label:label-FILE+FORMATS]
20
19
  * {Configuration}[rdoc-label:label-Configuration]
21
20
  * {Language definition}[rdoc-label:label-Language+definition]
@@ -35,7 +34,7 @@
35
34
 
36
35
  == VERSION
37
36
 
38
- This documentation refers to Lingo version 1.8.6
37
+ This documentation refers to Lingo version 1.8.7
39
38
 
40
39
 
41
40
  == DESCRIPTION
@@ -58,7 +57,7 @@ is a minimal configuration example to analyse this README file:
58
57
  meeting:
59
58
  attendees:
60
59
  - text_reader: { files: 'README' }
61
- - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
60
+ - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: '<debug>: ' }
62
61
 
63
62
  Lingo is told to invite two attendees and wants them to talk to each other,
64
63
  hence the name Lingo (= the technical language).
@@ -131,8 +130,6 @@ information see each attendee's documentation):
131
130
  Lingo::Attendee::Debugger)
132
131
  +variator+:: Tries to correct spelling errors and the like. (see
133
132
  Lingo::Attendee::Variator)
134
- +dehyphenizer+:: Tries to undo hyphenation. (see
135
- Lingo::Attendee::Dehyphenizer)
136
133
  +multi_worder+:: Identifies phrases (word sequences) based on a multiword
137
134
  dictionary. (see Lingo::Attendee::MultiWorder)
138
135
  +sequencer+:: Identifies phrases (word sequences) based on patterns of
@@ -186,14 +183,14 @@ of context to external files.
186
183
 
187
184
  _Example_:
188
185
 
189
- # keep line endings
190
- - text_reader: { files: $(files) }
186
+ # read files
187
+ - text_reader: { files: $(files) }
191
188
  # keep whitespace
192
- - tokenizer: { space: true }
189
+ - tokenizer: { space: true }
193
190
  # do processing...
194
- - word_searcher: { source: sys-dic, mode: first }
195
- # insert formatted results (e.g. "[[Name::lingo|Lingo]] got these [[Noun::word|words]].")
196
- - formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
191
+ - word_searcher: { source: sys-dic, mode: first }
192
+ # insert formatted results (e.g. "[[Name::lingo|Lingo]] finds [[Noun::word|words]].")
193
+ - formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
197
194
 
198
195
  === Plugins
199
196
 
@@ -267,7 +264,7 @@ truncated for clarity).
267
264
  <Lingo = [(lingo/s), (lingo/e)]>
268
265
  <-|?>
269
266
  <A|?>
270
- <full-featured|KOM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
267
+ <full-featured|COM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
271
268
  <automatic = [(automatic/s), (automatic/a)]>
272
269
  <indexing = [(index/v)]>
273
270
  <system = [(system/s)]>
@@ -349,9 +346,8 @@ the +lingo+ executable to process your text files. See <tt>lingo --help</tt>
349
346
  for available options.
350
347
 
351
348
  Please note that Lingo requires Ruby version 1.9.3 or higher to run
352
- (2.1.3[http://ruby-lang.org/en/downloads/] is the currently recommended
353
- version). If you want to use Lingo on Ruby 1.8, please refer to the
354
- {legacy version}[rdoc-label:label-Legacy+version].
349
+ (2.2.2[http://ruby-lang.org/en/downloads/] is the currently recommended
350
+ version).
355
351
 
356
352
  Since Lingo depends on native extensions, you need to make sure that
357
353
  development files for your Ruby version are installed. On Debian-based
@@ -359,12 +355,8 @@ Linux platforms they are included in the package <tt>ruby-dev</tt>;
359
355
  other distributions may have a similarly named package. On Windows those
360
356
  development files are currently not required.
361
357
 
362
- Prior to version 1.8.0, Lingo expected to be run from its installation
363
- directory. This is no longer necessary. But if you prefer that use case,
364
- you can either download and extract an
365
- {archive file}[http://github.com/lex-lingo/lingo/releases] or unpack the
366
- Gem archive (<tt>gem unpack lingo</tt>); or you can install the legacy
367
- version of Lingo (see below).
358
+ On JRuby, install gdbm[https://rubygems.org/gems/gdbm] for efficient database
359
+ operations: <tt>gem install gdbm</tt>.
368
360
 
369
361
  === Dictionary and configuration file lookup
370
362
 
@@ -395,29 +387,6 @@ typically organized in the following directory structure:
395
387
 
396
388
  But for compatibility reasons these naming conventions are not enforced.
397
389
 
398
- === Legacy version
399
-
400
- As Lingo 1.8 introduced some major disruptions and no longer runs on Ruby 1.8,
401
- there is a maintenance branch for Lingo 1.7.x that will remain compatible with
402
- both Ruby 1.8 and the previous line of Lingo prior to 1.8. This branch may
403
- receive occasional bug fixes and minor feature updates. However, the bulk of
404
- the development efforts will be directed towards Lingo 1.8+.
405
-
406
- To install the legacy version, download and extract the
407
- {ZIP archive}[http://ixtrieve.fh-koeln.de/buch/lingo-1.7.1.zip].
408
- No additional dependencies are required. This version of Lingo works
409
- with both Ruby 1.8 (1.8.5 or higher) and 1.9 (1.9.2 or higher).
410
-
411
- The executable is named +lingo.rb+. It's located at the root of the installation
412
- directory and may only be run from there. See <tt>ruby lingo.rb -h</tt> for
413
- usage instructions.
414
-
415
- Configuration and language definition files are also located at the root of the
416
- installation directory (<tt>*.cfg</tt> and <tt>*.lang</tt>, respectively).
417
- Dictionary source files are found in language-specific subdirectories (+de/+,
418
- +en/+, ...) and are named <tt>*.txt</tt>. The compiled dictionaries are found
419
- beneath these language subdirectories in a directory named <tt>store/</tt>.
420
-
421
390
 
422
391
  == FILE FORMATS
423
392
 
@@ -443,24 +412,24 @@ _Example_:
443
412
 
444
413
  # input is taken from the previous attendee,
445
414
  # output is sent to the named channel "syn"
446
- - synonymer: { skip: '?,t', source: sys-syn, out: syn }
415
+ - synonymer: { skip: '?,t', source: sys-syn, out: syn }
447
416
  
448
417
  # input is taken from the named channel "syn",
449
418
  # output is sent to the next attendee
450
- - vector_filter: { in: syn, lexicals: y, sort: term_abs }
419
+ - vector_filter: { in: syn, lexicals: y, sort: term_abs }
451
420
  
452
421
  # input is taken from the previous attendee,
453
422
  # output is sent to the next attendee
454
- - text_writer: { ext: syn, sep: "\n" }
423
+ - text_writer: { ext: syn, sep: "\n" }
455
424
  
456
425
  # input is taken from the named channel "syn"
457
426
  # (ignoring the output of the previous attendee),
458
427
  # output is sent to the next attendee
459
- - vector_filter: { in: syn, lexicals: m }
428
+ - vector_filter: { in: syn, lexicals: m }
460
429
  
461
430
  # input is taken from the previous attendee,
462
431
  # output is sent to the next attendee
463
- - text_writer: { ext: mul, sep: "\n" }
432
+ - text_writer: { ext: mul, sep: "\n" }
464
433
 
465
434
  === Language definition
466
435
 
@@ -532,8 +501,8 @@ the full test suite.
532
501
  == LINKS
533
502
 
534
503
  Website:: http://lex-lingo.de
535
- Demo:: http://ixtrieve.fh-koeln.de/lingoweb
536
- Documentation:: https://lex-lingo.github.com/lingo
504
+ Demo:: http://lex-lingo.de/lingoweb
505
+ Documentation:: http://lex-lingo.de/doc
537
506
  Source code:: https://github.com/lex-lingo/lingo
538
507
  RubyGem:: https://rubygems.org/gems/lingo
539
508
  Bug tracker:: https://github.com/lex-lingo/lingo/issues
@@ -555,6 +524,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
555
524
 
556
525
  === Research publications
557
526
 
527
+ * Siebenkäs, A.; Markscheffel, B.: <em>{Conception of a workflow for the semi-automatic construction of a thesaurus for the German printing industry}[https://zenodo.org/record/17945]</em>. (English) In: Re:inventing Information Science in the Networked Society. Proceedings of the 14th International Symposium on Information Science (ISI 2015), Zadar, Croatia, 19th-21st May 2015. Eds.: F. Pehar, C. Schlögl, C. Wolff. Glückstadt: Verlag Werner Hülsbusch, 2015. pp 217-229
528
+ * Grün, S.: <em>Bildung von Komposita-Indextermen auf der Basis einer algorithmischen Mehrwortgruppenanalyse mit Lingo</em>. (German) Köln: Fachhochschule Köln, 2015.
558
529
  * Bredack, J.; Lepsky, K.: <em>{Automatische Extraktion von Fachterminologie aus Volltexten}[http://dx.doi.org/10.1515/abitech-2014-0002]</em>. (German) In: ABI Technik 34 (1), 2014. pp 2-12.
559
530
  * Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
560
531
  * Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
data/Rakefile CHANGED
@@ -37,7 +37,7 @@ The main functions of Lingo are:
37
37
 
38
38
  dependencies: {
39
39
  'cyclops' => '~> 0.1',
40
- 'nuggets' => '~> 1.1',
40
+ 'nuggets' => '~> 1.3',
41
41
  'rubyzip' => '~> 1.1',
42
42
  'sinatra-bells' => '~> 0.0',
43
43
  'unicode' => '~> 0.4'
@@ -59,10 +59,9 @@ rescue LoadError => err
59
59
  end
60
60
 
61
61
  CLEAN.include(
62
- 'txt/*.{log,mul,non,seq,ste,syn,ve?}',
62
+ 'txt/*.{als,hal,log,lsi,mul,non,seq,ste,syn,ve?}',
63
63
  'test/{test.*,text.non}',
64
- 'store/*/*.rev',
65
- 'bench/tmp.*'
64
+ 'store/*/*.rev'
66
65
  )
67
66
 
68
67
  CLOBBER.include('store')
@@ -76,19 +75,6 @@ task('test:txt') { test_ref('artikel', 'lingo') }
76
75
  desc 'Test against reference file (LIR)'
77
76
  task('test:lir') { test_ref('lir') }
78
77
 
79
- unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
80
- desc 'Run all benchmarks'
81
- task :bench
82
-
83
- benchmarks.each { |benchmark|
84
- bench = File.basename(benchmark, '_bench.rb')
85
- task bench: benchtask = "bench:#{bench}"
86
-
87
- desc "Run #{bench} benchmark"
88
- task(benchtask) { system(File.ruby, benchmark) }
89
- }
90
- end
91
-
92
78
  def test_ref(name, cfg = name)
93
79
  require 'diff/lcs'
94
80
  require 'diff/lcs/hunk'
@@ -30,9 +30,6 @@ meeting:
30
30
  # Schreibweisen variieren und erneut suchen
31
31
  # - variator: { source: sys-dic }
32
32
 
33
- # Worttrennungen aufheben
34
- # - dehyphenizer: { source: sys-dic }
35
-
36
33
  # Wortstämme für nicht erkannte Wörter einfügen
37
34
  # - stemmer: { }
38
35
 
@@ -46,7 +43,7 @@ meeting:
46
43
  - sequencer: { stopper: 'PUNC,OTHR' }
47
44
 
48
45
  # Relationierungen einfügen
49
- - synonymer: { skip: '?,t', source: sys-syn, out: syn }
46
+ - synonymer: { skip: '?,t', source: sys-syn, out: res }
50
47
 
51
48
 
52
49
  ########################################
@@ -60,45 +57,57 @@ meeting:
60
57
  #
61
58
 
62
59
  # Erstelle Datei mit Endung .log für Datenstrom
63
- - debug_filter: { in: syn, prompt: 'lex:) ' }
60
+ - debug_filter: { in: res, prompt: 'lex:) ' }
64
61
  - text_writer: { ext: log, sep: "\n" }
65
62
 
63
+ # Erstelle Datei mit Endung .als für Datenstrom
64
+ - analysis_filter: { in: res }
65
+ - text_writer: { ext: als, sep: "\n" }
66
+
66
67
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
67
- - noneword_filter: { in: syn }
68
+ - vector_filter: { in: res, lexicals: '\?' }
68
69
  - text_writer: { ext: non, sep: "\n" }
69
70
 
70
71
  # Erstelle Datei mit Endung .ste für Wortstämme
71
- - vector_filter: { in: syn, lexicals: z }
72
+ - vector_filter: { in: res, lexicals: z }
72
73
  - text_writer: { ext: ste, sep: "\n" }
73
74
 
74
75
  # Erstelle Datei mit Endung .vec für erkannte Indexterme
75
- - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
76
+ - vector_filter: { in: res, lexicals: '^[ksavem]$' }
76
77
  - text_writer: { ext: vec, sep: "\n" }
77
78
 
78
79
  # Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
79
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_abs }
80
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
80
81
  - text_writer: { ext: ven, sep: "\n" }
81
82
 
82
83
  # Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
83
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
84
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
84
85
  - text_writer: { ext: ver, sep: "\n" }
85
86
 
86
87
  # Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
87
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
88
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
88
89
  - text_writer: { ext: vef, sep: "\n" }
89
90
 
90
91
  # Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
91
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
92
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
92
93
  - text_writer: { ext: vet, sep: "\n" }
93
94
 
94
95
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
95
- - vector_filter: { in: syn, lexicals: m }
96
+ - vector_filter: { in: res, lexicals: m }
96
97
  - text_writer: { ext: mul, sep: "\n" }
97
98
 
98
99
  # Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
99
- - vector_filter: { in: syn, lexicals: q, sort: term_abs }
100
+ - vector_filter: { in: res, lexicals: q, sort: term_abs }
100
101
  - text_writer: { ext: seq, sep: "\n" }
101
102
 
102
103
  # Erstelle Datei mit Endung .syn für erkannte Synonyme
103
- - vector_filter: { in: syn, lexicals: y, sort: term_abs }
104
+ - vector_filter: { in: res, lexicals: y, sort: term_abs }
104
105
  - text_writer: { ext: syn, sep: "\n" }
106
+
107
+ # Erstelle Datei mit Endung .hal für HAL-Indexterme
108
+ # - hal_filter: { in: res, lexicals: '^[ksavem]$' }
109
+ # - text_writer: { ext: hal, sep: "\n" }
110
+
111
+ # Erstelle Datei mit Endung .lsi für LSI-Indexterme
112
+ # - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
113
+ # - text_writer: { ext: lsi, sep: "\n" }
@@ -35,9 +35,6 @@ meeting:
35
35
  # Schreibweisen variieren und erneut suchen
36
36
  # - variator: { source: sys-dic }
37
37
 
38
- # Worttrennungen aufheben
39
- # - dehyphenizer: { source: sys-dic }
40
-
41
38
  # Wortstämme für nicht erkannte Wörter einfügen
42
39
  # - stemmer: { }
43
40
 
@@ -51,7 +48,7 @@ meeting:
51
48
  - sequencer: { stopper: 'PUNC,OTHR' }
52
49
 
53
50
  # Relationierungen einfügen
54
- - synonymer: { skip: '?,t', source: sys-syn, out: syn }
51
+ - synonymer: { skip: '?,t', source: sys-syn, out: res }
55
52
 
56
53
 
57
54
  ########################################
@@ -65,45 +62,57 @@ meeting:
65
62
  #
66
63
 
67
64
  # Erstelle Datei mit Endung .log für Datenstrom
68
- - debug_filter: { in: syn, prompt: 'lex:) ' }
69
- - text_writer: { ext: log, sep: "\n" }
65
+ - debug_filter: { in: res, prompt: 'lex:) ' }
66
+ - text_writer: { ext: log, sep: "\n", lir-format: ~ }
67
+
68
+ # Erstelle Datei mit Endung .als für Datenstrom
69
+ - analysis_filter: { in: res }
70
+ - text_writer: { ext: als, sep: "\n", lir-format: ~ }
70
71
 
71
72
  # Erstelle Datei mit Endung .non für nicht erkannte Wörter
72
- - noneword_filter: { in: syn }
73
+ - vector_filter: { in: res, lexicals: '\?' }
73
74
  - text_writer: { ext: non, sep: '|' }
74
75
 
75
76
  # Erstelle Datei mit Endung .ste für Wortstämme
76
- - vector_filter: { in: syn, lexicals: z }
77
+ - vector_filter: { in: res, lexicals: z }
77
78
  - text_writer: { ext: ste, sep: '|' }
78
79
 
79
80
  # Erstelle Datei mit Endung .vec für erkannte Indexterme
80
- - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
81
+ - vector_filter: { in: res, lexicals: '^[ksavem]$' }
81
82
  - text_writer: { ext: vec, sep: '|' }
82
83
 
83
84
  # Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
84
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_abs }
85
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
85
86
  - text_writer: { ext: ven, sep: '|' }
86
87
 
87
88
  # Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
88
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
89
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
89
90
  - text_writer: { ext: ver, sep: '|' }
90
91
 
91
92
  # Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
92
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
93
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
93
94
  - text_writer: { ext: vef, sep: '|' }
94
95
 
95
96
  # Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
96
- - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
97
+ - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
97
98
  - text_writer: { ext: vet, sep: '|' }
98
99
 
99
100
  # Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
100
- - vector_filter: { in: syn, lexicals: m }
101
+ - vector_filter: { in: res, lexicals: m }
101
102
  - text_writer: { ext: mul, sep: '|' }
102
103
 
103
104
  # Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
104
- - vector_filter: { in: syn, lexicals: q, sort: term_abs }
105
+ - vector_filter: { in: res, lexicals: q, sort: term_abs }
105
106
  - text_writer: { ext: seq, sep: '|' }
106
107
 
107
108
  # Erstelle Datei mit Endung .syn für erkannte Synonyme
108
- - vector_filter: { in: syn, lexicals: y, sort: term_abs }
109
+ - vector_filter: { in: res, lexicals: y, sort: term_abs }
109
110
  - text_writer: { ext: syn, sep: '|' }
111
+
112
+ # Erstelle Datei mit Endung .hal für HAL-Indexterme
113
+ # - hal_filter: { in: res, lexicals: '^[ksavem]$' }
114
+ # - text_writer: { ext: hal, sep: '|' }
115
+
116
+ # Erstelle Datei mit Endung .lsi für LSI-Indexterme
117
+ # - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
118
+ # - text_writer: { ext: lsi, sep: '|' }