lingo 1.8.6 → 1.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e4cc870c8c1b49c580841a934b5906ed6ddf75e4
-  data.tar.gz: 1ecb26c708daa4bfa09f4aa76f6d7e17f1a72683
+  metadata.gz: 1130ec52467314ba95af17e635888f60046c5b42
+  data.tar.gz: 6a882ea4f88b1fbcf1a66b1d5fafe8fa05458b89
 SHA512:
-  metadata.gz: f2f0abed6198a7fcf0ff4f44aa442266f38c44646c7f4e8ef894886c453ce1654edd217c675f12e6b7d828c43ac461abb64d92aef20015249dbdf6f9efc03a3f
-  data.tar.gz: cb0be6e46a16639a384bab3507dc3b2bd4465736d1d7e0189d3930d1252e247fff4421364d860bd2cdd12f26b4f4445192a87998bea017bb1f285c8e0bda7639
+  metadata.gz: 3e0b384a822c28961c99d411bbdd399d9a49b29fdc40688d3534a9897cef984a7447cdd21b05fda45dda9a0fad25d99f60d524064803edb218ff95bbf9fc4fe6
+  data.tar.gz: 5cfa5c7f235113238d0e8568e9948f27a7ec864af63ac696a5efdfceed6eb678724f3ca3df24d3ac296068aceb1bf20d1369f439edf7f670265550ace1ce1cd0
data/ChangeLog CHANGED
@@ -2,6 +2,43 @@
 
 = Revision history for Lingo
 
+== 1.8.7 [2015-08-07]
+
+* Added Lingo::Attendee::LsiFilter to correlate semantically related terms
+  (LSI[https://en.wikipedia.org/wiki/Latent_semantic_indexing]) over the
+  "corpus" of all files processed during a single program invocation; requires
+  lsi4r[https://blackwinter.github.com/lsi4r], which in turn requires
+  rb-gsl[https://blackwinter.github.com/rb-gsl]. [EXPERIMENTAL: The interface
+  may change or be removed in the next release.]
+* Added Lingo::Attendee::HalFilter to correlate semantically related terms
+  (HAL[https://en.wikipedia.org/wiki/Hyperspace_Analogue_to_Language]) over
+  individual documents; requires hal4r[https://blackwinter.github.com/hal4r],
+  which in turn requires rb-gsl[https://blackwinter.github.com/rb-gsl].
+  [EXPERIMENTAL: The interface may change or be removed in the next release.]
+* Added Lingo::Attendee::AnalysisFilter and associated +lingoctl+ tooling.
+* Multiword dictionaries can now identify hyphenated variants (e.g.
+  <tt>automatic data-processing</tt>); set <tt>hyphenate: true</tt> in the
+  dictionary config.
+* Lingo::Attendee::Tokenizer no longer considers hyphens at word edges part of
+  the word. As a consequence, Lingo::Attendee::Dehyphenizer has been dropped.
+* Dropped Lingo::Attendee::NonewordFilter; use Lingo::Attendee::VectorFilter
+  with option <tt>lexicals: '\?'</tt> instead.
+* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned the
+  +encoding+ option to read/write text that is not UTF-8 encoded;
+  configuration files and dictionaries still need to be UTF-8, though.
+* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned to
+  read/write Gzip-compressed files (file extension +.gz+ or +.gzip+).
+* Lingo::Attendee::Sequencer learned to recognize +0+ in the pattern to match
+  number tokens.
+* Fixed Lingo::Attendee::TextReader to recognize a BOM in input files; does
+  not apply to input read from +STDIN+.
+* Fixed a regression introduced in 1.8.6 where Lingo::Attendee::Debugger would
+  no longer work immediately behind Lingo::Attendee::TextReader.
+* Fixed +lingoctl+ copy commands when overwriting existing files.
+* Refactored Lingo::Database::Crypter into a module.
+* JRuby 9000 compatibility.
+
 == 1.8.6 [2015-02-09]
 
 * Lingo::Attendee::VectorFilter learned +pos+ option to print position and
@@ -17,8 +54,7 @@
 * Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
   is obsolete.
 * Lingo::Attendee::TextReader passes byte offset to the following attendee.
-* Lingo::Attendee::Tokenizer records token's byte offset.
-* Lingo::Attendee::Tokenizer records token's sequence position.
+* Lingo::Attendee::Tokenizer records token's position and byte offset.
 * Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
   specified tags' contents.
 * Lingo::Attendee subclasses warn when invalid or obsolete options or names
@@ -184,9 +220,9 @@
   the word class for multiword entries (defaults to <tt>def-wc</tt>). Use
   <tt>def-mul-wc: 'm'</tt> in your <tt>.lang</tt> to restore the previous
   behaviour.
-* New Lingo::Attendee::Formatter for configurable output formatting as an
+* Added Lingo::Attendee::Formatter for configurable output formatting as an
   alternative to Lingo::Attendee::TextWriter.
-* New basic input filters to enable indexing of HTML/XML (and PDF) files.
+* Added basic input filters to enable indexing of HTML/XML (and PDF) files.
 * Updated the system dictionary.
 * Switched license to Affero GPL.
 
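
For illustration, a minimal pipeline sketch combining the 1.8.7 changes above
(the file name is hypothetical and the +encoding+ value is an assumption; the
option itself, the Gzip handling, and the vector_filter replacement for the
dropped noneword_filter are taken from the ChangeLog entries above):

  meeting:
    attendees:
      # Latin-1 input; a Gzip-compressed 'corpus.txt.gz' would also be read
      - text_reader:   { files: 'corpus.txt', encoding: 'ISO-8859-1' }
      - tokenizer:     { }
      - word_searcher: { source: sys-dic, mode: first }
      # replacement for the dropped noneword_filter
      - vector_filter: { lexicals: '\?' }
      - text_writer:   { ext: non, sep: "\n" }
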
data/README CHANGED
@@ -15,7 +15,6 @@
 * {Example}[rdoc-label:label-EXAMPLE]
 * {Installation and Usage}[rdoc-label:label-INSTALLATION+AND+USAGE]
 * {Dictionary and configuration file lookup}[rdoc-label:label-Dictionary+and+configuration+file+lookup]
-* {Legacy version}[rdoc-label:label-Legacy+version]
 * {File formats}[rdoc-label:label-FILE+FORMATS]
 * {Configuration}[rdoc-label:label-Configuration]
 * {Language definition}[rdoc-label:label-Language+definition]
@@ -35,7 +34,7 @@
 
 == VERSION
 
-This documentation refers to Lingo version 1.8.6
+This documentation refers to Lingo version 1.8.7
 
 
 == DESCRIPTION
@@ -58,7 +57,7 @@ is a minimal configuration example to analyse this README file:
   meeting:
     attendees:
       - text_reader: { files: 'README' }
-      - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
+      - debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: '<debug>: ' }
 
 Lingo is told to invite two attendees and wants them to talk to each other,
 hence the name Lingo (= the technical language).
@@ -131,8 +130,6 @@ information see each attendee's documentation):
   Lingo::Attendee::Debugger)
 +variator+:: Tries to correct spelling errors and the like. (see
   Lingo::Attendee::Variator)
-+dehyphenizer+:: Tries to undo hyphenation. (see
-  Lingo::Attendee::Dehyphenizer)
 +multi_worder+:: Identifies phrases (word sequences) based on a multiword
   dictionary. (see Lingo::Attendee::MultiWorder)
 +sequencer+:: Identifies phrases (word sequences) based on patterns of
@@ -186,14 +183,14 @@ of context to external files.
 
 _Example_:
 
-  # keep line endings
-  - text_reader: { files: $(files) }
+  # read files
+  - text_reader: { files: $(files) }
   # keep whitespace
-  - tokenizer: { space: true }
+  - tokenizer: { space: true }
   # do processing...
-  - word_searcher: { source: sys-dic, mode: first }
-  # insert formatted results (e.g. "[[Name::lingo|Lingo]] got these [[Noun::word|words]].")
-  - formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
+  - word_searcher: { source: sys-dic, mode: first }
+  # insert formatted results (e.g. "[[Name::lingo|Lingo]] finds [[Noun::word|words]].")
+  - formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
 
 === Plugins
@@ -267,7 +264,7 @@ truncated for clarity).
   <Lingo = [(lingo/s), (lingo/e)]>
   <-|?>
   <A|?>
-  <full-featured|KOM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
+  <full-featured|COM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
   <automatic = [(automatic/s), (automatic/a)]>
   <indexing = [(index/v)]>
   <system = [(system/s)]>
@@ -349,9 +346,8 @@ the +lingo+ executable to process your text files. See <tt>lingo --help</tt>
 for available options.
 
 Please note that Lingo requires Ruby version 1.9.3 or higher to run
-(2.1.3[http://ruby-lang.org/en/downloads/] is the currently recommended
-version). If you want to use Lingo on Ruby 1.8, please refer to the
-{legacy version}[rdoc-label:label-Legacy+version].
+(2.2.2[http://ruby-lang.org/en/downloads/] is the currently recommended
+version).
 
 Since Lingo depends on native extensions, you need to make sure that
 development files for your Ruby version are installed. On Debian-based
@@ -359,12 +355,8 @@ Linux platforms they are included in the package <tt>ruby-dev</tt>;
 other distributions may have a similarly named package. On Windows those
 development files are currently not required.
 
-Prior to version 1.8.0, Lingo expected to be run from its installation
-directory. This is no longer necessary. But if you prefer that use case,
-you can either download and extract an
-{archive file}[http://github.com/lex-lingo/lingo/releases] or unpack the
-Gem archive (<tt>gem unpack lingo</tt>); or you can install the legacy
-version of Lingo (see below).
+On JRuby, install gdbm[https://rubygems.org/gems/gdbm] for efficient database
+operations: <tt>gem install gdbm</tt>.
 
 === Dictionary and configuration file lookup
 
@@ -395,29 +387,6 @@ typically organized in the following directory structure:
 
 But for compatibility reasons these naming conventions are not enforced.
 
-=== Legacy version
-
-As Lingo 1.8 introduced some major disruptions and no longer runs on Ruby 1.8,
-there is a maintenance branch for Lingo 1.7.x that will remain compatible with
-both Ruby 1.8 and the previous line of Lingo prior to 1.8. This branch may
-receive occasional bug fixes and minor feature updates. However, the bulk of
-the development efforts will be directed towards Lingo 1.8+.
-
-To install the legacy version, download and extract the
-{ZIP archive}[http://ixtrieve.fh-koeln.de/buch/lingo-1.7.1.zip].
-No additional dependencies are required. This version of Lingo works
-with both Ruby 1.8 (1.8.5 or higher) and 1.9 (1.9.2 or higher).
-
-The executable is named +lingo.rb+. It's located at the root of the installation
-directory and may only be run from there. See <tt>ruby lingo.rb -h</tt> for
-usage instructions.
-
-Configuration and language definition files are also located at the root of the
-installation directory (<tt>*.cfg</tt> and <tt>*.lang</tt>, respectively).
-Dictionary source files are found in language-specific subdirectories (+de/+,
-+en/+, ...) and are named <tt>*.txt</tt>. The compiled dictionaries are found
-beneath these language subdirectories in a directory named <tt>store/</tt>.
-
 
 == FILE FORMATS
 
@@ -443,24 +412,24 @@ _Example_:
 
   # input is taken from the previous attendee,
   # output is sent to the named channel "syn"
-  - synonymer: { skip: '?,t', source: sys-syn, out: syn }
+  - synonymer: { skip: '?,t', source: sys-syn, out: syn }
 
   # input is taken from the named channel "syn",
   # output is sent to the next attendee
-  - vector_filter: { in: syn, lexicals: y, sort: term_abs }
+  - vector_filter: { in: syn, lexicals: y, sort: term_abs }
 
   # input is taken from the previous attendee,
   # output is sent to the next attendee
-  - text_writer: { ext: syn, sep: "\n" }
+  - text_writer: { ext: syn, sep: "\n" }
 
   # input is taken from the named channel "syn"
   # (ignoring the output of the previous attendee),
   # output is sent to the next attendee
-  - vector_filter: { in: syn, lexicals: m }
+  - vector_filter: { in: syn, lexicals: m }
 
   # input is taken from the previous attendee,
   # output is sent to the next attendee
-  - text_writer: { ext: mul, sep: "\n" }
+  - text_writer: { ext: mul, sep: "\n" }
 
 === Language definition
 
@@ -532,8 +501,8 @@ the full test suite.
 == LINKS
 
 Website:: http://lex-lingo.de
-Demo:: http://ixtrieve.fh-koeln.de/lingoweb
-Documentation:: https://lex-lingo.github.com/lingo
+Demo:: http://lex-lingo.de/lingoweb
+Documentation:: http://lex-lingo.de/doc
 Source code:: https://github.com/lex-lingo/lingo
 RubyGem:: https://rubygems.org/gems/lingo
 Bug tracker:: https://github.com/lex-lingo/lingo/issues
@@ -555,6 +524,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
 
 === Research publications
 
+* Siebenkäs, A.; Markscheffel, B.: <em>{Conception of a workflow for the semi-automatic construction of a thesaurus for the German printing industry}[https://zenodo.org/record/17945]</em>. (English) In: Re:inventing Information Science in the Networked Society. Proceedings of the 14th International Symposium on Information Science (ISI 2015), Zadar, Croatia, 19th-21st May 2015. Eds.: F. Pehar, C. Schlögl, C. Wolff. Glückstadt: Verlag Werner Hülsbusch, 2015. pp 217-229.
+* Grün, S.: <em>Bildung von Komposita-Indextermen auf der Basis einer algorithmischen Mehrwortgruppenanalyse mit Lingo</em>. (German) Köln: Fachhochschule Köln, 2015.
 * Bredack, J.; Lepsky, K.: <em>{Automatische Extraktion von Fachterminologie aus Volltexten}[http://dx.doi.org/10.1515/abitech-2014-0002]</em>. (German) In: ABI Technik 34 (1), 2014. pp 2-12.
 * Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
 * Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
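
An aside on the formatter line in the README example above: +format+ is a Ruby
format string with positional references. Reading off the sample output
"[[Name::lingo|Lingo]]", %1$s is the token's surface form, %2$s its lexical
form, and %3$s its word class after applying +map+ (here <tt>e</tt> maps to
Name). A sketch of the substitution for the word "Lingo" (lexical form "lingo",
word class +e+):

  - formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
  # yields: [[Name::lingo|Lingo]]
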
data/Rakefile CHANGED
@@ -37,7 +37,7 @@ The main functions of Lingo are:
   dependencies: {
     'cyclops' => '~> 0.1',
-    'nuggets' => '~> 1.1',
+    'nuggets' => '~> 1.3',
     'rubyzip' => '~> 1.1',
     'sinatra-bells' => '~> 0.0',
     'unicode' => '~> 0.4'
@@ -59,10 +59,9 @@ rescue LoadError => err
 end
 
 CLEAN.include(
-  'txt/*.{log,mul,non,seq,ste,syn,ve?}',
+  'txt/*.{als,hal,log,lsi,mul,non,seq,ste,syn,ve?}',
   'test/{test.*,text.non}',
-  'store/*/*.rev',
-  'bench/tmp.*'
+  'store/*/*.rev'
 )
 
 CLOBBER.include('store')
@@ -76,19 +75,6 @@ task('test:txt') { test_ref('artikel', 'lingo') }
 desc 'Test against reference file (LIR)'
 task('test:lir') { test_ref('lir') }
 
-unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
-  desc 'Run all benchmarks'
-  task :bench
-
-  benchmarks.each { |benchmark|
-    bench = File.basename(benchmark, '_bench.rb')
-    task bench: benchtask = "bench:#{bench}"
-
-    desc "Run #{bench} benchmark"
-    task(benchtask) { system(File.ruby, benchmark) }
-  }
-end
-
 def test_ref(name, cfg = name)
   require 'diff/lcs'
   require 'diff/lcs/hunk'
data/config/lingo.cfg CHANGED
@@ -30,9 +30,6 @@ meeting:
     # Vary spellings and search again
     # - variator: { source: sys-dic }
 
-    # Undo hyphenation
-    # - dehyphenizer: { source: sys-dic }
-
     # Insert word stems for unrecognized words
     # - stemmer: { }
 
@@ -46,7 +43,7 @@ meeting:
     - sequencer: { stopper: 'PUNC,OTHR' }
 
     # Insert relations
-    - synonymer: { skip: '?,t', source: sys-syn, out: syn }
+    - synonymer: { skip: '?,t', source: sys-syn, out: res }
 
 
     ########################################
@@ -60,45 +57,57 @@ meeting:
     #
 
     # Create file with extension .log for the data stream
-    - debug_filter: { in: syn, prompt: 'lex:) ' }
+    - debug_filter: { in: res, prompt: 'lex:) ' }
     - text_writer: { ext: log, sep: "\n" }
 
+    # Create file with extension .als for the data stream
+    - analysis_filter: { in: res }
+    - text_writer: { ext: als, sep: "\n" }
+
     # Create file with extension .non for unrecognized words
-    - noneword_filter: { in: syn }
+    - vector_filter: { in: res, lexicals: '\?' }
     - text_writer: { ext: non, sep: "\n" }
 
     # Create file with extension .ste for word stems
-    - vector_filter: { in: syn, lexicals: z }
+    - vector_filter: { in: res, lexicals: z }
     - text_writer: { ext: ste, sep: "\n" }
 
     # Create file with extension .vec for recognized index terms
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$' }
     - text_writer: { ext: vec, sep: "\n" }
 
     # Create file with extension .ven for recognized index terms with absolute frequency
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_abs }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
     - text_writer: { ext: ven, sep: "\n" }
 
     # Create file with extension .ver for recognized index terms with relative frequency
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
     - text_writer: { ext: ver, sep: "\n" }
 
     # Create file with extension .vef for recognized index terms with TFIDF weighting
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
     - text_writer: { ext: vef, sep: "\n" }
 
     # Create file with extension .vet for recognized index terms with positions
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
     - text_writer: { ext: vet, sep: "\n" }
 
     # Create file with extension .mul for recognized multiword groups
-    - vector_filter: { in: syn, lexicals: m }
+    - vector_filter: { in: res, lexicals: m }
     - text_writer: { ext: mul, sep: "\n" }
 
     # Create file with extension .seq for recognized word sequences
-    - vector_filter: { in: syn, lexicals: q, sort: term_abs }
+    - vector_filter: { in: res, lexicals: q, sort: term_abs }
     - text_writer: { ext: seq, sep: "\n" }
 
     # Create file with extension .syn for recognized synonyms
-    - vector_filter: { in: syn, lexicals: y, sort: term_abs }
+    - vector_filter: { in: res, lexicals: y, sort: term_abs }
     - text_writer: { ext: syn, sep: "\n" }
+
+    # Create file with extension .hal for HAL index terms
+    # - hal_filter: { in: res, lexicals: '^[ksavem]$' }
+    # - text_writer: { ext: hal, sep: "\n" }
+
+    # Create file with extension .lsi for LSI index terms
+    # - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
+    # - text_writer: { ext: lsi, sep: "\n" }
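
The rename from channel "syn" to "res" throughout this configuration follows
the channel mechanics described in the README: +out+ names the channel an
attendee writes to, +in+ the channel an attendee reads from. A reduced sketch
of the resulting wiring (any number of output branches can tap the same
channel independently):

  - synonymer: { skip: '?,t', source: sys-syn, out: res }
  # one branch per output file, each reading the full "res" stream
  - vector_filter: { in: res, lexicals: '^[ksavem]$' }
  - text_writer: { ext: vec, sep: "\n" }
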
data/config/lir.cfg CHANGED
@@ -35,9 +35,6 @@ meeting:
     # Vary spellings and search again
     # - variator: { source: sys-dic }
 
-    # Undo hyphenation
-    # - dehyphenizer: { source: sys-dic }
-
     # Insert word stems for unrecognized words
     # - stemmer: { }
 
@@ -51,7 +48,7 @@ meeting:
     - sequencer: { stopper: 'PUNC,OTHR' }
 
     # Insert relations
-    - synonymer: { skip: '?,t', source: sys-syn, out: syn }
+    - synonymer: { skip: '?,t', source: sys-syn, out: res }
 
 
     ########################################
@@ -65,45 +62,57 @@ meeting:
     #
 
     # Create file with extension .log for the data stream
-    - debug_filter: { in: syn, prompt: 'lex:) ' }
-    - text_writer: { ext: log, sep: "\n" }
+    - debug_filter: { in: res, prompt: 'lex:) ' }
+    - text_writer: { ext: log, sep: "\n", lir-format: ~ }
+
+    # Create file with extension .als for the data stream
+    - analysis_filter: { in: res }
+    - text_writer: { ext: als, sep: "\n", lir-format: ~ }
 
     # Create file with extension .non for unrecognized words
-    - noneword_filter: { in: syn }
+    - vector_filter: { in: res, lexicals: '\?' }
     - text_writer: { ext: non, sep: '|' }
 
     # Create file with extension .ste for word stems
-    - vector_filter: { in: syn, lexicals: z }
+    - vector_filter: { in: res, lexicals: z }
     - text_writer: { ext: ste, sep: '|' }
 
     # Create file with extension .vec for recognized index terms
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$' }
     - text_writer: { ext: vec, sep: '|' }
 
     # Create file with extension .ven for recognized index terms with absolute frequency
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_abs }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
     - text_writer: { ext: ven, sep: '|' }
 
     # Create file with extension .ver for recognized index terms with relative frequency
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
     - text_writer: { ext: ver, sep: '|' }
 
     # Create file with extension .vef for recognized index terms with TFIDF weighting
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
     - text_writer: { ext: vef, sep: '|' }
 
     # Create file with extension .vet for recognized index terms with positions
-    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: false, pos: true }
+    - vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
     - text_writer: { ext: vet, sep: '|' }
 
     # Create file with extension .mul for recognized multiword groups
-    - vector_filter: { in: syn, lexicals: m }
+    - vector_filter: { in: res, lexicals: m }
     - text_writer: { ext: mul, sep: '|' }
 
     # Create file with extension .seq for recognized word sequences
-    - vector_filter: { in: syn, lexicals: q, sort: term_abs }
+    - vector_filter: { in: res, lexicals: q, sort: term_abs }
     - text_writer: { ext: seq, sep: '|' }
 
     # Create file with extension .syn for recognized synonyms
-    - vector_filter: { in: syn, lexicals: y, sort: term_abs }
+    - vector_filter: { in: res, lexicals: y, sort: term_abs }
     - text_writer: { ext: syn, sep: '|' }
+
+    # Create file with extension .hal for HAL index terms
+    # - hal_filter: { in: res, lexicals: '^[ksavem]$' }
+    # - text_writer: { ext: hal, sep: '|' }
+
+    # Create file with extension .lsi for LSI index terms
+    # - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
+    # - text_writer: { ext: lsi, sep: '|' }
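
The hal_filter and lsi_filter branches above ship commented out: per the
ChangeLog, both attendees are experimental and need extra gems (hal4r and
lsi4r, each of which in turn requires rb-gsl). A sketch of enabling the HAL
output, assuming those gems install cleanly on your platform:

  # once, on the command line: gem install rb-gsl hal4r lsi4r
  # then uncomment (or add) in the configuration:
  - hal_filter: { in: res, lexicals: '^[ksavem]$' }
  - text_writer: { ext: hal, sep: "\n" }
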