lingo 1.8.6 → 1.8.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1130ec52467314ba95af17e635888f60046c5b42
|
4
|
+
data.tar.gz: 6a882ea4f88b1fbcf1a66b1d5fafe8fa05458b89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e0b384a822c28961c99d411bbdd399d9a49b29fdc40688d3534a9897cef984a7447cdd21b05fda45dda9a0fad25d99f60d524064803edb218ff95bbf9fc4fe6
|
7
|
+
data.tar.gz: 5cfa5c7f235113238d0e8568e9948f27a7ec864af63ac696a5efdfceed6eb678724f3ca3df24d3ac296068aceb1bf20d1369f439edf7f670265550ace1ce1cd0
|
data/ChangeLog
CHANGED
@@ -2,6 +2,43 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.8.7 [2015-08-07]
|
6
|
+
|
7
|
+
* Added Lingo::Attendee::LsiFilter to correlate semantically related terms
|
8
|
+
(LSI[https://en.wikipedia.org/wiki/Latent_semantic_indexing]) over the
|
9
|
+
"corpus" of all files processed during a single program invocation; requires
|
10
|
+
lsi4r[https://blackwinter.github.com/lsi4r] which in turn requires
|
11
|
+
rb-gsl[https://blackwinter.github.com/rb-gsl]. [EXPERIMENTAL: Interface may
|
12
|
+
be changed or removed in next release.]
|
13
|
+
* Added Lingo::Attendee::HalFilter to correlate semantically related terms
|
14
|
+
(HAL[https://en.wikipedia.org/wiki/Hyperspace_Analogue_to_Language]) over
|
15
|
+
individual documents; requires hal4r[https://blackwinter.github.com/hal4r]
|
16
|
+
which in turn requires rb-gsl[https://blackwinter.github.com/rb-gsl].
|
17
|
+
[EXPERIMENTAL: Interface may be changed or removed in next release.]
|
18
|
+
* Added Lingo::Attendee::AnalysisFilter and associated +lingoctl+ tooling.
|
19
|
+
* Multiword dictionaries can now identify hyphenated variants (e.g.
|
20
|
+
<tt>automatic data-processing</tt>); set <tt>hyphenate: true</tt> in the
|
21
|
+
dictionary config.
|
22
|
+
* Lingo::Attendee::Tokenizer no longer considers hyphens at word edges as part
|
23
|
+
of the word. As a consequence, Lingo::Attendee::Dehyphenizer has been
|
24
|
+
dropped.
|
25
|
+
* Dropped Lingo::Attendee::NonewordFilter; use Lingo::Attendee::VectorFilter
|
26
|
+
with option <tt>lexicals: '\?'</tt> instead.
|
27
|
+
* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned
|
28
|
+
+encoding+ option to read/write text that is not UTF-8 encoded;
|
29
|
+
configuration files and dictionaries still need to be UTF-8, though.
|
30
|
+
* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned to
|
31
|
+
read/write Gzip-compressed files (file extension +.gz+ or +.gzip+).
|
32
|
+
* Lingo::Attendee::Sequencer learned to recognize +0+ in the pattern to match
|
33
|
+
number tokens.
|
34
|
+
* Fixed Lingo::Attendee::TextReader to recognize BOM in input files; does not
|
35
|
+
apply to input read from +STDIN+.
|
36
|
+
* Fixed regression introduced in 1.8.6 where Lingo::Attendee::Debugger would
|
37
|
+
no longer work immediately behind Lingo::Attendee::TextReader.
|
38
|
+
* Fixed +lingoctl+ copy commands when overwriting existing files.
|
39
|
+
* Refactored Lingo::Database::Crypter into a module.
|
40
|
+
* JRuby 9000 compatibility.
|
41
|
+
|
5
42
|
== 1.8.6 [2015-02-09]
|
6
43
|
|
7
44
|
* Lingo::Attendee::VectorFilter learned +pos+ option to print position and
|
@@ -17,8 +54,7 @@
|
|
17
54
|
* Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
|
18
55
|
is obsolete.
|
19
56
|
* Lingo::Attendee::TextReader passes byte offset to the following attendee.
|
20
|
-
* Lingo::Attendee::Tokenizer records token's byte offset.
|
21
|
-
* Lingo::Attendee::Tokenizer records token's sequence position.
|
57
|
+
* Lingo::Attendee::Tokenizer records token's position and byte offset.
|
22
58
|
* Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
|
23
59
|
specified tags' contents.
|
24
60
|
* Lingo::Attendee subclasses warn when invalid or obsolete options or names
|
@@ -184,9 +220,9 @@
|
|
184
220
|
the word class for multiword entries (defaults to <tt>def-wc</tt>). Use
|
185
221
|
<tt>def-mul-wc: 'm'</tt> in your <tt>.lang</tt> to restore the previous
|
186
222
|
behaviour.
|
187
|
-
*
|
223
|
+
* Added Lingo::Attendee::Formatter for configurable output formatting as an
|
188
224
|
alternative to Lingo::Attendee::TextWriter.
|
189
|
-
*
|
225
|
+
* Added basic input filters to enable indexing of HTML/XML (and PDF) files.
|
190
226
|
* Updated the system dictionary.
|
191
227
|
* Switched license to Affero GPL.
|
192
228
|
|
data/README
CHANGED
@@ -15,7 +15,6 @@
|
|
15
15
|
* {Example}[rdoc-label:label-EXAMPLE]
|
16
16
|
* {Installation and Usage}[rdoc-label:label-INSTALLATION+AND+USAGE]
|
17
17
|
* {Dictionary and configuration file lookup}[rdoc-label:label-Dictionary+and+configuration+file+lookup]
|
18
|
-
* {Legacy version}[rdoc-label:label-Legacy+version]
|
19
18
|
* {File formats}[rdoc-label:label-FILE+FORMATS]
|
20
19
|
* {Configuration}[rdoc-label:label-Configuration]
|
21
20
|
* {Language definition}[rdoc-label:label-Language+definition]
|
@@ -35,7 +34,7 @@
|
|
35
34
|
|
36
35
|
== VERSION
|
37
36
|
|
38
|
-
This documentation refers to Lingo version 1.8.
|
37
|
+
This documentation refers to Lingo version 1.8.7.
|
39
38
|
|
40
39
|
|
41
40
|
== DESCRIPTION
|
@@ -58,7 +57,7 @@ is a minimal configuration example to analyse this README file:
|
|
58
57
|
meeting:
|
59
58
|
attendees:
|
60
59
|
- text_reader: { files: 'README' }
|
61
|
-
- debugger: { eval: 'true', ceval: 'cmd
|
60
|
+
- debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: '<debug>: ' }
|
62
61
|
|
63
62
|
Lingo is told to invite two attendees and wants them to talk to each other,
|
64
63
|
hence the name Lingo (= the technical language).
|
@@ -131,8 +130,6 @@ information see each attendee's documentation):
|
|
131
130
|
Lingo::Attendee::Debugger)
|
132
131
|
+variator+:: Tries to correct spelling errors and the like. (see
|
133
132
|
Lingo::Attendee::Variator)
|
134
|
-
+dehyphenizer+:: Tries to undo hyphenation. (see
|
135
|
-
Lingo::Attendee::Dehyphenizer)
|
136
133
|
+multi_worder+:: Identifies phrases (word sequences) based on a multiword
|
137
134
|
dictionary. (see Lingo::Attendee::MultiWorder)
|
138
135
|
+sequencer+:: Identifies phrases (word sequences) based on patterns of
|
@@ -186,14 +183,14 @@ of context to external files.
|
|
186
183
|
|
187
184
|
_Example_:
|
188
185
|
|
189
|
-
#
|
190
|
-
- text_reader:
|
186
|
+
# read files
|
187
|
+
- text_reader: { files: $(files) }
|
191
188
|
# keep whitespace
|
192
|
-
- tokenizer:
|
189
|
+
- tokenizer: { space: true }
|
193
190
|
# do processing...
|
194
|
-
- word_searcher:
|
195
|
-
# insert formatted results (e.g. "[[Name::lingo|Lingo]]
|
196
|
-
- formatter:
|
191
|
+
- word_searcher: { source: sys-dic, mode: first }
|
192
|
+
# insert formatted results (e.g. "[[Name::lingo|Lingo]] finds [[Noun::word|words]].")
|
193
|
+
- formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
|
197
194
|
|
198
195
|
=== Plugins
|
199
196
|
|
@@ -267,7 +264,7 @@ truncated for clarity).
|
|
267
264
|
<Lingo = [(lingo/s), (lingo/e)]>
|
268
265
|
<-|?>
|
269
266
|
<A|?>
|
270
|
-
<full-featured|
|
267
|
+
<full-featured|COM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
|
271
268
|
<automatic = [(automatic/s), (automatic/a)]>
|
272
269
|
<indexing = [(index/v)]>
|
273
270
|
<system = [(system/s)]>
|
@@ -349,9 +346,8 @@ the +lingo+ executable to process your text files. See <tt>lingo --help</tt>
|
|
349
346
|
for available options.
|
350
347
|
|
351
348
|
Please note that Lingo requires Ruby version 1.9.3 or higher to run
|
352
|
-
(2.
|
353
|
-
version).
|
354
|
-
{legacy version}[rdoc-label:label-Legacy+version].
|
349
|
+
(2.2.2[http://ruby-lang.org/en/downloads/] is the currently recommended
|
350
|
+
version).
|
355
351
|
|
356
352
|
Since Lingo depends on native extensions, you need to make sure that
|
357
353
|
development files for your Ruby version are installed. On Debian-based
|
@@ -359,12 +355,8 @@ Linux platforms they are included in the package <tt>ruby-dev</tt>;
|
|
359
355
|
other distributions may have a similarly named package. On Windows those
|
360
356
|
development files are currently not required.
|
361
357
|
|
362
|
-
|
363
|
-
|
364
|
-
you can either download and extract an
|
365
|
-
{archive file}[http://github.com/lex-lingo/lingo/releases] or unpack the
|
366
|
-
Gem archive (<tt>gem unpack lingo</tt>); or you can install the legacy
|
367
|
-
version of Lingo (see below).
|
358
|
+
On JRuby, install gdbm[https://rubygems.org/gems/gdbm] for efficient database
|
359
|
+
operations: <tt>gem install gdbm</tt>.
|
368
360
|
|
369
361
|
=== Dictionary and configuration file lookup
|
370
362
|
|
@@ -395,29 +387,6 @@ typically organized in the following directory structure:
|
|
395
387
|
|
396
388
|
But for compatibility reasons these naming conventions are not enforced.
|
397
389
|
|
398
|
-
=== Legacy version
|
399
|
-
|
400
|
-
As Lingo 1.8 introduced some major disruptions and no longer runs on Ruby 1.8,
|
401
|
-
there is a maintenance branch for Lingo 1.7.x that will remain compatible with
|
402
|
-
both Ruby 1.8 and the previous line of Lingo prior to 1.8. This branch may
|
403
|
-
receive occasional bug fixes and minor feature updates. However, the bulk of
|
404
|
-
the development efforts will be directed towards Lingo 1.8+.
|
405
|
-
|
406
|
-
To install the legacy version, download and extract the
|
407
|
-
{ZIP archive}[http://ixtrieve.fh-koeln.de/buch/lingo-1.7.1.zip].
|
408
|
-
No additional dependencies are required. This version of Lingo works
|
409
|
-
with both Ruby 1.8 (1.8.5 or higher) and 1.9 (1.9.2 or higher).
|
410
|
-
|
411
|
-
The executable is named +lingo.rb+. It's located at the root of the installation
|
412
|
-
directory and may only be run from there. See <tt>ruby lingo.rb -h</tt> for
|
413
|
-
usage instructions.
|
414
|
-
|
415
|
-
Configuration and language definition files are also located at the root of the
|
416
|
-
installation directory (<tt>*.cfg</tt> and <tt>*.lang</tt>, respectively).
|
417
|
-
Dictionary source files are found in language-specific subdirectories (+de/+,
|
418
|
-
+en/+, ...) and are named <tt>*.txt</tt>. The compiled dictionaries are found
|
419
|
-
beneath these language subdirectories in a directory named <tt>store/</tt>.
|
420
|
-
|
421
390
|
|
422
391
|
== FILE FORMATS
|
423
392
|
|
@@ -443,24 +412,24 @@ _Example_:
|
|
443
412
|
|
444
413
|
# input is taken from the previous attendee,
|
445
414
|
# output is sent to the named channel "syn"
|
446
|
-
- synonymer:
|
415
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: syn }
|
447
416
|
|
448
417
|
# input is taken from the named channel "syn",
|
449
418
|
# output is sent to the next attendee
|
450
|
-
- vector_filter:
|
419
|
+
- vector_filter: { in: syn, lexicals: y, sort: term_abs }
|
451
420
|
|
452
421
|
# input is taken from the previous attendee,
|
453
422
|
# output is sent to the next attendee
|
454
|
-
- text_writer:
|
423
|
+
- text_writer: { ext: syn, sep: "\n" }
|
455
424
|
|
456
425
|
# input is taken from the named channel "syn"
|
457
426
|
# (ignoring the output of the previous attendee),
|
458
427
|
# output is sent to the next attendee
|
459
|
-
- vector_filter:
|
428
|
+
- vector_filter: { in: syn, lexicals: m }
|
460
429
|
|
461
430
|
# input is taken from the previous attendee,
|
462
431
|
# output is sent to the next attendee
|
463
|
-
- text_writer:
|
432
|
+
- text_writer: { ext: mul, sep: "\n" }
|
464
433
|
|
465
434
|
=== Language definition
|
466
435
|
|
@@ -532,8 +501,8 @@ the full test suite.
|
|
532
501
|
== LINKS
|
533
502
|
|
534
503
|
Website:: http://lex-lingo.de
|
535
|
-
Demo:: http://
|
536
|
-
Documentation::
|
504
|
+
Demo:: http://lex-lingo.de/lingoweb
|
505
|
+
Documentation:: http://lex-lingo.de/doc
|
537
506
|
Source code:: https://github.com/lex-lingo/lingo
|
538
507
|
RubyGem:: https://rubygems.org/gems/lingo
|
539
508
|
Bug tracker:: https://github.com/lex-lingo/lingo/issues
|
@@ -555,6 +524,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
555
524
|
|
556
525
|
=== Research publications
|
557
526
|
|
527
|
+
* Siebenkäs, A.; Markscheffel, B.: <em>{Conception of a workflow for the semi-automatic construction of a thesaurus for the German printing industry}[https://zenodo.org/record/17945]</em>. (English) In: Re:inventing Information Science in the Networked Society. Proceedings of the 14th International Symposium on Information Science (ISI 2015), Zadar, Croatia, 19th-21st May 2015. Eds.: F. Pehar, C. Schlögl, C. Wolff. Glückstadt: Verlag Werner Hülsbusch, 2015. pp 217-229.
|
528
|
+
* Grün, S.: <em>Bildung von Komposita-Indextermen auf der Basis einer algorithmischen Mehrwortgruppenanalyse mit Lingo</em>. (German) Köln: Fachhochschule Köln, 2015.
|
558
529
|
* Bredack, J.; Lepsky, K.: <em>{Automatische Extraktion von Fachterminologie aus Volltexten}[http://dx.doi.org/10.1515/abitech-2014-0002]</em>. (German) In: ABI Technik 34 (1), 2014. pp 2-12.
|
559
530
|
* Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
|
560
531
|
* Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
|
data/Rakefile
CHANGED
@@ -37,7 +37,7 @@ The main functions of Lingo are:
|
|
37
37
|
|
38
38
|
dependencies: {
|
39
39
|
'cyclops' => '~> 0.1',
|
40
|
-
'nuggets' => '~> 1.
|
40
|
+
'nuggets' => '~> 1.3',
|
41
41
|
'rubyzip' => '~> 1.1',
|
42
42
|
'sinatra-bells' => '~> 0.0',
|
43
43
|
'unicode' => '~> 0.4'
|
@@ -59,10 +59,9 @@ rescue LoadError => err
|
|
59
59
|
end
|
60
60
|
|
61
61
|
CLEAN.include(
|
62
|
-
'txt/*.{log,mul,non,seq,ste,syn,ve?}',
|
62
|
+
'txt/*.{als,hal,log,lsi,mul,non,seq,ste,syn,ve?}',
|
63
63
|
'test/{test.*,text.non}',
|
64
|
-
'store/*/*.rev',
|
65
|
-
'bench/tmp.*'
|
64
|
+
'store/*/*.rev'
|
66
65
|
)
|
67
66
|
|
68
67
|
CLOBBER.include('store')
|
@@ -76,19 +75,6 @@ task('test:txt') { test_ref('artikel', 'lingo') }
|
|
76
75
|
desc 'Test against reference file (LIR)'
|
77
76
|
task('test:lir') { test_ref('lir') }
|
78
77
|
|
79
|
-
unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
|
80
|
-
desc 'Run all benchmarks'
|
81
|
-
task :bench
|
82
|
-
|
83
|
-
benchmarks.each { |benchmark|
|
84
|
-
bench = File.basename(benchmark, '_bench.rb')
|
85
|
-
task bench: benchtask = "bench:#{bench}"
|
86
|
-
|
87
|
-
desc "Run #{bench} benchmark"
|
88
|
-
task(benchtask) { system(File.ruby, benchmark) }
|
89
|
-
}
|
90
|
-
end
|
91
|
-
|
92
78
|
def test_ref(name, cfg = name)
|
93
79
|
require 'diff/lcs'
|
94
80
|
require 'diff/lcs/hunk'
|
data/config/lingo.cfg
CHANGED
@@ -30,9 +30,6 @@ meeting:
|
|
30
30
|
# Schreibweisen variieren und erneut suchen
|
31
31
|
# - variator: { source: sys-dic }
|
32
32
|
|
33
|
-
# Worttrennungen aufheben
|
34
|
-
# - dehyphenizer: { source: sys-dic }
|
35
|
-
|
36
33
|
# Wortstämme für nicht erkannte Wörter einfügen
|
37
34
|
# - stemmer: { }
|
38
35
|
|
@@ -46,7 +43,7 @@ meeting:
|
|
46
43
|
- sequencer: { stopper: 'PUNC,OTHR' }
|
47
44
|
|
48
45
|
# Relationierungen einfügen
|
49
|
-
- synonymer: { skip: '?,t', source: sys-syn, out:
|
46
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: res }
|
50
47
|
|
51
48
|
|
52
49
|
########################################
|
@@ -60,45 +57,57 @@ meeting:
|
|
60
57
|
#
|
61
58
|
|
62
59
|
# Erstelle Datei mit Endung .log für Datenstrom
|
63
|
-
- debug_filter: { in:
|
60
|
+
- debug_filter: { in: res, prompt: 'lex:) ' }
|
64
61
|
- text_writer: { ext: log, sep: "\n" }
|
65
62
|
|
63
|
+
# Erstelle Datei mit Endung .als für Datenstrom
|
64
|
+
- analysis_filter: { in: res }
|
65
|
+
- text_writer: { ext: als, sep: "\n" }
|
66
|
+
|
66
67
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
67
|
-
-
|
68
|
+
- vector_filter: { in: res, lexicals: '\?' }
|
68
69
|
- text_writer: { ext: non, sep: "\n" }
|
69
70
|
|
70
71
|
# Erstelle Datei mit Endung .ste für Wortstämme
|
71
|
-
- vector_filter: { in:
|
72
|
+
- vector_filter: { in: res, lexicals: z }
|
72
73
|
- text_writer: { ext: ste, sep: "\n" }
|
73
74
|
|
74
75
|
# Erstelle Datei mit Endung .vec für erkannte Indexterme
|
75
|
-
- vector_filter: { in:
|
76
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$' }
|
76
77
|
- text_writer: { ext: vec, sep: "\n" }
|
77
78
|
|
78
79
|
# Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
|
79
|
-
- vector_filter: { in:
|
80
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
|
80
81
|
- text_writer: { ext: ven, sep: "\n" }
|
81
82
|
|
82
83
|
# Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
|
83
|
-
- vector_filter: { in:
|
84
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
|
84
85
|
- text_writer: { ext: ver, sep: "\n" }
|
85
86
|
|
86
87
|
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
87
|
-
- vector_filter: { in:
|
88
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
88
89
|
- text_writer: { ext: vef, sep: "\n" }
|
89
90
|
|
90
91
|
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
91
|
-
- vector_filter: { in:
|
92
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
|
92
93
|
- text_writer: { ext: vet, sep: "\n" }
|
93
94
|
|
94
95
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
95
|
-
- vector_filter: { in:
|
96
|
+
- vector_filter: { in: res, lexicals: m }
|
96
97
|
- text_writer: { ext: mul, sep: "\n" }
|
97
98
|
|
98
99
|
# Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
|
99
|
-
- vector_filter: { in:
|
100
|
+
- vector_filter: { in: res, lexicals: q, sort: term_abs }
|
100
101
|
- text_writer: { ext: seq, sep: "\n" }
|
101
102
|
|
102
103
|
# Erstelle Datei mit Endung .syn für erkannte Synonyme
|
103
|
-
- vector_filter: { in:
|
104
|
+
- vector_filter: { in: res, lexicals: y, sort: term_abs }
|
104
105
|
- text_writer: { ext: syn, sep: "\n" }
|
106
|
+
|
107
|
+
# Erstelle Datei mit Endung .hal für HAL-Indexterme
|
108
|
+
# - hal_filter: { in: res, lexicals: '^[ksavem]$' }
|
109
|
+
# - text_writer: { ext: hal, sep: "\n" }
|
110
|
+
|
111
|
+
# Erstelle Datei mit Endung .lsi für LSI-Indexterme
|
112
|
+
# - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
|
113
|
+
# - text_writer: { ext: lsi, sep: "\n" }
|
data/config/lir.cfg
CHANGED
@@ -35,9 +35,6 @@ meeting:
|
|
35
35
|
# Schreibweisen variieren und erneut suchen
|
36
36
|
# - variator: { source: sys-dic }
|
37
37
|
|
38
|
-
# Worttrennungen aufheben
|
39
|
-
# - dehyphenizer: { source: sys-dic }
|
40
|
-
|
41
38
|
# Wortstämme für nicht erkannte Wörter einfügen
|
42
39
|
# - stemmer: { }
|
43
40
|
|
@@ -51,7 +48,7 @@ meeting:
|
|
51
48
|
- sequencer: { stopper: 'PUNC,OTHR' }
|
52
49
|
|
53
50
|
# Relationierungen einfügen
|
54
|
-
- synonymer: { skip: '?,t', source: sys-syn, out:
|
51
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: res }
|
55
52
|
|
56
53
|
|
57
54
|
########################################
|
@@ -65,45 +62,57 @@ meeting:
|
|
65
62
|
#
|
66
63
|
|
67
64
|
# Erstelle Datei mit Endung .log für Datenstrom
|
68
|
-
- debug_filter: { in:
|
69
|
-
- text_writer: { ext: log, sep: "\n" }
|
65
|
+
- debug_filter: { in: res, prompt: 'lex:) ' }
|
66
|
+
- text_writer: { ext: log, sep: "\n", lir-format: ~ }
|
67
|
+
|
68
|
+
# Erstelle Datei mit Endung .als für Datenstrom
|
69
|
+
- analysis_filter: { in: res }
|
70
|
+
- text_writer: { ext: als, sep: "\n", lir-format: ~ }
|
70
71
|
|
71
72
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
72
|
-
-
|
73
|
+
- vector_filter: { in: res, lexicals: '\?' }
|
73
74
|
- text_writer: { ext: non, sep: '|' }
|
74
75
|
|
75
76
|
# Erstelle Datei mit Endung .ste für Wortstämme
|
76
|
-
- vector_filter: { in:
|
77
|
+
- vector_filter: { in: res, lexicals: z }
|
77
78
|
- text_writer: { ext: ste, sep: '|' }
|
78
79
|
|
79
80
|
# Erstelle Datei mit Endung .vec für erkannte Indexterme
|
80
|
-
- vector_filter: { in:
|
81
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$' }
|
81
82
|
- text_writer: { ext: vec, sep: '|' }
|
82
83
|
|
83
84
|
# Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
|
84
|
-
- vector_filter: { in:
|
85
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
|
85
86
|
- text_writer: { ext: ven, sep: '|' }
|
86
87
|
|
87
88
|
# Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
|
88
|
-
- vector_filter: { in:
|
89
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
|
89
90
|
- text_writer: { ext: ver, sep: '|' }
|
90
91
|
|
91
92
|
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
92
|
-
- vector_filter: { in:
|
93
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
93
94
|
- text_writer: { ext: vef, sep: '|' }
|
94
95
|
|
95
96
|
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
96
|
-
- vector_filter: { in:
|
97
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
|
97
98
|
- text_writer: { ext: vet, sep: '|' }
|
98
99
|
|
99
100
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
100
|
-
- vector_filter: { in:
|
101
|
+
- vector_filter: { in: res, lexicals: m }
|
101
102
|
- text_writer: { ext: mul, sep: '|' }
|
102
103
|
|
103
104
|
# Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
|
104
|
-
- vector_filter: { in:
|
105
|
+
- vector_filter: { in: res, lexicals: q, sort: term_abs }
|
105
106
|
- text_writer: { ext: seq, sep: '|' }
|
106
107
|
|
107
108
|
# Erstelle Datei mit Endung .syn für erkannte Synonyme
|
108
|
-
- vector_filter: { in:
|
109
|
+
- vector_filter: { in: res, lexicals: y, sort: term_abs }
|
109
110
|
- text_writer: { ext: syn, sep: '|' }
|
111
|
+
|
112
|
+
# Erstelle Datei mit Endung .hal für HAL-Indexterme
|
113
|
+
# - hal_filter: { in: res, lexicals: '^[ksavem]$' }
|
114
|
+
# - text_writer: { ext: hal, sep: '|' }
|
115
|
+
|
116
|
+
# Erstelle Datei mit Endung .lsi für LSI-Indexterme
|
117
|
+
# - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
|
118
|
+
# - text_writer: { ext: lsi, sep: '|' }
|