lingo 1.8.6 → 1.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1130ec52467314ba95af17e635888f60046c5b42
|
4
|
+
data.tar.gz: 6a882ea4f88b1fbcf1a66b1d5fafe8fa05458b89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e0b384a822c28961c99d411bbdd399d9a49b29fdc40688d3534a9897cef984a7447cdd21b05fda45dda9a0fad25d99f60d524064803edb218ff95bbf9fc4fe6
|
7
|
+
data.tar.gz: 5cfa5c7f235113238d0e8568e9948f27a7ec864af63ac696a5efdfceed6eb678724f3ca3df24d3ac296068aceb1bf20d1369f439edf7f670265550ace1ce1cd0
|
data/ChangeLog
CHANGED
@@ -2,6 +2,43 @@
|
|
2
2
|
|
3
3
|
= Revision history for Lingo
|
4
4
|
|
5
|
+
== 1.8.7 [2015-08-07]
|
6
|
+
|
7
|
+
* Added Lingo::Attendee::LsiFilter to correlate semantically related terms
|
8
|
+
(LSI[https://en.wikipedia.org/wiki/Latent_semantic_indexing]) over the
|
9
|
+
"corpus" of all files processed during a single program invocation; requires
|
10
|
+
lsi4r[https://blackwinter.github.com/lsi4r] which in turn requires
|
11
|
+
rb-gsl[https://blackwinter.github.com/rb-gsl]. [EXPERIMENTAL: Interface may
|
12
|
+
be changed or removed in next release.]
|
13
|
+
* Added Lingo::Attendee::HalFilter to correlate semantically related terms
|
14
|
+
(HAL[https://en.wikipedia.org/wiki/Hyperspace_Analogue_to_Language]) over
|
15
|
+
individual documents; requires hal4r[https://blackwinter.github.com/hal4r]
|
16
|
+
which in turn requires rb-gsl[https://blackwinter.github.com/rb-gsl].
|
17
|
+
[EXPERIMENTAL: Interface may be changed or removed in next release.]
|
18
|
+
* Added Lingo::Attendee::AnalysisFilter and associated +lingoctl+ tooling.
|
19
|
+
* Multiword dictionaries can now identify hyphenated variants (e.g.
|
20
|
+
<tt>automatic data-processing</tt>); set <tt>hyphenate: true</tt> in the
|
21
|
+
dictionary config.
|
22
|
+
* Lingo::Attendee::Tokenizer no longer considers hyphens at word edges as part
|
23
|
+
of the word. As a consequence, Lingo::Attendee::Dehyphenizer has been
|
24
|
+
dropped.
|
25
|
+
* Dropped Lingo::Attendee::NonewordFilter; use Lingo::Attendee::VectorFilter
|
26
|
+
with option <tt>lexicals: '\?'</tt> instead.
|
27
|
+
* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned
|
28
|
+
+encoding+ option to read/write text that is not UTF-8 encoded;
|
29
|
+
configuration files and dictionaries still need to be UTF-8, though.
|
30
|
+
* Lingo::Attendee::TextReader and Lingo::Attendee::TextWriter learned to
|
31
|
+
read/write Gzip-compressed files (file extension +.gz+ or +.gzip+).
|
32
|
+
* Lingo::Attendee::Sequencer learned to recognize +0+ in the pattern to match
|
33
|
+
number tokens.
|
34
|
+
* Fixed Lingo::Attendee::TextReader to recognize BOM in input files; does not
|
35
|
+
apply to input read from +STDIN+.
|
36
|
+
* Fixed regression introduced in 1.8.6 where Lingo::Attendee::Debugger would
|
37
|
+
no longer work immediately behind Lingo::Attendee::TextReader.
|
38
|
+
* Fixed +lingoctl+ copy commands when overwriting existing files.
|
39
|
+
* Refactored Lingo::Database::Crypter into a module.
|
40
|
+
* JRuby 9000 compatibility.
|
41
|
+
|
5
42
|
== 1.8.6 [2015-02-09]
|
6
43
|
|
7
44
|
* Lingo::Attendee::VectorFilter learned +pos+ option to print position and
|
@@ -17,8 +54,7 @@
|
|
17
54
|
* Lingo::Attendee::TextReader no longer removes line endings; option +chomp+
|
18
55
|
is obsolete.
|
19
56
|
* Lingo::Attendee::TextReader passes byte offset to the following attendee.
|
20
|
-
* Lingo::Attendee::Tokenizer records token's byte offset.
|
21
|
-
* Lingo::Attendee::Tokenizer records token's sequence position.
|
57
|
+
* Lingo::Attendee::Tokenizer records token's position and byte offset.
|
22
58
|
* Lingo::Attendee::Tokenizer learned <tt>skip-tags</tt> option to skip over
|
23
59
|
specified tags' contents.
|
24
60
|
* Lingo::Attendee subclasses warn when invalid or obsolete options or names
|
@@ -184,9 +220,9 @@
|
|
184
220
|
the word class for multiword entries (defaults to <tt>def-wc</tt>). Use
|
185
221
|
<tt>def-mul-wc: 'm'</tt> in your <tt>.lang</tt> to restore the previous
|
186
222
|
behaviour.
|
187
|
-
*
|
223
|
+
* Added Lingo::Attendee::Formatter for configurable output formatting as an
|
188
224
|
alternative to Lingo::Attendee::TextWriter.
|
189
|
-
*
|
225
|
+
* Added basic input filters to enable indexing of HTML/XML (and PDF) files.
|
190
226
|
* Updated the system dictionary.
|
191
227
|
* Switched license to Affero GPL.
|
192
228
|
|
data/README
CHANGED
@@ -15,7 +15,6 @@
|
|
15
15
|
* {Example}[rdoc-label:label-EXAMPLE]
|
16
16
|
* {Installation and Usage}[rdoc-label:label-INSTALLATION+AND+USAGE]
|
17
17
|
* {Dictionary and configuration file lookup}[rdoc-label:label-Dictionary+and+configuration+file+lookup]
|
18
|
-
* {Legacy version}[rdoc-label:label-Legacy+version]
|
19
18
|
* {File formats}[rdoc-label:label-FILE+FORMATS]
|
20
19
|
* {Configuration}[rdoc-label:label-Configuration]
|
21
20
|
* {Language definition}[rdoc-label:label-Language+definition]
|
@@ -35,7 +34,7 @@
|
|
35
34
|
|
36
35
|
== VERSION
|
37
36
|
|
38
|
-
This documentation refers to Lingo version 1.8.
|
37
|
+
This documentation refers to Lingo version 1.8.7.
|
39
38
|
|
40
39
|
|
41
40
|
== DESCRIPTION
|
@@ -58,7 +57,7 @@ is a minimal configuration example to analyse this README file:
|
|
58
57
|
meeting:
|
59
58
|
attendees:
|
60
59
|
- text_reader: { files: 'README' }
|
61
|
-
- debugger: { eval: 'true', ceval: 'cmd
|
60
|
+
- debugger: { eval: 'true', ceval: 'cmd!=:EOL', prompt: '<debug>: ' }
|
62
61
|
|
63
62
|
Lingo is told to invite two attendees and wants them to talk to each other,
|
64
63
|
hence the name Lingo (= the technical language).
|
@@ -131,8 +130,6 @@ information see each attendee's documentation):
|
|
131
130
|
Lingo::Attendee::Debugger)
|
132
131
|
+variator+:: Tries to correct spelling errors and the like. (see
|
133
132
|
Lingo::Attendee::Variator)
|
134
|
-
+dehyphenizer+:: Tries to undo hyphenation. (see
|
135
|
-
Lingo::Attendee::Dehyphenizer)
|
136
133
|
+multi_worder+:: Identifies phrases (word sequences) based on a multiword
|
137
134
|
dictionary. (see Lingo::Attendee::MultiWorder)
|
138
135
|
+sequencer+:: Identifies phrases (word sequences) based on patterns of
|
@@ -186,14 +183,14 @@ of context to external files.
|
|
186
183
|
|
187
184
|
_Example_:
|
188
185
|
|
189
|
-
#
|
190
|
-
- text_reader:
|
186
|
+
# read files
|
187
|
+
- text_reader: { files: $(files) }
|
191
188
|
# keep whitespace
|
192
|
-
- tokenizer:
|
189
|
+
- tokenizer: { space: true }
|
193
190
|
# do processing...
|
194
|
-
- word_searcher:
|
195
|
-
# insert formatted results (e.g. "[[Name::lingo|Lingo]]
|
196
|
-
- formatter:
|
191
|
+
- word_searcher: { source: sys-dic, mode: first }
|
192
|
+
# insert formatted results (e.g. "[[Name::lingo|Lingo]] finds [[Noun::word|words]].")
|
193
|
+
- formatter: { ext: out, format: '[[%3$s::%2$s|%1$s]]', map: { e: Name, s: Noun } }
|
197
194
|
|
198
195
|
=== Plugins
|
199
196
|
|
@@ -267,7 +264,7 @@ truncated for clarity).
|
|
267
264
|
<Lingo = [(lingo/s), (lingo/e)]>
|
268
265
|
<-|?>
|
269
266
|
<A|?>
|
270
|
-
<full-featured|
|
267
|
+
<full-featured|COM = [(full-featured/k), (full/s+), (full/a+), (full/v+), (featured/a+)]>
|
271
268
|
<automatic = [(automatic/s), (automatic/a)]>
|
272
269
|
<indexing = [(index/v)]>
|
273
270
|
<system = [(system/s)]>
|
@@ -349,9 +346,8 @@ the +lingo+ executable to process your text files. See <tt>lingo --help</tt>
|
|
349
346
|
for available options.
|
350
347
|
|
351
348
|
Please note that Lingo requires Ruby version 1.9.3 or higher to run
|
352
|
-
(2.
|
353
|
-
version).
|
354
|
-
{legacy version}[rdoc-label:label-Legacy+version].
|
349
|
+
(2.2.2[http://ruby-lang.org/en/downloads/] is the currently recommended
|
350
|
+
version).
|
355
351
|
|
356
352
|
Since Lingo depends on native extensions, you need to make sure that
|
357
353
|
development files for your Ruby version are installed. On Debian-based
|
@@ -359,12 +355,8 @@ Linux platforms they are included in the package <tt>ruby-dev</tt>;
|
|
359
355
|
other distributions may have a similarly named package. On Windows those
|
360
356
|
development files are currently not required.
|
361
357
|
|
362
|
-
|
363
|
-
|
364
|
-
you can either download and extract an
|
365
|
-
{archive file}[http://github.com/lex-lingo/lingo/releases] or unpack the
|
366
|
-
Gem archive (<tt>gem unpack lingo</tt>); or you can install the legacy
|
367
|
-
version of Lingo (see below).
|
358
|
+
On JRuby, install gdbm[https://rubygems.org/gems/gdbm] for efficient database
|
359
|
+
operations: <tt>gem install gdbm</tt>.
|
368
360
|
|
369
361
|
=== Dictionary and configuration file lookup
|
370
362
|
|
@@ -395,29 +387,6 @@ typically organized in the following directory structure:
|
|
395
387
|
|
396
388
|
But for compatibility reasons these naming conventions are not enforced.
|
397
389
|
|
398
|
-
=== Legacy version
|
399
|
-
|
400
|
-
As Lingo 1.8 introduced some major disruptions and no longer runs on Ruby 1.8,
|
401
|
-
there is a maintenance branch for Lingo 1.7.x that will remain compatible with
|
402
|
-
both Ruby 1.8 and the previous line of Lingo prior to 1.8. This branch may
|
403
|
-
receive occasional bug fixes and minor feature updates. However, the bulk of
|
404
|
-
the development efforts will be directed towards Lingo 1.8+.
|
405
|
-
|
406
|
-
To install the legacy version, download and extract the
|
407
|
-
{ZIP archive}[http://ixtrieve.fh-koeln.de/buch/lingo-1.7.1.zip].
|
408
|
-
No additional dependencies are required. This version of Lingo works
|
409
|
-
with both Ruby 1.8 (1.8.5 or higher) and 1.9 (1.9.2 or higher).
|
410
|
-
|
411
|
-
The executable is named +lingo.rb+. It's located at the root of the installation
|
412
|
-
directory and may only be run from there. See <tt>ruby lingo.rb -h</tt> for
|
413
|
-
usage instructions.
|
414
|
-
|
415
|
-
Configuration and language definition files are also located at the root of the
|
416
|
-
installation directory (<tt>*.cfg</tt> and <tt>*.lang</tt>, respectively).
|
417
|
-
Dictionary source files are found in language-specific subdirectories (+de/+,
|
418
|
-
+en/+, ...) and are named <tt>*.txt</tt>. The compiled dictionaries are found
|
419
|
-
beneath these language subdirectories in a directory named <tt>store/</tt>.
|
420
|
-
|
421
390
|
|
422
391
|
== FILE FORMATS
|
423
392
|
|
@@ -443,24 +412,24 @@ _Example_:
|
|
443
412
|
|
444
413
|
# input is taken from the previous attendee,
|
445
414
|
# output is sent to the named channel "syn"
|
446
|
-
- synonymer:
|
415
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: syn }
|
447
416
|
|
448
417
|
# input is taken from the named channel "syn",
|
449
418
|
# output is sent to the next attendee
|
450
|
-
- vector_filter:
|
419
|
+
- vector_filter: { in: syn, lexicals: y, sort: term_abs }
|
451
420
|
|
452
421
|
# input is taken from the previous attendee,
|
453
422
|
# output is sent to the next attendee
|
454
|
-
- text_writer:
|
423
|
+
- text_writer: { ext: syn, sep: "\n" }
|
455
424
|
|
456
425
|
# input is taken from the named channel "syn"
|
457
426
|
# (ignoring the output of the previous attendee),
|
458
427
|
# output is sent to the next attendee
|
459
|
-
- vector_filter:
|
428
|
+
- vector_filter: { in: syn, lexicals: m }
|
460
429
|
|
461
430
|
# input is taken from the previous attendee,
|
462
431
|
# output is sent to the next attendee
|
463
|
-
- text_writer:
|
432
|
+
- text_writer: { ext: mul, sep: "\n" }
|
464
433
|
|
465
434
|
=== Language definition
|
466
435
|
|
@@ -532,8 +501,8 @@ the full test suite.
|
|
532
501
|
== LINKS
|
533
502
|
|
534
503
|
Website:: http://lex-lingo.de
|
535
|
-
Demo:: http://
|
536
|
-
Documentation::
|
504
|
+
Demo:: http://lex-lingo.de/lingoweb
|
505
|
+
Documentation:: http://lex-lingo.de/doc
|
537
506
|
Source code:: https://github.com/lex-lingo/lingo
|
538
507
|
RubyGem:: https://rubygems.org/gems/lingo
|
539
508
|
Bug tracker:: https://github.com/lex-lingo/lingo/issues
|
@@ -555,6 +524,8 @@ Travis CI:: https://travis-ci.org/lex-lingo/lingo
|
|
555
524
|
|
556
525
|
=== Research publications
|
557
526
|
|
527
|
+
* Siebenkäs, A.; Markscheffel, B.: <em>{Conception of a workflow for the semi-automatic construction of a thesaurus for the German printing industry}[https://zenodo.org/record/17945]</em>. (English) In: Re:inventing Information Science in the Networked Society. Proceedings of the 14th International Symposium on Information Science (ISI 2015), Zadar, Croatia, 19th-21st May 2015. Eds.: F. Pehar, C. Schlögl, C. Wolff. Glückstadt: Verlag Werner Hülsbusch, 2015. pp 217-229.
|
528
|
+
* Grün, S.: <em>Bildung von Komposita-Indextermen auf der Basis einer algorithmischen Mehrwortgruppenanalyse mit Lingo</em>. (German) Köln: Fachhochschule Köln, 2015.
|
558
529
|
* Bredack, J.; Lepsky, K.: <em>{Automatische Extraktion von Fachterminologie aus Volltexten}[http://dx.doi.org/10.1515/abitech-2014-0002]</em>. (German) In: ABI Technik 34 (1), 2014. pp 2-12.
|
559
530
|
* Bredack, J.: <em>{Terminologieextraktion von Mehrwortgruppen in kunsthistorischen Fachtexten}[http://ixtrieve.fh-koeln.de/lehre/bredack-2013.pdf]</em>. (German) Köln: Fachhochschule Köln, 2013.
|
560
531
|
* Maylein, L.; Langenstein, A.: <em>{Neues vom Relevanz-Ranking im HEIDI-Katalog der Universitätsbibliothek Heidelberg}[http://b-i-t-online.de/heft/2013-03-fachbeitrag-maylein.pdf]</em>. (German) In: b.i.t.online 16 (3), 2013. pp 190-200.
|
data/Rakefile
CHANGED
@@ -37,7 +37,7 @@ The main functions of Lingo are:
|
|
37
37
|
|
38
38
|
dependencies: {
|
39
39
|
'cyclops' => '~> 0.1',
|
40
|
-
'nuggets' => '~> 1.
|
40
|
+
'nuggets' => '~> 1.3',
|
41
41
|
'rubyzip' => '~> 1.1',
|
42
42
|
'sinatra-bells' => '~> 0.0',
|
43
43
|
'unicode' => '~> 0.4'
|
@@ -59,10 +59,9 @@ rescue LoadError => err
|
|
59
59
|
end
|
60
60
|
|
61
61
|
CLEAN.include(
|
62
|
-
'txt/*.{log,mul,non,seq,ste,syn,ve?}',
|
62
|
+
'txt/*.{als,hal,log,lsi,mul,non,seq,ste,syn,ve?}',
|
63
63
|
'test/{test.*,text.non}',
|
64
|
-
'store/*/*.rev'
|
65
|
-
'bench/tmp.*'
|
64
|
+
'store/*/*.rev'
|
66
65
|
)
|
67
66
|
|
68
67
|
CLOBBER.include('store')
|
@@ -76,19 +75,6 @@ task('test:txt') { test_ref('artikel', 'lingo') }
|
|
76
75
|
desc 'Test against reference file (LIR)'
|
77
76
|
task('test:lir') { test_ref('lir') }
|
78
77
|
|
79
|
-
unless (benchmarks = Dir[File.expand_path('../bench/*_bench.rb', __FILE__)]).empty?
|
80
|
-
desc 'Run all benchmarks'
|
81
|
-
task :bench
|
82
|
-
|
83
|
-
benchmarks.each { |benchmark|
|
84
|
-
bench = File.basename(benchmark, '_bench.rb')
|
85
|
-
task bench: benchtask = "bench:#{bench}"
|
86
|
-
|
87
|
-
desc "Run #{bench} benchmark"
|
88
|
-
task(benchtask) { system(File.ruby, benchmark) }
|
89
|
-
}
|
90
|
-
end
|
91
|
-
|
92
78
|
def test_ref(name, cfg = name)
|
93
79
|
require 'diff/lcs'
|
94
80
|
require 'diff/lcs/hunk'
|
data/config/lingo.cfg
CHANGED
@@ -30,9 +30,6 @@ meeting:
|
|
30
30
|
# Schreibweisen variieren und erneut suchen
|
31
31
|
# - variator: { source: sys-dic }
|
32
32
|
|
33
|
-
# Worttrennungen aufheben
|
34
|
-
# - dehyphenizer: { source: sys-dic }
|
35
|
-
|
36
33
|
# Wortstämme für nicht erkannte Wörter einfügen
|
37
34
|
# - stemmer: { }
|
38
35
|
|
@@ -46,7 +43,7 @@ meeting:
|
|
46
43
|
- sequencer: { stopper: 'PUNC,OTHR' }
|
47
44
|
|
48
45
|
# Relationierungen einfügen
|
49
|
-
- synonymer: { skip: '?,t', source: sys-syn, out:
|
46
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: res }
|
50
47
|
|
51
48
|
|
52
49
|
########################################
|
@@ -60,45 +57,57 @@ meeting:
|
|
60
57
|
#
|
61
58
|
|
62
59
|
# Erstelle Datei mit Endung .log für Datenstrom
|
63
|
-
- debug_filter: { in:
|
60
|
+
- debug_filter: { in: res, prompt: 'lex:) ' }
|
64
61
|
- text_writer: { ext: log, sep: "\n" }
|
65
62
|
|
63
|
+
# Erstelle Datei mit Endung .als für Datenstrom
|
64
|
+
- analysis_filter: { in: res }
|
65
|
+
- text_writer: { ext: als, sep: "\n" }
|
66
|
+
|
66
67
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
67
|
-
-
|
68
|
+
- vector_filter: { in: res, lexicals: '\?' }
|
68
69
|
- text_writer: { ext: non, sep: "\n" }
|
69
70
|
|
70
71
|
# Erstelle Datei mit Endung .ste für Wortstämme
|
71
|
-
- vector_filter: { in:
|
72
|
+
- vector_filter: { in: res, lexicals: z }
|
72
73
|
- text_writer: { ext: ste, sep: "\n" }
|
73
74
|
|
74
75
|
# Erstelle Datei mit Endung .vec für erkannte Indexterme
|
75
|
-
- vector_filter: { in:
|
76
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$' }
|
76
77
|
- text_writer: { ext: vec, sep: "\n" }
|
77
78
|
|
78
79
|
# Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
|
79
|
-
- vector_filter: { in:
|
80
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
|
80
81
|
- text_writer: { ext: ven, sep: "\n" }
|
81
82
|
|
82
83
|
# Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
|
83
|
-
- vector_filter: { in:
|
84
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
|
84
85
|
- text_writer: { ext: ver, sep: "\n" }
|
85
86
|
|
86
87
|
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
87
|
-
- vector_filter: { in:
|
88
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
88
89
|
- text_writer: { ext: vef, sep: "\n" }
|
89
90
|
|
90
91
|
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
91
|
-
- vector_filter: { in:
|
92
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
|
92
93
|
- text_writer: { ext: vet, sep: "\n" }
|
93
94
|
|
94
95
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
95
|
-
- vector_filter: { in:
|
96
|
+
- vector_filter: { in: res, lexicals: m }
|
96
97
|
- text_writer: { ext: mul, sep: "\n" }
|
97
98
|
|
98
99
|
# Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
|
99
|
-
- vector_filter: { in:
|
100
|
+
- vector_filter: { in: res, lexicals: q, sort: term_abs }
|
100
101
|
- text_writer: { ext: seq, sep: "\n" }
|
101
102
|
|
102
103
|
# Erstelle Datei mit Endung .syn für erkannte Synonyme
|
103
|
-
- vector_filter: { in:
|
104
|
+
- vector_filter: { in: res, lexicals: y, sort: term_abs }
|
104
105
|
- text_writer: { ext: syn, sep: "\n" }
|
106
|
+
|
107
|
+
# Erstelle Datei mit Endung .hal für HAL-Indexterme
|
108
|
+
# - hal_filter: { in: res, lexicals: '^[ksavem]$' }
|
109
|
+
# - text_writer: { ext: hal, sep: "\n" }
|
110
|
+
|
111
|
+
# Erstelle Datei mit Endung .lsi für LSI-Indexterme
|
112
|
+
# - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
|
113
|
+
# - text_writer: { ext: lsi, sep: "\n" }
|
data/config/lir.cfg
CHANGED
@@ -35,9 +35,6 @@ meeting:
|
|
35
35
|
# Schreibweisen variieren und erneut suchen
|
36
36
|
# - variator: { source: sys-dic }
|
37
37
|
|
38
|
-
# Worttrennungen aufheben
|
39
|
-
# - dehyphenizer: { source: sys-dic }
|
40
|
-
|
41
38
|
# Wortstämme für nicht erkannte Wörter einfügen
|
42
39
|
# - stemmer: { }
|
43
40
|
|
@@ -51,7 +48,7 @@ meeting:
|
|
51
48
|
- sequencer: { stopper: 'PUNC,OTHR' }
|
52
49
|
|
53
50
|
# Relationierungen einfügen
|
54
|
-
- synonymer: { skip: '?,t', source: sys-syn, out:
|
51
|
+
- synonymer: { skip: '?,t', source: sys-syn, out: res }
|
55
52
|
|
56
53
|
|
57
54
|
########################################
|
@@ -65,45 +62,57 @@ meeting:
|
|
65
62
|
#
|
66
63
|
|
67
64
|
# Erstelle Datei mit Endung .log für Datenstrom
|
68
|
-
- debug_filter: { in:
|
69
|
-
- text_writer: { ext: log, sep: "\n" }
|
65
|
+
- debug_filter: { in: res, prompt: 'lex:) ' }
|
66
|
+
- text_writer: { ext: log, sep: "\n", lir-format: ~ }
|
67
|
+
|
68
|
+
# Erstelle Datei mit Endung .als für Datenstrom
|
69
|
+
- analysis_filter: { in: res }
|
70
|
+
- text_writer: { ext: als, sep: "\n", lir-format: ~ }
|
70
71
|
|
71
72
|
# Erstelle Datei mit Endung .non für nicht erkannte Wörter
|
72
|
-
-
|
73
|
+
- vector_filter: { in: res, lexicals: '\?' }
|
73
74
|
- text_writer: { ext: non, sep: '|' }
|
74
75
|
|
75
76
|
# Erstelle Datei mit Endung .ste für Wortstämme
|
76
|
-
- vector_filter: { in:
|
77
|
+
- vector_filter: { in: res, lexicals: z }
|
77
78
|
- text_writer: { ext: ste, sep: '|' }
|
78
79
|
|
79
80
|
# Erstelle Datei mit Endung .vec für erkannte Indexterme
|
80
|
-
- vector_filter: { in:
|
81
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$' }
|
81
82
|
- text_writer: { ext: vec, sep: '|' }
|
82
83
|
|
83
84
|
# Erstelle Datei mit Endung .ven für erkannte Indexterme mit absoluter Häufigkeit
|
84
|
-
- vector_filter: { in:
|
85
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_abs }
|
85
86
|
- text_writer: { ext: ven, sep: '|' }
|
86
87
|
|
87
88
|
# Erstelle Datei mit Endung .ver für erkannte Indexterme mit relativer Häufigkeit
|
88
|
-
- vector_filter: { in:
|
89
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel }
|
89
90
|
- text_writer: { ext: ver, sep: '|' }
|
90
91
|
|
91
92
|
# Erstelle Datei mit Endung .vef für erkannte Indexterme mit TFIDF-Gewichtung
|
92
|
-
- vector_filter: { in:
|
93
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: term_rel, tfidf: true }
|
93
94
|
- text_writer: { ext: vef, sep: '|' }
|
94
95
|
|
95
96
|
# Erstelle Datei mit Endung .vet für erkannte Indexterme mit Positionen
|
96
|
-
- vector_filter: { in:
|
97
|
+
- vector_filter: { in: res, lexicals: '^[ksavem]$', sort: false, pos: true }
|
97
98
|
- text_writer: { ext: vet, sep: '|' }
|
98
99
|
|
99
100
|
# Erstelle Datei mit Endung .mul für erkannte Mehrwortgruppen
|
100
|
-
- vector_filter: { in:
|
101
|
+
- vector_filter: { in: res, lexicals: m }
|
101
102
|
- text_writer: { ext: mul, sep: '|' }
|
102
103
|
|
103
104
|
# Erstelle Datei mit Endung .seq für erkannte Wortsequenzen
|
104
|
-
- vector_filter: { in:
|
105
|
+
- vector_filter: { in: res, lexicals: q, sort: term_abs }
|
105
106
|
- text_writer: { ext: seq, sep: '|' }
|
106
107
|
|
107
108
|
# Erstelle Datei mit Endung .syn für erkannte Synonyme
|
108
|
-
- vector_filter: { in:
|
109
|
+
- vector_filter: { in: res, lexicals: y, sort: term_abs }
|
109
110
|
- text_writer: { ext: syn, sep: '|' }
|
111
|
+
|
112
|
+
# Erstelle Datei mit Endung .hal für HAL-Indexterme
|
113
|
+
# - hal_filter: { in: res, lexicals: '^[ksavem]$' }
|
114
|
+
# - text_writer: { ext: hal, sep: '|' }
|
115
|
+
|
116
|
+
# Erstelle Datei mit Endung .lsi für LSI-Indexterme
|
117
|
+
# - lsi_filter: { in: res, lexicals: '^[ksavem]$' }
|
118
|
+
# - text_writer: { ext: lsi, sep: '|' }
|