lingo 1.8.0
- data/.rspec +1 -0
- data/COPYING +663 -0
- data/ChangeLog +754 -0
- data/README +322 -0
- data/Rakefile +100 -0
- data/TODO +28 -0
- data/bin/lingo +5 -0
- data/bin/lingoctl +6 -0
- data/de.lang +121 -0
- data/de/lingo-abk.txt +74 -0
- data/de/lingo-dic.txt +56822 -0
- data/de/lingo-mul.txt +3209 -0
- data/de/lingo-syn.txt +14841 -0
- data/de/test_dic.txt +24 -0
- data/de/test_mul.txt +17 -0
- data/de/test_mul2.txt +2 -0
- data/de/test_singleword.txt +2 -0
- data/de/test_syn.txt +4 -0
- data/de/test_syn2.txt +1 -0
- data/de/user-dic.txt +10 -0
- data/en.lang +113 -0
- data/en/lingo-dic.txt +55434 -0
- data/en/lingo-mul.txt +456 -0
- data/en/user-dic.txt +5 -0
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/gpl-hdr.txt +27 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lib/lingo.rb +321 -0
- data/lib/lingo/attendee/abbreviator.rb +119 -0
- data/lib/lingo/attendee/debugger.rb +111 -0
- data/lib/lingo/attendee/decomposer.rb +101 -0
- data/lib/lingo/attendee/dehyphenizer.rb +167 -0
- data/lib/lingo/attendee/multiworder.rb +301 -0
- data/lib/lingo/attendee/noneword_filter.rb +103 -0
- data/lib/lingo/attendee/objectfilter.rb +86 -0
- data/lib/lingo/attendee/sequencer.rb +190 -0
- data/lib/lingo/attendee/synonymer.rb +105 -0
- data/lib/lingo/attendee/textreader.rb +237 -0
- data/lib/lingo/attendee/textwriter.rb +196 -0
- data/lib/lingo/attendee/tokenizer.rb +218 -0
- data/lib/lingo/attendee/variator.rb +185 -0
- data/lib/lingo/attendee/vector_filter.rb +158 -0
- data/lib/lingo/attendee/wordsearcher.rb +96 -0
- data/lib/lingo/attendees.rb +289 -0
- data/lib/lingo/cli.rb +62 -0
- data/lib/lingo/config.rb +104 -0
- data/lib/lingo/const.rb +131 -0
- data/lib/lingo/ctl.rb +173 -0
- data/lib/lingo/database.rb +587 -0
- data/lib/lingo/language.rb +530 -0
- data/lib/lingo/modules.rb +98 -0
- data/lib/lingo/types.rb +285 -0
- data/lib/lingo/utilities.rb +40 -0
- data/lib/lingo/version.rb +27 -0
- data/lingo-all.cfg +85 -0
- data/lingo-call.cfg +15 -0
- data/lingo.cfg +78 -0
- data/lingo.rb +3 -0
- data/lir.cfg +72 -0
- data/porter/stem.cfg +311 -0
- data/porter/stem.rb +150 -0
- data/spec/spec_helper.rb +0 -0
- data/test.cfg +79 -0
- data/test/attendee/ts_abbreviator.rb +35 -0
- data/test/attendee/ts_decomposer.rb +31 -0
- data/test/attendee/ts_multiworder.rb +390 -0
- data/test/attendee/ts_noneword_filter.rb +19 -0
- data/test/attendee/ts_objectfilter.rb +19 -0
- data/test/attendee/ts_sequencer.rb +43 -0
- data/test/attendee/ts_synonymer.rb +33 -0
- data/test/attendee/ts_textreader.rb +58 -0
- data/test/attendee/ts_textwriter.rb +98 -0
- data/test/attendee/ts_tokenizer.rb +32 -0
- data/test/attendee/ts_variator.rb +24 -0
- data/test/attendee/ts_vector_filter.rb +62 -0
- data/test/attendee/ts_wordsearcher.rb +119 -0
- data/test/lir.csv +3 -0
- data/test/lir.txt +12 -0
- data/test/lir2.txt +12 -0
- data/test/mul.txt +1 -0
- data/test/ref/artikel.mul +1 -0
- data/test/ref/artikel.non +159 -0
- data/test/ref/artikel.seq +270 -0
- data/test/ref/artikel.syn +16 -0
- data/test/ref/artikel.vec +928 -0
- data/test/ref/artikel.ven +928 -0
- data/test/ref/artikel.ver +928 -0
- data/test/ref/lir.csv +328 -0
- data/test/ref/lir.mul +1 -0
- data/test/ref/lir.non +274 -0
- data/test/ref/lir.seq +249 -0
- data/test/ref/lir.syn +94 -0
- data/test/test_helper.rb +113 -0
- data/test/ts_database.rb +269 -0
- data/test/ts_language.rb +396 -0
- data/txt/artikel-en.txt +157 -0
- data/txt/artikel.txt +170 -0
- data/txt/lir.txt +1317 -0
- metadata +211 -0
data/lingo-all.cfg
ADDED
@@ -0,0 +1,85 @@
+#
+# Lingo configuration
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)' }
+
+
+    ########################################
+    # Process content
+    #
+
+    # Split each line into individual meaningful units (tokens)
+    - tokenizer: { }
+
+    # Recognize and resolve abbreviations
+    # - abbreviator: { source: 'sys-abk' }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Vary spellings and search again
+    # - variator: { source: 'sys-dic' }
+
+    # Reconstruct hyphenated completions
+    # - dehyphenizer: { source: 'sys-dic' }
+
+    # Test unrecognized words for compounds
+    # - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+    # - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+    # - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert relations
+    # - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output results
+    #
+
+    # Create a file with extension .non for unrecognized words
+    # - noneword_filter: { in: syn }
+    # - textwriter: { ext: non, sep: "\n" }
+
+    # Create a file with extension .vec for recognized index terms
+    # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    # - textwriter: { ext: vec, sep: "\n" }
+
+    # Create a file with extension .ven for recognized index terms with absolute frequency
+    # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+    # - textwriter: { ext: ven, sep: "\n" }
+
+    # Create a file with extension .ver for recognized index terms with relative frequency
+    # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+    # - textwriter: { ext: ver, sep: "\n" }
+
+    # Create a file with extension .mul for recognized multi-word groups
+    # - vector_filter: { in: syn, lexicals: m }
+    # - textwriter: { ext: mul, sep: "\n" }
+
+    # Create a file with extension .seq for recognized word sequences
+    # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+    # - textwriter: { ext: seq, sep: "\n" }
+
+    # Create a file with extension .syn for recognized synonyms
+    # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+    # - textwriter: { ext: syn, sep: "\n" }
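
The attendee list above is ordinary YAML: `meeting` holds an `attendees` sequence whose entries are single-key hashes mapping an attendee name to its options. A minimal Ruby sketch of inspecting that structure with the standard YAML library (illustrative only, not part of Lingo; it assumes the file sits in the current directory):

    require 'yaml'

    # List the pipeline stages of a Lingo configuration in order.
    config = YAML.load_file('lingo-all.cfg')

    config['meeting']['attendees'].each do |attendee|
      name, options = attendee.first
      puts format('%-15s %s', name, options.inspect)
    end

Commented-out entries (e.g. the abbreviator above) are invisible to the parser, so enabling a stage is just a matter of removing the leading `#`.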
data/lingo-call.cfg
ADDED
@@ -0,0 +1,15 @@
+---
+meeting:
+  attendees:
+    - textreader: { }
+
+    - tokenizer: { }
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+    - decomposer: { source: 'sys-dic' }
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+    - sequencer: { stopper: 'PUNC,OTHR' }
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: 'syn' }
+    - debugger: { eval: 'true', ceval: 'false', prompt: '' }
+
+    #- vector_filter: { in: 'syn', lexicals: 'y', sort: 'term_abs' }
+    #- textwriter: { ext: 'STDOUT', sep: "\n" }
data/lingo.cfg
ADDED
@@ -0,0 +1,78 @@
+#
+# Lingo configuration
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)' }
+
+
+    ########################################
+    # Process content
+    #
+
+    # Split each line into individual meaningful units (tokens)
+    - tokenizer: { }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Test unrecognized words for compounds
+    - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+    - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert relations
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    #- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output results
+    #
+    - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
+    - textwriter: { ext: log, sep: "\n" }
+
+    # Create a file with extension .non for unrecognized words
+    - noneword_filter: { in: syn }
+    - textwriter: { ext: non, sep: "\n" }
+
+    # Create a file with extension .vec for recognized index terms
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - textwriter: { ext: vec, sep: "\n" }
+
+    # Create a file with extension .ven for recognized index terms with absolute frequency
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+    - textwriter: { ext: ven, sep: "\n" }
+
+    # Create a file with extension .ver for recognized index terms with relative frequency
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+    - textwriter: { ext: ver, sep: "\n" }
+
+    # Create a file with extension .mul for recognized multi-word groups
+    - vector_filter: { in: syn, lexicals: m }
+    - textwriter: { ext: mul, sep: "\n" }
+
+    # Create a file with extension .seq for recognized word sequences
+    - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+    - textwriter: { ext: seq, sep: "\n" }
+
+    # Create a file with extension .syn for recognized synonyms
+    - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+    - textwriter: { ext: syn, sep: "\n" }
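
The `lexicals` option of the vector_filter entries is a regular expression tested against one-letter lexical attribute codes, so `'^[ksavem]$'` admits exactly one of k, s, a, v, e or m; the multi-word (m), sequence (q) and synonym (y) outputs are then split off with their own filters. A hedged Ruby illustration of that selection (the sample code list is invented for the example):

    # Invented sample of one-letter attribute codes.
    codes = %w[s v m q y]

    puts codes.grep(/^[ksavem]$/).inspect  # => ["s", "v", "m"]
    puts codes.grep(/^q$/).inspect         # => ["q"]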
data/lingo.rb
ADDED
data/lir.cfg
ADDED
@@ -0,0 +1,72 @@
+#
+# Lingo configuration for testing with a LIR file
+#
+# Common patterns are
+#   "^\021(\d+\-\d+)\022"
+#   "^\[(\d+)\.\]"
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)', lir-record-pattern: '^\[(\d+)\.\]' }
+
+
+    ########################################
+    # Process content
+    #
+
+    # Split each line into individual meaningful units (tokens)
+    - tokenizer: { }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Test unrecognized words for compounds
+    - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+    - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert relations
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output results
+    #
+
+    # Create a file with extension .non for unrecognized words
+    - noneword_filter: { in: syn }
+    - textwriter: { ext: non }
+
+    # Create a file with extension .csv for recognized index terms
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - textwriter: { ext: csv }
+
+    # Create a file with extension .mul for recognized multi-word groups
+    - vector_filter: { in: syn, lexicals: m }
+    - textwriter: { ext: mul }
+
+    # Create a file with extension .seq for recognized word sequences
+    - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+    - textwriter: { ext: seq }
+
+    # Create a file with extension .syn for recognized synonyms
+    - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+    - textwriter: { ext: syn }
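
The `lir-record-pattern` passed to the textreader is a plain regular expression whose first capture group yields the record identifier, as the header comment's two example patterns suggest. A quick Ruby check of the pattern used above (the sample line is invented; real records live in data/test/lir.txt):

    pattern = /^\[(\d+)\.\]/

    line = '[00123.] An invented record heading'
    puts Regexp.last_match(1) if line =~ pattern  # => "00123"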
data/porter/stem.cfg
ADDED
@@ -0,0 +1,311 @@
+# Stem.cfg
+#
+# Rules for Porter-Stemmer
+#
+#
+# based on:
+# An algorithm for suffix stripping
+#
+# M.F.Porter
+# 1980
+#
+# Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
+# few typos have been corrected.)
+#
+# http://tartarus.org/~martin/PorterStemmer/def.txt
+#
+# --------------------------------------------------
+#
+#
+#
+#
+# 2. THE ALGORITHM
+#
+# To present the suffix stripping algorithm in its entirety we will need a few
+# definitions.
+#
+# A \consonant\ in a word is a letter other than A, E, I, O or U, and other
+# than Y preceded by a consonant. (The fact that the term `consonant' is
+# defined to some extent in terms of itself does not make it ambiguous.) So in
+# TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
+# letter is not a consonant it is a \vowel\.
+#
+# A consonant will be denoted by c, a vowel by v. A list ccc... of length
+# greater than 0 will be denoted by C, and a list vvv... of length greater
+# than 0 will be denoted by V. Any word, or part of a word, therefore has one
+# of the four forms:
+#
+#     CVCV ... C
+#     CVCV ... V
+#     VCVC ... C
+#     VCVC ... V
+#
+# These may all be represented by the single form
+#
+#     [C]VCVC ... [V]
+#
+# where the square brackets denote arbitrary presence of their contents.
+# Using (VC){m} to denote VC repeated m times, this may again be written as
+#
+#     [C](VC){m}[V].
+#
+# m will be called the \measure\ of any word or word part when represented in
+# this form. The case m = 0 covers the null word. Here are some examples:
+#
+#     m=0    TR, EE, TREE, Y, BY.
+#     m=1    TROUBLE, OATS, TREES, IVY.
+#     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+#
+# The \rules\ for removing a suffix will be given in the form
+#
+#     (condition) S1 -> S2
+#
+# This means that if a word ends with the suffix S1, and the stem before S1
+# satisfies the given condition, S1 is replaced by S2. The condition is
+# usually given in terms of m, e.g.
+#
+#     (m > 1) EMENT ->
+#
+# Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
+# since REPLAC is a word part for which m = 2.
+#
+# The `condition' part may also contain the following:
+#
+# *S  - the stem ends with S (and similarly for the other letters).
+#
+# *v* - the stem contains a vowel.
+#
+# *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+#
+# *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+#       -WIL, -HOP).
+#
+# And the condition part may also contain expressions with \and\, \or\ and
+# \not\, so that
+#
+#     (m>1 and (*S or *T))
+#
+# tests for a stem with m>1 ending in S or T, while
+#
+#     (*d and not (*L or *S or *Z))
+#
+# tests for a stem ending with a double consonant other than L, S or Z.
+# Elaborate conditions like this are required only rarely.
+#
+# In a set of rules written beneath each other, only one is obeyed, and this
+# will be the one with the longest matching S1 for the given word. For
+# example, with
+#
+#     SSES -> SS
+#     IES  -> I
+#     SS   -> SS
+#     S    ->
+#
+# (here the conditions are all null) CARESSES maps to CARESS since SSES is
+# the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
+# to CARE (S1=`S').
+#
+#
+---
+stemmer:
+  # In the rules below, examples of their application, successful or otherwise,
+  # are given on the right in lower case. The algorithm now follows:
+  #
+  # Step 1a
+  #     SSES -> SS                         caresses  ->  caress
+  #     IES  -> I                          ponies    ->  poni
+  #                                        ties      ->  ti
+  #     SS   -> SS                         caress    ->  caress
+  #     S    ->                            cats      ->  cat
+  S100:
+    - SSES -> SS
+    - IES -> I
+    - SS -> SS
+    - S ->
+  #
+  # Step 1b
+  #
+  #     (m>0) EED -> EE                    feed      ->  feed
+  #                                        agreed    ->  agree
+  #     (*v*) ED  ->                       plastered ->  plaster
+  #                                        bled      ->  bled
+  #     (*v*) ING ->                       motoring  ->  motor
+  #                                        sing      ->  sing
+  S110:
+    - (m>0) EED -> EE goto(S120)
+    - (*v*) ED -> goto(S111)
+    - (*v*) ING -> goto(S111)
+    - goto(S120)
+  #
+  # If the second or third of the rules in Step 1b is successful, the following
+  # is done:
+  #
+  #     AT -> ATE                          conflat(ed) ->  conflate
+  #     BL -> BLE                          troubl(ed)  ->  trouble
+  #     IZ -> IZE                          siz(ed)     ->  size
+  #     (*d and not (*L or *S or *Z))
+  #        -> single letter
+  #                                        hopp(ing)   ->  hop
+  #                                        tann(ed)    ->  tan
+  #                                        fall(ing)   ->  fall
+  #                                        hiss(ing)   ->  hiss
+  #                                        fizz(ed)    ->  fizz
+  #     (m=1 and *o) -> E                  fail(ing)   ->  fail
+  #                                        fil(ing)    ->  file
+  S111:
+    - AT -> ATE
+    - BL -> BLE
+    - IZ -> IZE
+    - (*d and not (*L or *S or *Z)) -> -1
+    - (m=1 and *o) -> E
+  #
+  # The rule to map to a single letter causes the removal of one of the double
+  # letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
+  # -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
+  # 4.
+  #
+  # Step 1c
+  #
+  #     (*v*) Y -> I                       happy ->  happi
+  #                                        sky   ->  sky
+  S120:
+    - (*v*) Y -> I
+  #
+  # Step 1 deals with plurals and past participles. The subsequent steps are
+  # much more straightforward.
+  #
+  # Step 2
+  #
+  #     (m>0) ATIONAL -> ATE               relational     ->  relate
+  #     (m>0) TIONAL  -> TION              conditional    ->  condition
+  #                                        rational       ->  rational
+  #     (m>0) ENCI    -> ENCE              valenci        ->  valence
+  #     (m>0) ANCI    -> ANCE              hesitanci      ->  hesitance
+  #     (m>0) IZER    -> IZE               digitizer      ->  digitize
+  #     (m>0) ABLI    -> ABLE              conformabli    ->  conformable
+  #     (m>0) ALLI    -> AL                radicalli      ->  radical
+  #     (m>0) ENTLI   -> ENT               differentli    ->  different
+  #     (m>0) ELI     -> E                 vileli         ->  vile
+  #     (m>0) OUSLI   -> OUS               analogousli    ->  analogous
+  #     (m>0) IZATION -> IZE               vietnamization ->  vietnamize
+  #     (m>0) ATION   -> ATE               predication    ->  predicate
+  #     (m>0) ATOR    -> ATE               operator       ->  operate
+  #     (m>0) ALISM   -> AL                feudalism      ->  feudal
+  #     (m>0) IVENESS -> IVE               decisiveness   ->  decisive
+  #     (m>0) FULNESS -> FUL               hopefulness    ->  hopeful
+  #     (m>0) OUSNESS -> OUS               callousness    ->  callous
+  #     (m>0) ALITI   -> AL                formaliti      ->  formal
+  #     (m>0) IVITI   -> IVE               sensitiviti    ->  sensitive
+  #     (m>0) BILITI  -> BLE               sensibiliti    ->  sensible
+  S200:
+    - (m>0) ATIONAL -> ATE
+    - (m>0) TIONAL -> TION
+    - (m>0) ENCI -> ENCE
+    - (m>0) ANCI -> ANCE
+    - (m>0) IZER -> IZE
+    - (m>0) ABLI -> ABLE
+    - (m>0) ALLI -> AL
+    - (m>0) ENTLI -> ENT
+    - (m>0) ELI -> E
+    - (m>0) OUSLI -> OUS
+    - (m>0) IZATION -> IZE
+    - (m>0) ATION -> ATE
+    - (m>0) ATOR -> ATE
+    - (m>0) ALISM -> AL
+    - (m>0) IVENESS -> IVE
+    - (m>0) FULNESS -> FUL
+    - (m>0) OUSNESS -> OUS
+    - (m>0) ALITI -> AL
+    - (m>0) IVITI -> IVE
+    - (m>0) BILITI -> BLE
+  #
+  # The test for the string S1 can be made fast by doing a program switch on
+  # the penultimate letter of the word being tested. This gives a fairly even
+  # breakdown of the possible values of the string S1. It will be seen in fact
+  # that the S1-strings in step 2 are presented here in the alphabetical order
+  # of their penultimate letter. Similar techniques may be applied in the other
+  # steps.
+  #
+  # Step 3
+  #
+  #     (m>0) ICATE -> IC                  triplicate  ->  triplic
+  #     (m>0) ATIVE ->                     formative   ->  form
+  #     (m>0) ALIZE -> AL                  formalize   ->  formal
+  #     (m>0) ICITI -> IC                  electriciti ->  electric
+  #     (m>0) ICAL  -> IC                  electrical  ->  electric
+  #     (m>0) FUL   ->                     hopeful     ->  hope
+  #     (m>0) NESS  ->                     goodness    ->  good
+  S300:
+    - (m>0) ICATE -> IC
+    - (m>0) ATIVE ->
+    - (m>0) ALIZE -> AL
+    - (m>0) ICITI -> IC
+    - (m>0) ICAL -> IC
+    - (m>0) FUL ->
+    - (m>0) NESS ->
+  #
+  # Step 4
+  #
+  #     (m>1) AL    ->                     revival     ->  reviv
+  #     (m>1) ANCE  ->                     allowance   ->  allow
+  #     (m>1) ENCE  ->                     inference   ->  infer
+  #     (m>1) ER    ->                     airliner    ->  airlin
+  #     (m>1) IC    ->                     gyroscopic  ->  gyroscop
+  #     (m>1) ABLE  ->                     adjustable  ->  adjust
+  #     (m>1) IBLE  ->                     defensible  ->  defens
+  #     (m>1) ANT   ->                     irritant    ->  irrit
+  #     (m>1) EMENT ->                     replacement ->  replac
+  #     (m>1) MENT  ->                     adjustment  ->  adjust
+  #     (m>1) ENT   ->                     dependent   ->  depend
+  #     (m>1 and (*S or *T)) ION ->        adoption    ->  adopt
+  #     (m>1) OU    ->                     homologou   ->  homolog
+  #     (m>1) ISM   ->                     communism   ->  commun
+  #     (m>1) ATE   ->                     activate    ->  activ
+  #     (m>1) ITI   ->                     angulariti  ->  angular
+  #     (m>1) OUS   ->                     homologous  ->  homolog
+  #     (m>1) IVE   ->                     effective   ->  effect
+  #     (m>1) IZE   ->                     bowdlerize  ->  bowdler
+  S400:
+    - (m>1) AL ->
+    - (m>1) ANCE ->
+    - (m>1) ENCE ->
+    - (m>1) ER ->
+    - (m>1) IC ->
+    - (m>1) ABLE ->
+    - (m>1) IBLE ->
+    - (m>1) ANT ->
+    - (m>1) EMENT ->
+    - (m>1) MENT ->
+    - (m>1) ENT ->
+    - (m>1 and (*S or *T)) ION ->
+    - (m>1) OU ->
+    - (m>1) ISM ->
+    - (m>1) ATE ->
+    - (m>1) ITI ->
+    - (m>1) OUS ->
+    - (m>1) IVE ->
+    - (m>1) IZE ->
+  #
+  # The suffixes are now removed. All that remains is a little tidying up.
+  #
+  # Step 5a
+  #
+  #     (m>1) E ->                         probate ->  probat
+  #                                        rate    ->  rate
+  #     (m=1 and not *o) E ->              cease   ->  ceas
+  S500:
+    - (m>1) E ->
+    - (m=1 and not *o) E ->
+  #
+  # Step 5b
+  #
+  #     (m > 1 and *d and *L) -> single letter
+  #                                        controll ->  control
+  #                                        roll     ->  roll
+  S510:
+    - (m > 1 and *d and *L) -> -1
+  #
+  # The algorithm is careful not to remove a suffix when the stem is too short,
+  # the length of the stem being given by its measure, m. There is no linguistic
+  # basis for this approach. It was merely observed that m could be used quite
+  # effectively to help decide whether or not it was wise to take off a suffix.
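
The header comments above fully define the measure m and the longest-match rule selection, so both are easy to try out directly. The following Ruby sketch is illustrative only (it is not the gem's rule interpreter in data/porter/stem.rb): it computes m from the [C](VC){m}[V] form and applies the Step 1a rules by longest matching suffix:

    # Porter's measure m: the number of VC sequences in the form [C](VC){m}[V].
    def measure(word)
      form = word.downcase.each_char.inject('') do |acc, ch|
        # A vowel is a, e, i, o, u, or y preceded by a consonant.
        vowel = 'aeiou'.include?(ch) || (ch == 'y' && acc.end_with?('c'))
        acc + (vowel ? 'v' : 'c')
      end
      form.squeeze.scan('vc').size
    end

    # Step 1a: all conditions are null, so only longest-match selection applies.
    STEP_1A = { 'sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '' }

    def step_1a(word)
      s1 = STEP_1A.keys.select { |s| word.end_with?(s) }.max_by(&:length)
      s1 ? word[0...-s1.length] + STEP_1A[s1] : word
    end

    %w[tree trouble troubles].each { |w| puts "#{w}: m=#{measure(w)}" }
    %w[caresses ponies caress cats].each { |w| puts "#{w} -> #{step_1a(w)}" }

Running it reproduces the examples quoted above: tree has m=0, trouble m=1, troubles m=2, and caresses stems to caress because SSES is the longest matching S1.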