lingo 1.8.0
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- data/.rspec +1 -0
- data/COPYING +663 -0
- data/ChangeLog +754 -0
- data/README +322 -0
- data/Rakefile +100 -0
- data/TODO +28 -0
- data/bin/lingo +5 -0
- data/bin/lingoctl +6 -0
- data/de.lang +121 -0
- data/de/lingo-abk.txt +74 -0
- data/de/lingo-dic.txt +56822 -0
- data/de/lingo-mul.txt +3209 -0
- data/de/lingo-syn.txt +14841 -0
- data/de/test_dic.txt +24 -0
- data/de/test_mul.txt +17 -0
- data/de/test_mul2.txt +2 -0
- data/de/test_singleword.txt +2 -0
- data/de/test_syn.txt +4 -0
- data/de/test_syn2.txt +1 -0
- data/de/user-dic.txt +10 -0
- data/en.lang +113 -0
- data/en/lingo-dic.txt +55434 -0
- data/en/lingo-mul.txt +456 -0
- data/en/user-dic.txt +5 -0
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/gpl-hdr.txt +27 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lib/lingo.rb +321 -0
- data/lib/lingo/attendee/abbreviator.rb +119 -0
- data/lib/lingo/attendee/debugger.rb +111 -0
- data/lib/lingo/attendee/decomposer.rb +101 -0
- data/lib/lingo/attendee/dehyphenizer.rb +167 -0
- data/lib/lingo/attendee/multiworder.rb +301 -0
- data/lib/lingo/attendee/noneword_filter.rb +103 -0
- data/lib/lingo/attendee/objectfilter.rb +86 -0
- data/lib/lingo/attendee/sequencer.rb +190 -0
- data/lib/lingo/attendee/synonymer.rb +105 -0
- data/lib/lingo/attendee/textreader.rb +237 -0
- data/lib/lingo/attendee/textwriter.rb +196 -0
- data/lib/lingo/attendee/tokenizer.rb +218 -0
- data/lib/lingo/attendee/variator.rb +185 -0
- data/lib/lingo/attendee/vector_filter.rb +158 -0
- data/lib/lingo/attendee/wordsearcher.rb +96 -0
- data/lib/lingo/attendees.rb +289 -0
- data/lib/lingo/cli.rb +62 -0
- data/lib/lingo/config.rb +104 -0
- data/lib/lingo/const.rb +131 -0
- data/lib/lingo/ctl.rb +173 -0
- data/lib/lingo/database.rb +587 -0
- data/lib/lingo/language.rb +530 -0
- data/lib/lingo/modules.rb +98 -0
- data/lib/lingo/types.rb +285 -0
- data/lib/lingo/utilities.rb +40 -0
- data/lib/lingo/version.rb +27 -0
- data/lingo-all.cfg +85 -0
- data/lingo-call.cfg +15 -0
- data/lingo.cfg +78 -0
- data/lingo.rb +3 -0
- data/lir.cfg +72 -0
- data/porter/stem.cfg +311 -0
- data/porter/stem.rb +150 -0
- data/spec/spec_helper.rb +0 -0
- data/test.cfg +79 -0
- data/test/attendee/ts_abbreviator.rb +35 -0
- data/test/attendee/ts_decomposer.rb +31 -0
- data/test/attendee/ts_multiworder.rb +390 -0
- data/test/attendee/ts_noneword_filter.rb +19 -0
- data/test/attendee/ts_objectfilter.rb +19 -0
- data/test/attendee/ts_sequencer.rb +43 -0
- data/test/attendee/ts_synonymer.rb +33 -0
- data/test/attendee/ts_textreader.rb +58 -0
- data/test/attendee/ts_textwriter.rb +98 -0
- data/test/attendee/ts_tokenizer.rb +32 -0
- data/test/attendee/ts_variator.rb +24 -0
- data/test/attendee/ts_vector_filter.rb +62 -0
- data/test/attendee/ts_wordsearcher.rb +119 -0
- data/test/lir.csv +3 -0
- data/test/lir.txt +12 -0
- data/test/lir2.txt +12 -0
- data/test/mul.txt +1 -0
- data/test/ref/artikel.mul +1 -0
- data/test/ref/artikel.non +159 -0
- data/test/ref/artikel.seq +270 -0
- data/test/ref/artikel.syn +16 -0
- data/test/ref/artikel.vec +928 -0
- data/test/ref/artikel.ven +928 -0
- data/test/ref/artikel.ver +928 -0
- data/test/ref/lir.csv +328 -0
- data/test/ref/lir.mul +1 -0
- data/test/ref/lir.non +274 -0
- data/test/ref/lir.seq +249 -0
- data/test/ref/lir.syn +94 -0
- data/test/test_helper.rb +113 -0
- data/test/ts_database.rb +269 -0
- data/test/ts_language.rb +396 -0
- data/txt/artikel-en.txt +157 -0
- data/txt/artikel.txt +170 -0
- data/txt/lir.txt +1317 -0
- metadata +211 -0
data/lingo-all.cfg
ADDED
@@ -0,0 +1,85 @@
+#
+# Lingo configuration
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide the text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)' }
+
+
+    ########################################
+    # Process the content
+    #
+
+    # Split each line into its meaningful units (tokens)
+    - tokenizer: { }
+
+    # Recognize and resolve abbreviations
+#    - abbreviator: { source: 'sys-abk' }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Vary spellings and search again
+#    - variator: { source: 'sys-dic' }
+
+    # Reconstruct hyphenated word completions
+#    - dehyphenizer: { source: 'sys-dic' }
+
+    # Test unrecognized words for compounds
+#    - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+#    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+#    - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert synonym relations
+#    - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output the results
+    #
+
+    # Create a file with extension .non for unrecognized words
+#    - noneword_filter: { in: syn }
+#    - textwriter: { ext: non, sep: "\n" }
+
+    # Create a file with extension .vec for recognized index terms
+#    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+#    - textwriter: { ext: vec, sep: "\n" }
+
+    # Create a file with extension .ven for recognized index terms with absolute frequencies
+#    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+#    - textwriter: { ext: ven, sep: "\n" }
+
+    # Create a file with extension .ver for recognized index terms with relative frequencies
+#    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+#    - textwriter: { ext: ver, sep: "\n" }
+
+    # Create a file with extension .mul for recognized multi-word groups
+#    - vector_filter: { in: syn, lexicals: m }
+#    - textwriter: { ext: mul, sep: "\n" }
+
+    # Create a file with extension .seq for recognized word sequences
+#    - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+#    - textwriter: { ext: seq, sep: "\n" }
+
+    # Create a file with extension .syn for recognized synonyms
+#    - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+#    - textwriter: { ext: syn, sep: "\n" }
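Editor's note: the attendees: list above defines a processing chain — each attendee consumes the stream produced by the one before it (the textreader feeds the tokenizer, the tokenizer feeds the wordsearcher, and so on), and commented-out entries are simply skipped. A minimal sketch of that pipeline idea in plain Ruby, illustrative only (these are not Lingo's actual classes; the sample file name is taken from this gem's txt/ directory):

    # chain of callables applied in order, like the attendees above
    pipeline = [
      ->(file)   { File.readlines(file, chomp: true) },               # textreader
      ->(lines)  { lines.flat_map { |l| l.split(/[^[:alpha:]]+/) } }, # tokenizer
      ->(tokens) { tokens.reject(&:empty?).map(&:downcase) }          # stand-in for dictionary lookup
    ]

    result = pipeline.reduce('txt/artikel.txt') { |data, step| step.call(data) }
    puts result.first(10)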
data/lingo-call.cfg
ADDED
@@ -0,0 +1,15 @@
+---
+meeting:
+  attendees:
+    - textreader: { }
+
+    - tokenizer: { }
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+    - decomposer: { source: 'sys-dic' }
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+    - sequencer: { stopper: 'PUNC,OTHR' }
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: 'syn' }
+    - debugger: { eval: 'true', ceval: 'false', prompt: '' }
+
+    #- vector_filter: { in: 'syn', lexicals: 'y', sort: 'term_abs' }
+    #- textwriter: { ext: 'STDOUT', sep: "\n" }
data/lingo.cfg
ADDED
@@ -0,0 +1,78 @@
+#
+# Lingo configuration
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide the text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)' }
+
+
+    ########################################
+    # Process the content
+    #
+
+    # Split each line into its meaningful units (tokens)
+    - tokenizer: { }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Test unrecognized words for compounds
+    - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+    - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert synonym relations
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    #- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output the results
+    #
+    - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
+    - textwriter: { ext: log, sep: "\n" }
+
+    # Create a file with extension .non for unrecognized words
+    - noneword_filter: { in: syn }
+    - textwriter: { ext: non, sep: "\n" }
+
+    # Create a file with extension .vec for recognized index terms
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - textwriter: { ext: vec, sep: "\n" }
+
+    # Create a file with extension .ven for recognized index terms with absolute frequencies
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+    - textwriter: { ext: ven, sep: "\n" }
+
+    # Create a file with extension .ver for recognized index terms with relative frequencies
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+    - textwriter: { ext: ver, sep: "\n" }
+
+    # Create a file with extension .mul for recognized multi-word groups
+    - vector_filter: { in: syn, lexicals: m }
+    - textwriter: { ext: mul, sep: "\n" }
+
+    # Create a file with extension .seq for recognized word sequences
+    - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+    - textwriter: { ext: seq, sep: "\n" }
+
+    # Create a file with extension .syn for recognized synonyms
+    - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+    - textwriter: { ext: syn, sep: "\n" }
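Editor's note: in the vector_filter entries above, lexicals is a regular expression matched against the one-letter word-class code of each recognized word. '^[ksavem]$' keeps exactly the codes k, s, a, v, e and m, while the later filters select only m (multi-word groups, per the .mul comment), q (word sequences) or y (synonyms). A quick plain-Ruby check of that anchored character class (the code list here is illustrative, not Lingo's full attribute set):

    codes = %w[k s a v e m q y x]
    codes.grep(/^[ksavem]$/)  # => ["k", "s", "a", "v", "e", "m"]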
data/lingo.rb
ADDED
data/lir.cfg
ADDED
@@ -0,0 +1,72 @@
+#
+# Lingo configuration for testing with a LIR file
+#
+# Common patterns are
+#   "^\021(\d+\-\d+)\022"
+#   "^\[(\d+)\.\]"
+#
+---
+meeting:
+
+  attendees:
+
+    ########################################
+    # Provide the text
+    #
+
+    # Read the given file line by line and process it
+    - textreader: { files: '$(files)', lir-record-pattern: '^\[(\d+)\.\]' }
+
+
+    ########################################
+    # Process the content
+    #
+
+    # Split each line into its meaningful units (tokens)
+    - tokenizer: { }
+
+    # Look up the remaining tokens in the dictionary
+    - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+    # Test unrecognized words for compounds
+    - decomposer: { source: 'sys-dic' }
+
+    # Recognize multi-word groups in the stream
+    - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+    # Identify word sequences based on rules
+    - sequencer: { stopper: 'PUNC,OTHR' }
+
+    # Insert synonym relations
+    - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+    ########################################
+    # Display the data stream
+    #
+    - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+    ########################################
+    # Output the results
+    #
+
+    # Create a file with extension .non for unrecognized words
+    - noneword_filter: { in: syn }
+    - textwriter: { ext: non }
+
+    # Create a file with extension .csv for recognized index terms
+    - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+    - textwriter: { ext: csv }
+
+    # Create a file with extension .mul for recognized multi-word groups
+    - vector_filter: { in: syn, lexicals: m }
+    - textwriter: { ext: mul }
+
+    # Create a file with extension .seq for recognized word sequences
+    - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+    - textwriter: { ext: seq }
+
+    # Create a file with extension .syn for recognized synonyms
+    - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+    - textwriter: { ext: syn }
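Editor's note: the lir-record-pattern option tells the textreader how to spot record boundaries in a LIR file; presumably the first capture group supplies the record identifier, as in the two common patterns listed at the top of the file. A quick plain-Ruby check of the second pattern (the sample lines are made up for illustration):

    pattern = /^\[(\d+)\.\]/

    ['[00231.]', 'Ordinary body text'].each do |line|
      if (md = pattern.match(line))
        puts "record boundary, id #{md[1]}"   # => record boundary, id 00231
      else
        puts 'content line'
      end
    end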
data/porter/stem.cfg
ADDED
@@ -0,0 +1,311 @@
+# Stem.cfg
+#
+# Rules for Porter-Stemmer
+#
+#
+# based on:
+# An algorithm for suffix stripping
+#
+# M.F.Porter
+# 1980
+#
+# Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
+# few typos have been corrected.)
+#
+# http://tartarus.org/~martin/PorterStemmer/def.txt
+#
+# --------------------------------------------------
+#
+#
+#
+#
+# 2. THE ALGORITHM
+#
+# To present the suffix stripping algorithm in its entirety we will need a few
+# definitions.
+#
+# A \consonant\ in a word is a letter other than A, E, I, O or U, and other
+# than Y preceded by a consonant. (The fact that the term `consonant' is
+# defined to some extent in terms of itself does not make it ambiguous.) So in
+# TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
+# letter is not a consonant it is a \vowel\.
+#
+# A consonant will be denoted by c, a vowel by v. A list ccc... of length
+# greater than 0 will be denoted by C, and a list vvv... of length greater
+# than 0 will be denoted by V. Any word, or part of a word, therefore has one
+# of the four forms:
+#
+#     CVCV ... C
+#     CVCV ... V
+#     VCVC ... C
+#     VCVC ... V
+#
+# These may all be represented by the single form
+#
+#     [C]VCVC ... [V]
+#
+# where the square brackets denote arbitrary presence of their contents.
+# Using (VC){m} to denote VC repeated m times, this may again be written as
+#
+#     [C](VC){m}[V].
+#
+# m will be called the \measure\ of any word or word part when represented in
+# this form. The case m = 0 covers the null word. Here are some examples:
+#
+#     m=0    TR, EE, TREE, Y, BY.
+#     m=1    TROUBLE, OATS, TREES, IVY.
+#     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+#
+# The \rules\ for removing a suffix will be given in the form
+#
+#     (condition) S1 -> S2
+#
+# This means that if a word ends with the suffix S1, and the stem before S1
+# satisfies the given condition, S1 is replaced by S2. The condition is
+# usually given in terms of m, e.g.
+#
+#     (m > 1) EMENT ->
+#
+# Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
+# since REPLAC is a word part for which m = 2.
+#
+# The `condition' part may also contain the following:
+#
+# *S  - the stem ends with S (and similarly for the other letters).
+#
+# *v* - the stem contains a vowel.
+#
+# *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+#
+# *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+#       -WIL, -HOP).
+#
+# And the condition part may also contain expressions with \and\, \or\ and
+# \not\, so that
+#
+#     (m>1 and (*S or *T))
+#
+# tests for a stem with m>1 ending in S or T, while
+#
+#     (*d and not (*L or *S or *Z))
+#
+# tests for a stem ending with a double consonant other than L, S or Z.
+# Elaborate conditions like this are required only rarely.
+#
+# In a set of rules written beneath each other, only one is obeyed, and this
+# will be the one with the longest matching S1 for the given word. For
+# example, with
+#
+#     SSES -> SS
+#     IES  -> I
+#     SS   -> SS
+#     S    ->
+#
+# (here the conditions are all null) CARESSES maps to CARESS since SSES is
+# the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
+# to CARE (S1=`S').
+#
+#
+---
+stemmer:
+# In the rules below, examples of their application, successful or otherwise,
+# are given on the right in lower case. The algorithm now follows:
+#
+# Step 1a
+#   SSES -> SS                caresses  -> caress
+#   IES  -> I                 ponies    -> poni
+#                             ties      -> ti
+#   SS   -> SS                caress    -> caress
+#   S    ->                   cats      -> cat
+  S100:
+    - SSES -> SS
+    - IES  -> I
+    - SS   -> SS
+    - S    ->
+#
+# Step 1b
+#
+#   (m>0) EED -> EE           feed      -> feed
+#                             agreed    -> agree
+#   (*v*) ED  ->              plastered -> plaster
+#                             bled      -> bled
+#   (*v*) ING ->              motoring  -> motor
+#                             sing      -> sing
+  S110:
+    - (m>0) EED -> EE goto(S120)
+    - (*v*) ED  ->    goto(S111)
+    - (*v*) ING ->    goto(S111)
+    - goto(S120)
+#
+# If the second or third of the rules in Step 1b is successful, the following
+# is done:
+#
+#   AT -> ATE                 conflat(ed)  -> conflate
+#   BL -> BLE                 troubl(ed)   -> trouble
+#   IZ -> IZE                 siz(ed)      -> size
+#   (*d and not (*L or *S or *Z))
+#      -> single letter
+#                             hopp(ing)    -> hop
+#                             tann(ed)     -> tan
+#                             fall(ing)    -> fall
+#                             hiss(ing)    -> hiss
+#                             fizz(ed)     -> fizz
+#   (m=1 and *o) -> E         fail(ing)    -> fail
+#                             fil(ing)     -> file
+  S111:
+    - AT -> ATE
+    - BL -> BLE
+    - IZ -> IZE
+    - (*d and not (*L or *S or *Z)) -> -1
+    - (m=1 and *o) -> E
+#
+# The rule to map to a single letter causes the removal of one of the double
+# letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
+# -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
+# 4.
+#
+# Step 1c
+#
+#   (*v*) Y -> I              happy -> happi
+#                             sky   -> sky
+  S120:
+    - (*v*) Y -> I
+#
+# Step 1 deals with plurals and past participles. The subsequent steps are
+# much more straightforward.
+#
+# Step 2
+#
+#   (m>0) ATIONAL -> ATE      relational     -> relate
+#   (m>0) TIONAL  -> TION     conditional    -> condition
+#                             rational       -> rational
+#   (m>0) ENCI    -> ENCE     valenci        -> valence
+#   (m>0) ANCI    -> ANCE     hesitanci      -> hesitance
+#   (m>0) IZER    -> IZE      digitizer      -> digitize
+#   (m>0) ABLI    -> ABLE     conformabli    -> conformable
+#   (m>0) ALLI    -> AL       radicalli      -> radical
+#   (m>0) ENTLI   -> ENT      differentli    -> different
+#   (m>0) ELI     -> E        vileli         -> vile
+#   (m>0) OUSLI   -> OUS      analogousli    -> analogous
+#   (m>0) IZATION -> IZE      vietnamization -> vietnamize
+#   (m>0) ATION   -> ATE      predication    -> predicate
+#   (m>0) ATOR    -> ATE      operator       -> operate
+#   (m>0) ALISM   -> AL       feudalism      -> feudal
+#   (m>0) IVENESS -> IVE      decisiveness   -> decisive
+#   (m>0) FULNESS -> FUL      hopefulness    -> hopeful
+#   (m>0) OUSNESS -> OUS      callousness    -> callous
+#   (m>0) ALITI   -> AL       formaliti      -> formal
+#   (m>0) IVITI   -> IVE      sensitiviti    -> sensitive
+#   (m>0) BILITI  -> BLE      sensibiliti    -> sensible
+  S200:
+    - (m>0) ATIONAL -> ATE
+    - (m>0) TIONAL  -> TION
+    - (m>0) ENCI    -> ENCE
+    - (m>0) ANCI    -> ANCE
+    - (m>0) IZER    -> IZE
+    - (m>0) ABLI    -> ABLE
+    - (m>0) ALLI    -> AL
+    - (m>0) ENTLI   -> ENT
+    - (m>0) ELI     -> E
+    - (m>0) OUSLI   -> OUS
+    - (m>0) IZATION -> IZE
+    - (m>0) ATION   -> ATE
+    - (m>0) ATOR    -> ATE
+    - (m>0) ALISM   -> AL
+    - (m>0) IVENESS -> IVE
+    - (m>0) FULNESS -> FUL
+    - (m>0) OUSNESS -> OUS
+    - (m>0) ALITI   -> AL
+    - (m>0) IVITI   -> IVE
+    - (m>0) BILITI  -> BLE
+#
+# The test for the string S1 can be made fast by doing a program switch on
+# the penultimate letter of the word being tested. This gives a fairly even
+# breakdown of the possible values of the string S1. It will be seen in fact
+# that the S1-strings in step 2 are presented here in the alphabetical order
+# of their penultimate letter. Similar techniques may be applied in the other
+# steps.
+#
+# Step 3
+#
+#   (m>0) ICATE -> IC         triplicate  -> triplic
+#   (m>0) ATIVE ->            formative   -> form
+#   (m>0) ALIZE -> AL         formalize   -> formal
+#   (m>0) ICITI -> IC         electriciti -> electric
+#   (m>0) ICAL  -> IC         electrical  -> electric
+#   (m>0) FUL   ->            hopeful     -> hope
+#   (m>0) NESS  ->            goodness    -> good
+  S300:
+    - (m>0) ICATE -> IC
+    - (m>0) ATIVE ->
+    - (m>0) ALIZE -> AL
+    - (m>0) ICITI -> IC
+    - (m>0) ICAL  -> IC
+    - (m>0) FUL   ->
+    - (m>0) NESS  ->
+#
+# Step 4
+#
+#   (m>1) AL    ->            revival     -> reviv
+#   (m>1) ANCE  ->            allowance   -> allow
+#   (m>1) ENCE  ->            inference   -> infer
+#   (m>1) ER    ->            airliner    -> airlin
+#   (m>1) IC    ->            gyroscopic  -> gyroscop
+#   (m>1) ABLE  ->            adjustable  -> adjust
+#   (m>1) IBLE  ->            defensible  -> defens
+#   (m>1) ANT   ->            irritant    -> irrit
+#   (m>1) EMENT ->            replacement -> replac
+#   (m>1) MENT  ->            adjustment  -> adjust
+#   (m>1) ENT   ->            dependent   -> depend
+#   (m>1 and (*S or *T)) ION ->   adoption -> adopt
+#   (m>1) OU    ->            homologou   -> homolog
+#   (m>1) ISM   ->            communism   -> commun
+#   (m>1) ATE   ->            activate    -> activ
+#   (m>1) ITI   ->            angulariti  -> angular
+#   (m>1) OUS   ->            homologous  -> homolog
+#   (m>1) IVE   ->            effective   -> effect
+#   (m>1) IZE   ->            bowdlerize  -> bowdler
+  S400:
+    - (m>1) AL    ->
+    - (m>1) ANCE  ->
+    - (m>1) ENCE  ->
+    - (m>1) ER    ->
+    - (m>1) IC    ->
+    - (m>1) ABLE  ->
+    - (m>1) IBLE  ->
+    - (m>1) ANT   ->
+    - (m>1) EMENT ->
+    - (m>1) MENT  ->
+    - (m>1) ENT   ->
+    - (m>1 and (*S or *T)) ION ->
+    - (m>1) OU    ->
+    - (m>1) ISM   ->
+    - (m>1) ATE   ->
+    - (m>1) ITI   ->
+    - (m>1) OUS   ->
+    - (m>1) IVE   ->
+    - (m>1) IZE   ->
+#
+# The suffixes are now removed. All that remains is a little tidying up.
+#
+# Step 5a
+#
+#   (m>1) E ->                probate -> probat
+#                             rate    -> rate
+#   (m=1 and not *o) E ->     cease   -> ceas
+  S500:
+    - (m>1) E ->
+    - (m=1 and not *o) E ->
+#
+# Step 5b
+#
+#   (m > 1 and *d and *L) -> single letter
+#                             controll -> control
+#                             roll     -> roll
+  S510:
+    - (m > 1 and *d and *L) -> -1
+#
+# The algorithm is careful not to remove a suffix when the stem is too short,
+# the length of the stem being given by its measure, m. There is no linguistic
+# basis for this approach. It was merely observed that m could be used quite
+# effectively to help decide whether or not it was wise to take off a suffix.
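Editor's note: two notions carry most of the weight in the rules above — the measure m and longest-suffix matching within a rule block. A compact plain-Ruby sketch of both, illustrative only (the gem's actual interpreter for this file is porter/stem.rb):

    # classify letters as consonant (c) or vowel (v); per the definition above,
    # Y counts as a vowel only when the preceding letter was classified a consonant
    def cv_form(word)
      form = ''
      word.downcase.each_char do |ch|
        form << ('aeiou'.include?(ch) || (ch == 'y' && form[-1] == 'c') ? 'v' : 'c')
      end
      form
    end

    # m = number of VC alternations in [C](VC){m}[V]
    def measure(word)
      cv_form(word).squeeze.scan('vc').size
    end

    measure('tree')     # => 0
    measure('trouble')  # => 1
    measure('private')  # => 2

    # Step 1a as data: within one block, only the longest matching S1 fires
    RULES_1A = { 'sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '' }

    def step_1a(word)
      s1 = RULES_1A.keys.select { |s| word.end_with?(s) }.max_by(&:length)
      s1 ? word[0...-s1.length] + RULES_1A[s1] : word
    end

    step_1a('caresses')  # => "caress"
    step_1a('ponies')    # => "poni"
    step_1a('caress')    # => "caress"
    step_1a('cats')      # => "cat"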