lingo 1.8.0

Files changed (108)
  1. data/.rspec +1 -0
  2. data/COPYING +663 -0
  3. data/ChangeLog +754 -0
  4. data/README +322 -0
  5. data/Rakefile +100 -0
  6. data/TODO +28 -0
  7. data/bin/lingo +5 -0
  8. data/bin/lingoctl +6 -0
  9. data/de.lang +121 -0
  10. data/de/lingo-abk.txt +74 -0
  11. data/de/lingo-dic.txt +56822 -0
  12. data/de/lingo-mul.txt +3209 -0
  13. data/de/lingo-syn.txt +14841 -0
  14. data/de/test_dic.txt +24 -0
  15. data/de/test_mul.txt +17 -0
  16. data/de/test_mul2.txt +2 -0
  17. data/de/test_singleword.txt +2 -0
  18. data/de/test_syn.txt +4 -0
  19. data/de/test_syn2.txt +1 -0
  20. data/de/user-dic.txt +10 -0
  21. data/en.lang +113 -0
  22. data/en/lingo-dic.txt +55434 -0
  23. data/en/lingo-mul.txt +456 -0
  24. data/en/user-dic.txt +5 -0
  25. data/info/Objekte.png +0 -0
  26. data/info/Typen.png +0 -0
  27. data/info/database.png +0 -0
  28. data/info/db_small.png +0 -0
  29. data/info/download.png +0 -0
  30. data/info/gpl-hdr.txt +27 -0
  31. data/info/kerze.png +0 -0
  32. data/info/language.png +0 -0
  33. data/info/lingo.png +0 -0
  34. data/info/logo.png +0 -0
  35. data/info/meeting.png +0 -0
  36. data/info/types.png +0 -0
  37. data/lib/lingo.rb +321 -0
  38. data/lib/lingo/attendee/abbreviator.rb +119 -0
  39. data/lib/lingo/attendee/debugger.rb +111 -0
  40. data/lib/lingo/attendee/decomposer.rb +101 -0
  41. data/lib/lingo/attendee/dehyphenizer.rb +167 -0
  42. data/lib/lingo/attendee/multiworder.rb +301 -0
  43. data/lib/lingo/attendee/noneword_filter.rb +103 -0
  44. data/lib/lingo/attendee/objectfilter.rb +86 -0
  45. data/lib/lingo/attendee/sequencer.rb +190 -0
  46. data/lib/lingo/attendee/synonymer.rb +105 -0
  47. data/lib/lingo/attendee/textreader.rb +237 -0
  48. data/lib/lingo/attendee/textwriter.rb +196 -0
  49. data/lib/lingo/attendee/tokenizer.rb +218 -0
  50. data/lib/lingo/attendee/variator.rb +185 -0
  51. data/lib/lingo/attendee/vector_filter.rb +158 -0
  52. data/lib/lingo/attendee/wordsearcher.rb +96 -0
  53. data/lib/lingo/attendees.rb +289 -0
  54. data/lib/lingo/cli.rb +62 -0
  55. data/lib/lingo/config.rb +104 -0
  56. data/lib/lingo/const.rb +131 -0
  57. data/lib/lingo/ctl.rb +173 -0
  58. data/lib/lingo/database.rb +587 -0
  59. data/lib/lingo/language.rb +530 -0
  60. data/lib/lingo/modules.rb +98 -0
  61. data/lib/lingo/types.rb +285 -0
  62. data/lib/lingo/utilities.rb +40 -0
  63. data/lib/lingo/version.rb +27 -0
  64. data/lingo-all.cfg +85 -0
  65. data/lingo-call.cfg +15 -0
  66. data/lingo.cfg +78 -0
  67. data/lingo.rb +3 -0
  68. data/lir.cfg +72 -0
  69. data/porter/stem.cfg +311 -0
  70. data/porter/stem.rb +150 -0
  71. data/spec/spec_helper.rb +0 -0
  72. data/test.cfg +79 -0
  73. data/test/attendee/ts_abbreviator.rb +35 -0
  74. data/test/attendee/ts_decomposer.rb +31 -0
  75. data/test/attendee/ts_multiworder.rb +390 -0
  76. data/test/attendee/ts_noneword_filter.rb +19 -0
  77. data/test/attendee/ts_objectfilter.rb +19 -0
  78. data/test/attendee/ts_sequencer.rb +43 -0
  79. data/test/attendee/ts_synonymer.rb +33 -0
  80. data/test/attendee/ts_textreader.rb +58 -0
  81. data/test/attendee/ts_textwriter.rb +98 -0
  82. data/test/attendee/ts_tokenizer.rb +32 -0
  83. data/test/attendee/ts_variator.rb +24 -0
  84. data/test/attendee/ts_vector_filter.rb +62 -0
  85. data/test/attendee/ts_wordsearcher.rb +119 -0
  86. data/test/lir.csv +3 -0
  87. data/test/lir.txt +12 -0
  88. data/test/lir2.txt +12 -0
  89. data/test/mul.txt +1 -0
  90. data/test/ref/artikel.mul +1 -0
  91. data/test/ref/artikel.non +159 -0
  92. data/test/ref/artikel.seq +270 -0
  93. data/test/ref/artikel.syn +16 -0
  94. data/test/ref/artikel.vec +928 -0
  95. data/test/ref/artikel.ven +928 -0
  96. data/test/ref/artikel.ver +928 -0
  97. data/test/ref/lir.csv +328 -0
  98. data/test/ref/lir.mul +1 -0
  99. data/test/ref/lir.non +274 -0
  100. data/test/ref/lir.seq +249 -0
  101. data/test/ref/lir.syn +94 -0
  102. data/test/test_helper.rb +113 -0
  103. data/test/ts_database.rb +269 -0
  104. data/test/ts_language.rb +396 -0
  105. data/txt/artikel-en.txt +157 -0
  106. data/txt/artikel.txt +170 -0
  107. data/txt/lir.txt +1317 -0
  108. metadata +211 -0
data/lingo-all.cfg ADDED
@@ -0,0 +1,85 @@
+ #
+ # Lingo configuration
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the given file line by line and process it
+     - textreader: { files: '$(files)' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Break each line into individual meaningful units (tokens)
+     - tokenizer: { }
+
+     # Recognize and resolve abbreviations
+     # - abbreviator: { source: 'sys-abk' }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Vary spellings and search again
+     # - variator: { source: 'sys-dic' }
+
+     # Reconstruct hyphenated completions
+     # - dehyphenizer: { source: 'sys-dic' }
+
+     # Test unrecognized words for compounds
+     # - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     # - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     # - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     # - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+
+     # Create a file with extension .non for unrecognized words
+     # - noneword_filter: { in: syn }
+     # - textwriter: { ext: non, sep: "\n" }
+
+     # Create a file with extension .vec for recognized index terms
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     # - textwriter: { ext: vec, sep: "\n" }
+
+     # Create a file with extension .ven for recognized index terms with absolute frequencies
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+     # - textwriter: { ext: ven, sep: "\n" }
+
+     # Create a file with extension .ver for recognized index terms with relative frequencies
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+     # - textwriter: { ext: ver, sep: "\n" }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     # - vector_filter: { in: syn, lexicals: m }
+     # - textwriter: { ext: mul, sep: "\n" }
+
+     # Create a file with extension .seq for recognized word sequences
+     # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     # - textwriter: { ext: seq, sep: "\n" }
+
+     # Create a file with extension .syn for recognized synonyms
+     # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     # - textwriter: { ext: syn, sep: "\n" }
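The `attendees:` list above wires a processing chain: each attendee receives objects from its predecessor, transforms or annotates them, and forwards them downstream. A minimal Ruby sketch of that pattern (toy classes for illustration only, not Lingo's actual attendee API, which lives in data/lib/lingo/attendee/*.rb):

    # Toy illustration of the attendee chain described by the config above.
    class Attendee
      attr_accessor :successor

      # Hand a processed object to the next attendee in the chain, if any.
      def forward(obj)
        successor.process(obj) if successor
      end
    end

    class Upcaser < Attendee        # hypothetical stand-in for e.g. tokenizer
      def process(token)
        forward(token.upcase)
      end
    end

    class Printer < Attendee        # hypothetical stand-in for e.g. textwriter
      def process(token)
        puts token
      end
    end

    # Wire the attendees in sequence, as the attendees: list does.
    upcaser = Upcaser.new
    upcaser.successor = Printer.new
    %w[hello world].each { |t| upcaser.process(t) }  # prints HELLO, WORLD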
data/lingo-call.cfg ADDED
@@ -0,0 +1,15 @@
+ ---
+ meeting:
+   attendees:
+     - textreader: { }
+
+     - tokenizer: { }
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+     - decomposer: { source: 'sys-dic' }
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+     - sequencer: { stopper: 'PUNC,OTHR' }
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: 'syn' }
+     - debugger: { eval: 'true', ceval: 'false', prompt: '' }
+
+     #- vector_filter: { in: 'syn', lexicals: 'y', sort: 'term_abs' }
+     #- textwriter: { ext: 'STDOUT', sep: "\n" }
data/lingo.cfg ADDED
@@ -0,0 +1,78 @@
+ #
+ # Lingo configuration
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the given file line by line and process it
+     - textreader: { files: '$(files)' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Break each line into individual meaningful units (tokens)
+     - tokenizer: { }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Test unrecognized words for compounds
+     - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     #- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+     - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
+     - textwriter: { ext: log, sep: "\n" }
+
+     # Create a file with extension .non for unrecognized words
+     - noneword_filter: { in: syn }
+     - textwriter: { ext: non, sep: "\n" }
+
+     # Create a file with extension .vec for recognized index terms
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     - textwriter: { ext: vec, sep: "\n" }
+
+     # Create a file with extension .ven for recognized index terms with absolute frequencies
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+     - textwriter: { ext: ven, sep: "\n" }
+
+     # Create a file with extension .ver for recognized index terms with relative frequencies
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+     - textwriter: { ext: ver, sep: "\n" }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     - vector_filter: { in: syn, lexicals: m }
+     - textwriter: { ext: mul, sep: "\n" }
+
+     # Create a file with extension .seq for recognized word sequences
+     - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     - textwriter: { ext: seq, sep: "\n" }
+
+     # Create a file with extension .syn for recognized synonyms
+     - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     - textwriter: { ext: syn, sep: "\n" }
data/lingo.rb ADDED
@@ -0,0 +1,3 @@
+ require_relative 'lib/lingo'
+
+ Lingo.talk if $0 == __FILE__
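This three-line launcher loads the library and starts Lingo only when the file is run directly: `$0` holds the name of the executed script while `__FILE__` is the current source file, so the guard is false when the file is merely required. The same standard Ruby idiom in isolation (file name hypothetical):

    # demo.rb -- hypothetical file demonstrating the guard used in data/lingo.rb
    def main
      puts 'executed directly'
    end

    # True for `ruby demo.rb`; false when another script requires this file.
    main if $0 == __FILE__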
data/lir.cfg ADDED
@@ -0,0 +1,72 @@
+ #
+ # Lingo configuration for testing with a LIR file
+ #
+ # Common patterns are
+ #   "^\021(\d+\-\d+)\022"
+ #   "^\[(\d+)\.\]"
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the given file line by line and process it
+     - textreader: { files: '$(files)', lir-record-pattern: '^\[(\d+)\.\]' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Break each line into individual meaningful units (tokens)
+     - tokenizer: { }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Test unrecognized words for compounds
+     - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+
+     # Create a file with extension .non for unrecognized words
+     - noneword_filter: { in: syn }
+     - textwriter: { ext: non }
+
+     # Create a file with extension .csv for recognized index terms
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     - textwriter: { ext: csv }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     - vector_filter: { in: syn, lexicals: m }
+     - textwriter: { ext: mul }
+
+     # Create a file with extension .seq for recognized word sequences
+     - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     - textwriter: { ext: seq }
+
+     # Create a file with extension .syn for recognized synonyms
+     - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     - textwriter: { ext: syn }
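The `lir-record-pattern` above tells the textreader where each LIR record begins; the capture group extracts the record identifier. A quick Ruby check of that pattern against a made-up record header (the sample line is hypothetical, the regex is taken verbatim from lir.cfg):

    pattern = /^\[(\d+)\.\]/
    line    = '[42.] Some document title'

    if (m = pattern.match(line))
      puts m[1]  #=> "42" -- the captured record number
    end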
data/porter/stem.cfg ADDED
@@ -0,0 +1,311 @@
+ # Stem.cfg
+ #
+ # Rules for the Porter stemmer
+ #
+ #
+ # based on:
+ # An algorithm for suffix stripping
+ #
+ # M.F. Porter
+ # 1980
+ #
+ # Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
+ # few typos have been corrected.)
+ #
+ # http://tartarus.org/~martin/PorterStemmer/def.txt
+ #
+ # --------------------------------------------------
+ #
+ #
+ #
+ #
+ # 2. THE ALGORITHM
+ #
+ # To present the suffix stripping algorithm in its entirety we will need a few
+ # definitions.
+ #
+ # A \consonant\ in a word is a letter other than A, E, I, O or U, and other
+ # than Y preceded by a consonant. (The fact that the term `consonant' is
+ # defined to some extent in terms of itself does not make it ambiguous.) So in
+ # TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
+ # letter is not a consonant it is a \vowel\.
+ #
+ # A consonant will be denoted by c, a vowel by v. A list ccc... of length
+ # greater than 0 will be denoted by C, and a list vvv... of length greater
+ # than 0 will be denoted by V. Any word, or part of a word, therefore has one
+ # of the four forms:
+ #
+ #     CVCV ... C
+ #     CVCV ... V
+ #     VCVC ... C
+ #     VCVC ... V
+ #
+ # These may all be represented by the single form
+ #
+ #     [C]VCVC ... [V]
+ #
+ # where the square brackets denote arbitrary presence of their contents.
+ # Using (VC){m} to denote VC repeated m times, this may again be written as
+ #
+ #     [C](VC){m}[V].
+ #
+ # m will be called the \measure\ of any word or word part when represented in
+ # this form. The case m = 0 covers the null word. Here are some examples:
+ #
+ #     m=0    TR, EE, TREE, Y, BY.
+ #     m=1    TROUBLE, OATS, TREES, IVY.
+ #     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+ #
+ # The \rules\ for removing a suffix will be given in the form
+ #
+ #     (condition) S1 -> S2
+ #
+ # This means that if a word ends with the suffix S1, and the stem before S1
+ # satisfies the given condition, S1 is replaced by S2. The condition is
+ # usually given in terms of m, e.g.
+ #
+ #     (m > 1) EMENT ->
+ #
+ # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
+ # since REPLAC is a word part for which m = 2.
+ #
+ # The `condition' part may also contain the following:
+ #
+ # *S  - the stem ends with S (and similarly for the other letters).
+ #
+ # *v* - the stem contains a vowel.
+ #
+ # *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+ #
+ # *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+ #       -WIL, -HOP).
+ #
+ # And the condition part may also contain expressions with \and\, \or\ and
+ # \not\, so that
+ #
+ #     (m>1 and (*S or *T))
+ #
+ # tests for a stem with m>1 ending in S or T, while
+ #
+ #     (*d and not (*L or *S or *Z))
+ #
+ # tests for a stem ending with a double consonant other than L, S or Z.
+ # Elaborate conditions like this are required only rarely.
+ #
+ # In a set of rules written beneath each other, only one is obeyed, and this
+ # will be the one with the longest matching S1 for the given word. For
+ # example, with
+ #
+ #     SSES -> SS
+ #     IES  -> I
+ #     SS   -> SS
+ #     S    ->
+ #
+ # (here the conditions are all null) CARESSES maps to CARESS since SSES is
+ # the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
+ # to CARE (S1=`S').
+ #
+ #
+ ---
+ stemmer:
+ # In the rules below, examples of their application, successful or otherwise,
+ # are given on the right in lower case. The algorithm now follows:
+ #
+ # Step 1a
+ #     SSES -> SS         caresses -> caress
+ #     IES  -> I          ponies   -> poni
+ #                        ties     -> ti
+ #     SS   -> SS         caress   -> caress
+ #     S    ->            cats     -> cat
+   S100:
+     - SSES -> SS
+     - IES -> I
+     - SS -> SS
+     - S ->
+ #
+ # Step 1b
+ #
+ #     (m>0) EED -> EE    feed      -> feed
+ #                        agreed    -> agree
+ #     (*v*) ED  ->       plastered -> plaster
+ #                        bled      -> bled
+ #     (*v*) ING ->       motoring  -> motor
+ #                        sing      -> sing
+   S110:
+     - (m>0) EED -> EE goto(S120)
+     - (*v*) ED -> goto(S111)
+     - (*v*) ING -> goto(S111)
+     - goto(S120)
+ #
+ # If the second or third of the rules in Step 1b is successful, the following
+ # is done:
+ #
+ #     AT -> ATE          conflat(ed) -> conflate
+ #     BL -> BLE          troubl(ed)  -> trouble
+ #     IZ -> IZE          siz(ed)     -> size
+ #     (*d and not (*L or *S or *Z))
+ #        -> single letter
+ #                        hopp(ing)   -> hop
+ #                        tann(ed)    -> tan
+ #                        fall(ing)   -> fall
+ #                        hiss(ing)   -> hiss
+ #                        fizz(ed)    -> fizz
+ #     (m=1 and *o) -> E  fail(ing)   -> fail
+ #                        fil(ing)    -> file
+   S111:
+     - AT -> ATE
+     - BL -> BLE
+     - IZ -> IZE
+     - (*d and not (*L or *S or *Z)) -> -1
+     - (m=1 and *o) -> E
+ #
+ # The rule to map to a single letter causes the removal of one of the double
+ # letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
+ # -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
+ # 4.
+ #
+ # Step 1c
+ #
+ #     (*v*) Y -> I       happy -> happi
+ #                        sky   -> sky
+   S120:
+     - (*v*) Y -> I
+ #
+ # Step 1 deals with plurals and past participles. The subsequent steps are
+ # much more straightforward.
+ #
+ # Step 2
+ #
+ #     (m>0) ATIONAL -> ATE     relational     -> relate
+ #     (m>0) TIONAL  -> TION    conditional    -> condition
+ #                              rational       -> rational
+ #     (m>0) ENCI    -> ENCE    valenci        -> valence
+ #     (m>0) ANCI    -> ANCE    hesitanci      -> hesitance
+ #     (m>0) IZER    -> IZE     digitizer      -> digitize
+ #     (m>0) ABLI    -> ABLE    conformabli    -> conformable
+ #     (m>0) ALLI    -> AL      radicalli      -> radical
+ #     (m>0) ENTLI   -> ENT     differentli    -> different
+ #     (m>0) ELI     -> E       vileli         -> vile
+ #     (m>0) OUSLI   -> OUS     analogousli    -> analogous
+ #     (m>0) IZATION -> IZE     vietnamization -> vietnamize
+ #     (m>0) ATION   -> ATE     predication    -> predicate
+ #     (m>0) ATOR    -> ATE     operator       -> operate
+ #     (m>0) ALISM   -> AL      feudalism      -> feudal
+ #     (m>0) IVENESS -> IVE     decisiveness   -> decisive
+ #     (m>0) FULNESS -> FUL     hopefulness    -> hopeful
+ #     (m>0) OUSNESS -> OUS     callousness    -> callous
+ #     (m>0) ALITI   -> AL      formaliti      -> formal
+ #     (m>0) IVITI   -> IVE     sensitiviti    -> sensitive
+ #     (m>0) BILITI  -> BLE     sensibiliti    -> sensible
+   S200:
+     - (m>0) ATIONAL -> ATE
+     - (m>0) TIONAL -> TION
+     - (m>0) ENCI -> ENCE
+     - (m>0) ANCI -> ANCE
+     - (m>0) IZER -> IZE
+     - (m>0) ABLI -> ABLE
+     - (m>0) ALLI -> AL
+     - (m>0) ENTLI -> ENT
+     - (m>0) ELI -> E
+     - (m>0) OUSLI -> OUS
+     - (m>0) IZATION -> IZE
+     - (m>0) ATION -> ATE
+     - (m>0) ATOR -> ATE
+     - (m>0) ALISM -> AL
+     - (m>0) IVENESS -> IVE
+     - (m>0) FULNESS -> FUL
+     - (m>0) OUSNESS -> OUS
+     - (m>0) ALITI -> AL
+     - (m>0) IVITI -> IVE
+     - (m>0) BILITI -> BLE
+ #
+ # The test for the string S1 can be made fast by doing a program switch on
+ # the penultimate letter of the word being tested. This gives a fairly even
+ # breakdown of the possible values of the string S1. It will be seen in fact
+ # that the S1-strings in step 2 are presented here in the alphabetical order
+ # of their penultimate letter. Similar techniques may be applied in the other
+ # steps.
+ #
+ # Step 3
+ #
+ #     (m>0) ICATE -> IC    triplicate  -> triplic
+ #     (m>0) ATIVE ->       formative   -> form
+ #     (m>0) ALIZE -> AL    formalize   -> formal
+ #     (m>0) ICITI -> IC    electriciti -> electric
+ #     (m>0) ICAL  -> IC    electrical  -> electric
+ #     (m>0) FUL   ->       hopeful     -> hope
+ #     (m>0) NESS  ->       goodness    -> good
+   S300:
+     - (m>0) ICATE -> IC
+     - (m>0) ATIVE ->
+     - (m>0) ALIZE -> AL
+     - (m>0) ICITI -> IC
+     - (m>0) ICAL -> IC
+     - (m>0) FUL ->
+     - (m>0) NESS ->
+ #
+ # Step 4
+ #
+ #     (m>1) AL    ->    revival     -> reviv
+ #     (m>1) ANCE  ->    allowance   -> allow
+ #     (m>1) ENCE  ->    inference   -> infer
+ #     (m>1) ER    ->    airliner    -> airlin
+ #     (m>1) IC    ->    gyroscopic  -> gyroscop
+ #     (m>1) ABLE  ->    adjustable  -> adjust
+ #     (m>1) IBLE  ->    defensible  -> defens
+ #     (m>1) ANT   ->    irritant    -> irrit
+ #     (m>1) EMENT ->    replacement -> replac
+ #     (m>1) MENT  ->    adjustment  -> adjust
+ #     (m>1) ENT   ->    dependent   -> depend
+ #     (m>1 and (*S or *T)) ION ->   adoption -> adopt
+ #     (m>1) OU    ->    homologou   -> homolog
+ #     (m>1) ISM   ->    communism   -> commun
+ #     (m>1) ATE   ->    activate    -> activ
+ #     (m>1) ITI   ->    angulariti  -> angular
+ #     (m>1) OUS   ->    homologous  -> homolog
+ #     (m>1) IVE   ->    effective   -> effect
+ #     (m>1) IZE   ->    bowdlerize  -> bowdler
+   S400:
+     - (m>1) AL ->
+     - (m>1) ANCE ->
+     - (m>1) ENCE ->
+     - (m>1) ER ->
+     - (m>1) IC ->
+     - (m>1) ABLE ->
+     - (m>1) IBLE ->
+     - (m>1) ANT ->
+     - (m>1) EMENT ->
+     - (m>1) MENT ->
+     - (m>1) ENT ->
+     - (m>1 and (*S or *T)) ION ->
+     - (m>1) OU ->
+     - (m>1) ISM ->
+     - (m>1) ATE ->
+     - (m>1) ITI ->
+     - (m>1) OUS ->
+     - (m>1) IVE ->
+     - (m>1) IZE ->
+ #
+ # The suffixes are now removed. All that remains is a little tidying up.
+ #
+ # Step 5a
+ #
+ #     (m>1) E ->             probate -> probat
+ #                            rate    -> rate
+ #     (m=1 and not *o) E ->  cease   -> ceas
+   S500:
+     - (m>1) E ->
+     - (m=1 and not *o) E ->
+ #
+ # Step 5b
+ #
+ #     (m > 1 and *d and *L) -> single letter
+ #                            controll -> control
+ #                            roll     -> roll
+   S510:
+     - (m > 1 and *d and *L) -> -1
+ #
+ # The algorithm is careful not to remove a suffix when the stem is too short,
+ # the length of the stem being given by its measure, m. There is no linguistic
+ # basis for this approach. It was merely observed that m could be used quite
+ # effectively to help decide whether or not it was wise to take off a suffix.
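Since every condition in these rules hinges on the measure m, here is a short Ruby sketch (an illustration of the two definitions quoted above, not the gem's own implementation in data/porter/stem.rb): a consonant is any letter other than a, e, i, o, u, and other than y preceded by a consonant, and m counts the VC sequences in the form [C](VC){m}[V].

    # True if the letter at index i of word is a consonant per Porter's definition.
    def consonant?(word, i)
      case word[i]
      when 'a', 'e', 'i', 'o', 'u' then false
      when 'y' then i.zero? || !consonant?(word, i - 1)  # y after a consonant acts as a vowel
      else true
      end
    end

    # The measure m: collapse the word to runs of c/v, then count 'vc' pairs.
    def measure(word)
      flags = (0...word.length).map { |i| consonant?(word, i) ? 'c' : 'v' }
      flags.join.squeeze.scan('vc').length
    end

    measure('tree')     #=> 0, matching the m=0 examples above
    measure('trouble')  #=> 1
    measure('troubles') #=> 2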