lingo 1.8.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. data/.rspec +1 -0
  2. data/COPYING +663 -0
  3. data/ChangeLog +754 -0
  4. data/README +322 -0
  5. data/Rakefile +100 -0
  6. data/TODO +28 -0
  7. data/bin/lingo +5 -0
  8. data/bin/lingoctl +6 -0
  9. data/de.lang +121 -0
  10. data/de/lingo-abk.txt +74 -0
  11. data/de/lingo-dic.txt +56822 -0
  12. data/de/lingo-mul.txt +3209 -0
  13. data/de/lingo-syn.txt +14841 -0
  14. data/de/test_dic.txt +24 -0
  15. data/de/test_mul.txt +17 -0
  16. data/de/test_mul2.txt +2 -0
  17. data/de/test_singleword.txt +2 -0
  18. data/de/test_syn.txt +4 -0
  19. data/de/test_syn2.txt +1 -0
  20. data/de/user-dic.txt +10 -0
  21. data/en.lang +113 -0
  22. data/en/lingo-dic.txt +55434 -0
  23. data/en/lingo-mul.txt +456 -0
  24. data/en/user-dic.txt +5 -0
  25. data/info/Objekte.png +0 -0
  26. data/info/Typen.png +0 -0
  27. data/info/database.png +0 -0
  28. data/info/db_small.png +0 -0
  29. data/info/download.png +0 -0
  30. data/info/gpl-hdr.txt +27 -0
  31. data/info/kerze.png +0 -0
  32. data/info/language.png +0 -0
  33. data/info/lingo.png +0 -0
  34. data/info/logo.png +0 -0
  35. data/info/meeting.png +0 -0
  36. data/info/types.png +0 -0
  37. data/lib/lingo.rb +321 -0
  38. data/lib/lingo/attendee/abbreviator.rb +119 -0
  39. data/lib/lingo/attendee/debugger.rb +111 -0
  40. data/lib/lingo/attendee/decomposer.rb +101 -0
  41. data/lib/lingo/attendee/dehyphenizer.rb +167 -0
  42. data/lib/lingo/attendee/multiworder.rb +301 -0
  43. data/lib/lingo/attendee/noneword_filter.rb +103 -0
  44. data/lib/lingo/attendee/objectfilter.rb +86 -0
  45. data/lib/lingo/attendee/sequencer.rb +190 -0
  46. data/lib/lingo/attendee/synonymer.rb +105 -0
  47. data/lib/lingo/attendee/textreader.rb +237 -0
  48. data/lib/lingo/attendee/textwriter.rb +196 -0
  49. data/lib/lingo/attendee/tokenizer.rb +218 -0
  50. data/lib/lingo/attendee/variator.rb +185 -0
  51. data/lib/lingo/attendee/vector_filter.rb +158 -0
  52. data/lib/lingo/attendee/wordsearcher.rb +96 -0
  53. data/lib/lingo/attendees.rb +289 -0
  54. data/lib/lingo/cli.rb +62 -0
  55. data/lib/lingo/config.rb +104 -0
  56. data/lib/lingo/const.rb +131 -0
  57. data/lib/lingo/ctl.rb +173 -0
  58. data/lib/lingo/database.rb +587 -0
  59. data/lib/lingo/language.rb +530 -0
  60. data/lib/lingo/modules.rb +98 -0
  61. data/lib/lingo/types.rb +285 -0
  62. data/lib/lingo/utilities.rb +40 -0
  63. data/lib/lingo/version.rb +27 -0
  64. data/lingo-all.cfg +85 -0
  65. data/lingo-call.cfg +15 -0
  66. data/lingo.cfg +78 -0
  67. data/lingo.rb +3 -0
  68. data/lir.cfg +72 -0
  69. data/porter/stem.cfg +311 -0
  70. data/porter/stem.rb +150 -0
  71. data/spec/spec_helper.rb +0 -0
  72. data/test.cfg +79 -0
  73. data/test/attendee/ts_abbreviator.rb +35 -0
  74. data/test/attendee/ts_decomposer.rb +31 -0
  75. data/test/attendee/ts_multiworder.rb +390 -0
  76. data/test/attendee/ts_noneword_filter.rb +19 -0
  77. data/test/attendee/ts_objectfilter.rb +19 -0
  78. data/test/attendee/ts_sequencer.rb +43 -0
  79. data/test/attendee/ts_synonymer.rb +33 -0
  80. data/test/attendee/ts_textreader.rb +58 -0
  81. data/test/attendee/ts_textwriter.rb +98 -0
  82. data/test/attendee/ts_tokenizer.rb +32 -0
  83. data/test/attendee/ts_variator.rb +24 -0
  84. data/test/attendee/ts_vector_filter.rb +62 -0
  85. data/test/attendee/ts_wordsearcher.rb +119 -0
  86. data/test/lir.csv +3 -0
  87. data/test/lir.txt +12 -0
  88. data/test/lir2.txt +12 -0
  89. data/test/mul.txt +1 -0
  90. data/test/ref/artikel.mul +1 -0
  91. data/test/ref/artikel.non +159 -0
  92. data/test/ref/artikel.seq +270 -0
  93. data/test/ref/artikel.syn +16 -0
  94. data/test/ref/artikel.vec +928 -0
  95. data/test/ref/artikel.ven +928 -0
  96. data/test/ref/artikel.ver +928 -0
  97. data/test/ref/lir.csv +328 -0
  98. data/test/ref/lir.mul +1 -0
  99. data/test/ref/lir.non +274 -0
  100. data/test/ref/lir.seq +249 -0
  101. data/test/ref/lir.syn +94 -0
  102. data/test/test_helper.rb +113 -0
  103. data/test/ts_database.rb +269 -0
  104. data/test/ts_language.rb +396 -0
  105. data/txt/artikel-en.txt +157 -0
  106. data/txt/artikel.txt +170 -0
  107. data/txt/lir.txt +1317 -0
  108. metadata +211 -0
data/lingo-all.cfg ADDED
@@ -0,0 +1,85 @@
+ #
+ # Lingo configuration
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the specified file line by line and process it
+     - textreader: { files: '$(files)' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Split each line into its meaningful units (tokens)
+     - tokenizer: { }
+
+     # Recognize and resolve abbreviations
+     # - abbreviator: { source: 'sys-abk' }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Vary spellings and search again
+     # - variator: { source: 'sys-dic' }
+
+     # Reconstruct hyphenated word completions
+     # - dehyphenizer: { source: 'sys-dic' }
+
+     # Test unrecognized words for compounds
+     # - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     # - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     # - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     # - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+
+     # Create a file with extension .non for unrecognized words
+     # - noneword_filter: { in: syn }
+     # - textwriter: { ext: non, sep: "\n" }
+
+     # Create a file with extension .vec for recognized index terms
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     # - textwriter: { ext: vec, sep: "\n" }
+
+     # Create a file with extension .ven for recognized index terms with absolute frequency
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+     # - textwriter: { ext: ven, sep: "\n" }
+
+     # Create a file with extension .ver for recognized index terms with relative frequency
+     # - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+     # - textwriter: { ext: ver, sep: "\n" }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     # - vector_filter: { in: syn, lexicals: m }
+     # - textwriter: { ext: mul, sep: "\n" }
+
+     # Create a file with extension .seq for recognized word sequences
+     # - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     # - textwriter: { ext: seq, sep: "\n" }
+
+     # Create a file with extension .syn for recognized synonyms
+     # - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     # - textwriter: { ext: syn, sep: "\n" }
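The attendees above form a processing chain: each stage consumes the stream produced by its predecessor, and commented-out entries are simply skipped. A toy Ruby sketch of the chaining idea (not lingo's actual attendee classes, which live under data/lib/lingo/attendee/):

    # Toy pipeline: each stage transforms whatever the previous stage produced,
    # mirroring the way attendees are listed in processing order above.
    stages = [
      ->(text)   { text.split(/\s+/) },       # stand-in for the tokenizer
      ->(tokens) { tokens.map(&:downcase) },  # stand-in for a lookup/normalize stage
    ]

    result = stages.reduce('Lingo erkennt Indexterme') { |data, stage| stage.call(data) }
    p result  # => ["lingo", "erkennt", "indexterme"]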
data/lingo-call.cfg ADDED
@@ -0,0 +1,15 @@
+ ---
+ meeting:
+   attendees:
+     - textreader: { }
+
+     - tokenizer: { }
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+     - decomposer: { source: 'sys-dic' }
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+     - sequencer: { stopper: 'PUNC,OTHR' }
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: 'syn' }
+     - debugger: { eval: 'true', ceval: 'false', prompt: '' }
+
+     #- vector_filter: { in: 'syn', lexicals: 'y', sort: 'term_abs' }
+     #- textwriter: { ext: 'STDOUT', sep: "\n" }
data/lingo.cfg ADDED
@@ -0,0 +1,78 @@
+ #
+ # Lingo configuration
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the specified file line by line and process it
+     - textreader: { files: '$(files)' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Split each line into its meaningful units (tokens)
+     - tokenizer: { }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Test unrecognized words for compounds
+     - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     #- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+     - vector_filter: { in: syn, debug: 'true', prompt: 'lex:) ' }
+     - textwriter: { ext: log, sep: "\n" }
+
+     # Create a file with extension .non for unrecognized words
+     - noneword_filter: { in: syn }
+     - textwriter: { ext: non, sep: "\n" }
+
+     # Create a file with extension .vec for recognized index terms
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     - textwriter: { ext: vec, sep: "\n" }
+
+     # Create a file with extension .ven for recognized index terms with absolute frequency
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_abs' }
+     - textwriter: { ext: ven, sep: "\n" }
+
+     # Create a file with extension .ver for recognized index terms with relative frequency
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$', sort: 'term_rel' }
+     - textwriter: { ext: ver, sep: "\n" }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     - vector_filter: { in: syn, lexicals: m }
+     - textwriter: { ext: mul, sep: "\n" }
+
+     # Create a file with extension .seq for recognized word sequences
+     - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     - textwriter: { ext: seq, sep: "\n" }
+
+     # Create a file with extension .syn for recognized synonyms
+     - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     - textwriter: { ext: syn, sep: "\n" }
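The `lexicals` option is a regular expression; judging from this file it is matched against a one-letter word-class code per lexical result, with q (sequences) and y (synonyms) routed to dedicated filters. A quick Ruby check of the pattern used for the .vec/.ven/.ver outputs (the code list is illustrative):

    # Which codes pass the '^[ksavem]$' filter.
    pattern = /^[ksavem]$/

    %w[k s a v e m q y].each do |code|
      puts "#{code}: #{code.match?(pattern) ? 'kept' : 'dropped'}"
    end
    # q and y are dropped here; the .seq and .syn outputs select them explicitly.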
data/lingo.rb ADDED
@@ -0,0 +1,3 @@
+ require_relative 'lib/lingo'
+
+ Lingo.talk if $0 == __FILE__
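The guard on the last line is the standard Ruby idiom for a file that works both as a library and as an executable: `Lingo.talk` runs only when the file is invoked directly, not when it is pulled in via require. The pattern in isolation:

    # demo.rb -- the entry point runs only under `ruby demo.rb`,
    # not when another file does `require_relative 'demo'`.
    def main
      puts 'started as a script'
    end

    main if $0 == __FILE__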
data/lir.cfg ADDED
@@ -0,0 +1,72 @@
+ #
+ # Lingo configuration for testing with a LIR file
+ #
+ # Common patterns are
+ # "^\021(\d+\-\d+)\022"
+ # "^\[(\d+)\.\]"
+ #
+ ---
+ meeting:
+
+   attendees:
+
+     ########################################
+     # Provide text
+     #
+
+     # Read the specified file line by line and process it
+     - textreader: { files: '$(files)', lir-record-pattern: '^\[(\d+)\.\]' }
+
+
+     ########################################
+     # Process content
+     #
+
+     # Split each line into its meaningful units (tokens)
+     - tokenizer: { }
+
+     # Look up the remaining tokens in the dictionary
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+
+     # Test unrecognized words for compounds
+     - decomposer: { source: 'sys-dic' }
+
+     # Recognize multi-word groups in the stream
+     - multiworder: { stopper: 'PUNC,OTHR', source: 'sys-mul' }
+
+     # Identify word sequences based on rules
+     - sequencer: { stopper: 'PUNC,OTHR' }
+
+     # Insert synonym relations
+     - synonymer: { skip: '?,t', source: 'sys-syn', out: syn }
+
+
+     ########################################
+     # Display the data stream
+     #
+     - debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: 'lex:) ' }
+
+
+     ########################################
+     # Output results
+     #
+
+     # Create a file with extension .non for unrecognized words
+     - noneword_filter: { in: syn }
+     - textwriter: { ext: non }
+
+     # Create a file with extension .csv for recognized index terms
+     - vector_filter: { in: syn, lexicals: '^[ksavem]$' }
+     - textwriter: { ext: csv }
+
+     # Create a file with extension .mul for recognized multi-word groups
+     - vector_filter: { in: syn, lexicals: m }
+     - textwriter: { ext: mul }
+
+     # Create a file with extension .seq for recognized word sequences
+     - vector_filter: { in: syn, lexicals: q, sort: 'term_abs' }
+     - textwriter: { ext: seq }
+
+     # Create a file with extension .syn for recognized synonyms
+     - vector_filter: { in: syn, lexicals: y, sort: 'term_abs' }
+     - textwriter: { ext: syn }
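The lir-record-pattern tells the textreader where a new record starts in the LIR file; the capture group supplies the record identifier. A minimal Ruby check of the pattern configured above (the sample line is an assumption based on the pattern itself):

    # A record begins with a bracketed number such as "[27.]",
    # matching the second of the "common patterns" in the header comment.
    pattern = /^\[(\d+)\.\]/

    if (md = pattern.match('[27.] Some record heading'))
      puts "new record, id = #{md[1]}"  # => new record, id = 27
    end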
data/porter/stem.cfg ADDED
@@ -0,0 +1,311 @@
+ # Stem.cfg
+ #
+ # Rules for the Porter stemmer
+ #
+ #
+ # based on:
+ # An algorithm for suffix stripping
+ #
+ # M.F.Porter
+ # 1980
+ #
+ # Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
+ # few typos have been corrected.)
+ #
+ # http://tartarus.org/~martin/PorterStemmer/def.txt
+ #
+ # --------------------------------------------------
+ #
+ #
+ #
+ #
+ # 2. THE ALGORITHM
+ #
+ # To present the suffix stripping algorithm in its entirety we will need a few
+ # definitions.
+ #
+ # A \consonant\ in a word is a letter other than A, E, I, O or U, and other
+ # than Y preceded by a consonant. (The fact that the term `consonant' is
+ # defined to some extent in terms of itself does not make it ambiguous.) So in
+ # TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
+ # letter is not a consonant it is a \vowel\.
+ #
+ # A consonant will be denoted by c, a vowel by v. A list ccc... of length
+ # greater than 0 will be denoted by C, and a list vvv... of length greater
+ # than 0 will be denoted by V. Any word, or part of a word, therefore has one
+ # of the four forms:
+ #
+ #     CVCV ... C
+ #     CVCV ... V
+ #     VCVC ... C
+ #     VCVC ... V
+ #
+ # These may all be represented by the single form
+ #
+ #     [C]VCVC ... [V]
+ #
+ # where the square brackets denote arbitrary presence of their contents.
+ # Using (VC){m} to denote VC repeated m times, this may again be written as
+ #
+ #     [C](VC){m}[V].
+ #
+ # m will be called the \measure\ of any word or word part when represented in
+ # this form. The case m = 0 covers the null word. Here are some examples:
+ #
+ #     m=0    TR, EE, TREE, Y, BY.
+ #     m=1    TROUBLE, OATS, TREES, IVY.
+ #     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+ #
+ # The \rules\ for removing a suffix will be given in the form
+ #
+ #     (condition) S1 -> S2
+ #
+ # This means that if a word ends with the suffix S1, and the stem before S1
+ # satisfies the given condition, S1 is replaced by S2. The condition is
+ # usually given in terms of m, e.g.
+ #
+ #     (m > 1) EMENT ->
+ #
+ # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
+ # since REPLAC is a word part for which m = 2.
+ #
+ # The `condition' part may also contain the following:
+ #
+ #     *S  - the stem ends with S (and similarly for the other letters).
+ #
+ #     *v* - the stem contains a vowel.
+ #
+ #     *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+ #
+ #     *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+ #           -WIL, -HOP).
+ #
+ # And the condition part may also contain expressions with \and\, \or\ and
+ # \not\, so that
+ #
+ #     (m>1 and (*S or *T))
+ #
+ # tests for a stem with m>1 ending in S or T, while
+ #
+ #     (*d and not (*L or *S or *Z))
+ #
+ # tests for a stem ending with a double consonant other than L, S or Z.
+ # Elaborate conditions like this are required only rarely.
+ #
+ # In a set of rules written beneath each other, only one is obeyed, and this
+ # will be the one with the longest matching S1 for the given word. For
+ # example, with
+ #
+ #     SSES -> SS
+ #     IES  -> I
+ #     SS   -> SS
+ #     S    ->
+ #
+ # (here the conditions are all null) CARESSES maps to CARESS since SSES is
+ # the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
+ # to CARE (S1=`S').
+ #
+ #
+ ---
+ stemmer:
+ # In the rules below, examples of their application, successful or otherwise,
+ # are given on the right in lower case. The algorithm now follows:
+ #
+ # Step 1a
+ #     SSES -> SS                      caresses  ->  caress
+ #     IES  -> I                       ponies    ->  poni
+ #                                     ties      ->  ti
+ #     SS   -> SS                      caress    ->  caress
+ #     S    ->                         cats      ->  cat
+   S100:
+     - SSES -> SS
+     - IES -> I
+     - SS -> SS
+     - S ->
+ #
+ # Step 1b
+ #
+ #     (m>0) EED -> EE                 feed      ->  feed
+ #                                     agreed    ->  agree
+ #     (*v*) ED  ->                    plastered ->  plaster
+ #                                     bled      ->  bled
+ #     (*v*) ING ->                    motoring  ->  motor
+ #                                     sing      ->  sing
+   S110:
+     - (m>0) EED -> EE goto(S120)
+     - (*v*) ED -> goto(S111)
+     - (*v*) ING -> goto(S111)
+     - goto(S120)
+ #
+ # If the second or third of the rules in Step 1b is successful, the following
+ # is done:
+ #
+ #     AT -> ATE                       conflat(ed)  ->  conflate
+ #     BL -> BLE                       troubl(ed)   ->  trouble
+ #     IZ -> IZE                       siz(ed)      ->  size
+ #     (*d and not (*L or *S or *Z))
+ #        -> single letter
+ #                                     hopp(ing)    ->  hop
+ #                                     tann(ed)     ->  tan
+ #                                     fall(ing)    ->  fall
+ #                                     hiss(ing)    ->  hiss
+ #                                     fizz(ed)     ->  fizz
+ #     (m=1 and *o) -> E               fail(ing)    ->  fail
+ #                                     fil(ing)     ->  file
+   S111:
+     - AT -> ATE
+     - BL -> BLE
+     - IZ -> IZE
+     - (*d and not (*L or *S or *Z)) -> -1
+     - (m=1 and *o) -> E
+ #
+ # The rule to map to a single letter causes the removal of one of the double
+ # letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
+ # -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
+ # 4.
+ #
+ # Step 1c
+ #
+ #     (*v*) Y -> I                    happy  ->  happi
+ #                                     sky    ->  sky
+   S120:
+     - (*v*) Y -> I
+ #
+ # Step 1 deals with plurals and past participles. The subsequent steps are
+ # much more straightforward.
+ #
+ # Step 2
+ #
+ #     (m>0) ATIONAL -> ATE            relational     ->  relate
+ #     (m>0) TIONAL  -> TION           conditional    ->  condition
+ #                                     rational       ->  rational
+ #     (m>0) ENCI    -> ENCE           valenci        ->  valence
+ #     (m>0) ANCI    -> ANCE           hesitanci      ->  hesitance
+ #     (m>0) IZER    -> IZE            digitizer      ->  digitize
+ #     (m>0) ABLI    -> ABLE           conformabli    ->  conformable
+ #     (m>0) ALLI    -> AL             radicalli      ->  radical
+ #     (m>0) ENTLI   -> ENT            differentli    ->  different
+ #     (m>0) ELI     -> E              vileli         ->  vile
+ #     (m>0) OUSLI   -> OUS            analogousli    ->  analogous
+ #     (m>0) IZATION -> IZE            vietnamization ->  vietnamize
+ #     (m>0) ATION   -> ATE            predication    ->  predicate
+ #     (m>0) ATOR    -> ATE            operator       ->  operate
+ #     (m>0) ALISM   -> AL             feudalism      ->  feudal
+ #     (m>0) IVENESS -> IVE            decisiveness   ->  decisive
+ #     (m>0) FULNESS -> FUL            hopefulness    ->  hopeful
+ #     (m>0) OUSNESS -> OUS            callousness    ->  callous
+ #     (m>0) ALITI   -> AL             formaliti      ->  formal
+ #     (m>0) IVITI   -> IVE            sensitiviti    ->  sensitive
+ #     (m>0) BILITI  -> BLE            sensibiliti    ->  sensible
+   S200:
+     - (m>0) ATIONAL -> ATE
+     - (m>0) TIONAL -> TION
+     - (m>0) ENCI -> ENCE
+     - (m>0) ANCI -> ANCE
+     - (m>0) IZER -> IZE
+     - (m>0) ABLI -> ABLE
+     - (m>0) ALLI -> AL
+     - (m>0) ENTLI -> ENT
+     - (m>0) ELI -> E
+     - (m>0) OUSLI -> OUS
+     - (m>0) IZATION -> IZE
+     - (m>0) ATION -> ATE
+     - (m>0) ATOR -> ATE
+     - (m>0) ALISM -> AL
+     - (m>0) IVENESS -> IVE
+     - (m>0) FULNESS -> FUL
+     - (m>0) OUSNESS -> OUS
+     - (m>0) ALITI -> AL
+     - (m>0) IVITI -> IVE
+     - (m>0) BILITI -> BLE
+ #
+ # The test for the string S1 can be made fast by doing a program switch on
+ # the penultimate letter of the word being tested. This gives a fairly even
+ # breakdown of the possible values of the string S1. It will be seen in fact
+ # that the S1-strings in step 2 are presented here in the alphabetical order
+ # of their penultimate letter. Similar techniques may be applied in the other
+ # steps.
+ #
+ # Step 3
+ #
+ #     (m>0) ICATE -> IC               triplicate  ->  triplic
+ #     (m>0) ATIVE ->                  formative   ->  form
+ #     (m>0) ALIZE -> AL               formalize   ->  formal
+ #     (m>0) ICITI -> IC               electriciti ->  electric
+ #     (m>0) ICAL  -> IC               electrical  ->  electric
+ #     (m>0) FUL   ->                  hopeful     ->  hope
+ #     (m>0) NESS  ->                  goodness    ->  good
+   S300:
+     - (m>0) ICATE -> IC
+     - (m>0) ATIVE ->
+     - (m>0) ALIZE -> AL
+     - (m>0) ICITI -> IC
+     - (m>0) ICAL -> IC
+     - (m>0) FUL ->
+     - (m>0) NESS ->
+ #
+ # Step 4
+ #
+ #     (m>1) AL    ->                  revival     ->  reviv
+ #     (m>1) ANCE  ->                  allowance   ->  allow
+ #     (m>1) ENCE  ->                  inference   ->  infer
+ #     (m>1) ER    ->                  airliner    ->  airlin
+ #     (m>1) IC    ->                  gyroscopic  ->  gyroscop
+ #     (m>1) ABLE  ->                  adjustable  ->  adjust
+ #     (m>1) IBLE  ->                  defensible  ->  defens
+ #     (m>1) ANT   ->                  irritant    ->  irrit
+ #     (m>1) EMENT ->                  replacement ->  replac
+ #     (m>1) MENT  ->                  adjustment  ->  adjust
+ #     (m>1) ENT   ->                  dependent   ->  depend
+ #     (m>1 and (*S or *T)) ION ->     adoption    ->  adopt
+ #     (m>1) OU    ->                  homologou   ->  homolog
+ #     (m>1) ISM   ->                  communism   ->  commun
+ #     (m>1) ATE   ->                  activate    ->  activ
+ #     (m>1) ITI   ->                  angulariti  ->  angular
+ #     (m>1) OUS   ->                  homologous  ->  homolog
+ #     (m>1) IVE   ->                  effective   ->  effect
+ #     (m>1) IZE   ->                  bowdlerize  ->  bowdler
+   S400:
+     - (m>1) AL ->
+     - (m>1) ANCE ->
+     - (m>1) ENCE ->
+     - (m>1) ER ->
+     - (m>1) IC ->
+     - (m>1) ABLE ->
+     - (m>1) IBLE ->
+     - (m>1) ANT ->
+     - (m>1) EMENT ->
+     - (m>1) MENT ->
+     - (m>1) ENT ->
+     - (m>1 and (*S or *T)) ION ->
+     - (m>1) OU ->
+     - (m>1) ISM ->
+     - (m>1) ATE ->
+     - (m>1) ITI ->
+     - (m>1) OUS ->
+     - (m>1) IVE ->
+     - (m>1) IZE ->
+ #
+ # The suffixes are now removed. All that remains is a little tidying up.
+ #
+ # Step 5a
+ #
+ #     (m>1) E ->                      probate  ->  probat
+ #                                     rate     ->  rate
+ #     (m=1 and not *o) E ->           cease    ->  ceas
+   S500:
+     - (m>1) E ->
+     - (m=1 and not *o) E ->
+ #
+ # Step 5b
+ #
+ #     (m > 1 and *d and *L) -> single letter
+ #                                     controll ->  control
+ #                                     roll     ->  roll
+   S510:
+     - (m > 1 and *d and *L) -> -1
+ #
+ # The algorithm is careful not to remove a suffix when the stem is too short,
+ # the length of the stem being given by its measure, m. There is no linguistic
+ # basis for this approach. It was merely observed that m could be used quite
+ # effectively to help decide whether or not it was wise to take off a suffix.
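Since every condition above hinges on the measure m, here is a small Ruby sketch (an illustration, not the gem's implementation in data/porter/stem.rb) that classifies letters according to the definition in the header comment and counts the VC pairs:

    # Porter's measure m: view the word as [C](VC){m}[V] and count VC pairs.
    # Y counts as a vowel when it follows a consonant, per the definition above.
    def measure(word)
      types = []
      word.downcase.each_char do |ch|
        vowel = 'aeiou'.include?(ch) || (ch == 'y' && types.last == :c)
        types << (vowel ? :v : :c)
      end
      runs = types.chunk_while { |a, b| a == b }.map(&:first)  # collapse letter runs
      runs.each_cons(2).count { |a, b| a == :v && b == :c }
    end

    # Reproduces the examples given above: m=0, m=1, m=2.
    %w[tree trouble troubles].each { |w| puts "#{w}: m=#{measure(w)}" }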