ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/dictionaries/hu.xml
ADDED
@@ -0,0 +1,280 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="hungarian">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>a</word>
|
62
|
+
<word>addig</word>
|
63
|
+
<word>ám</word>
|
64
|
+
<word>annak</word>
|
65
|
+
<word>annyi</word>
|
66
|
+
<word>arra</word>
|
67
|
+
<word>át</word>
|
68
|
+
<word>attól</word>
|
69
|
+
<word>az</word>
|
70
|
+
<word>azért</word>
|
71
|
+
<word>azok</word>
|
72
|
+
<word>be</word>
|
73
|
+
<word>bele</word>
|
74
|
+
<word>belé</word>
|
75
|
+
<word>beléd</word>
|
76
|
+
<word>beléjük</word>
|
77
|
+
<word>belém</word>
|
78
|
+
<word>belénk</word>
|
79
|
+
<word>belétek</word>
|
80
|
+
<word>belőle</word>
|
81
|
+
<word>belőled</word>
|
82
|
+
<word>belőlem</word>
|
83
|
+
<word>belőletek</word>
|
84
|
+
<word>belőlük</word>
|
85
|
+
<word>belőlünk</word>
|
86
|
+
<word>benne</word>
|
87
|
+
<word>benned</word>
|
88
|
+
<word>bennem</word>
|
89
|
+
<word>bennetek</word>
|
90
|
+
<word>bennük</word>
|
91
|
+
<word>bennünk</word>
|
92
|
+
<word>csak</word>
|
93
|
+
<word>de</word>
|
94
|
+
<word>e</word>
|
95
|
+
<word>eddig</word>
|
96
|
+
<word>egy</word>
|
97
|
+
<word>egyik</word>
|
98
|
+
<word>el</word>
|
99
|
+
<word>én</word>
|
100
|
+
<word>engem</word>
|
101
|
+
<word>ennek</word>
|
102
|
+
<word>ennyi</word>
|
103
|
+
<word>erre</word>
|
104
|
+
<word>érte</word>
|
105
|
+
<word>érted</word>
|
106
|
+
<word>értem</word>
|
107
|
+
<word>értetek</word>
|
108
|
+
<word>értük</word>
|
109
|
+
<word>értünk</word>
|
110
|
+
<word>és</word>
|
111
|
+
<word>év</word>
|
112
|
+
<word>ez</word>
|
113
|
+
<word>ezek</word>
|
114
|
+
<word>ezért</word>
|
115
|
+
<word>ezt</word>
|
116
|
+
<word>fel</word>
|
117
|
+
<word>fog</word>
|
118
|
+
<word>föl</word>
|
119
|
+
<word>ha</word>
|
120
|
+
<word>hanem</word>
|
121
|
+
<word>három</word>
|
122
|
+
<word>hogy</word>
|
123
|
+
<word>hol</word>
|
124
|
+
<word>honnan</word>
|
125
|
+
<word>hozzá</word>
|
126
|
+
<word>hozzád</word>
|
127
|
+
<word>hozzájuk</word>
|
128
|
+
<word>hozzám</word>
|
129
|
+
<word>hozzánk</word>
|
130
|
+
<word>hozzátok</word>
|
131
|
+
<word>ide</word>
|
132
|
+
<word>igen</word>
|
133
|
+
<word>ilyen</word>
|
134
|
+
<word>is</word>
|
135
|
+
<word>ismét</word>
|
136
|
+
<word>itt</word>
|
137
|
+
<word>jó</word>
|
138
|
+
<word>kell</word>
|
139
|
+
<word>két</word>
|
140
|
+
<word>kettő</word>
|
141
|
+
<word>ki</word>
|
142
|
+
<word>kicsi</word>
|
143
|
+
<word>kicsit</word>
|
144
|
+
<word>kis</word>
|
145
|
+
<word>le</word>
|
146
|
+
<word>lehet</word>
|
147
|
+
<word>lesz</word>
|
148
|
+
<word>lett</word>
|
149
|
+
<word>ma</word>
|
150
|
+
<word>majdnem</word>
|
151
|
+
<word>már</word>
|
152
|
+
<word>más</word>
|
153
|
+
<word>másik</word>
|
154
|
+
<word>meddig</word>
|
155
|
+
<word>meg</word>
|
156
|
+
<word>még</word>
|
157
|
+
<word>megint</word>
|
158
|
+
<word>mellett</word>
|
159
|
+
<word>mennyi</word>
|
160
|
+
<word>merre</word>
|
161
|
+
<word>mert</word>
|
162
|
+
<word>mettől</word>
|
163
|
+
<word>mi</word>
|
164
|
+
<word>miért</word>
|
165
|
+
<word>mikor</word>
|
166
|
+
<word>milyen</word>
|
167
|
+
<word>minden</word>
|
168
|
+
<word>mindenki</word>
|
169
|
+
<word>mindig</word>
|
170
|
+
<word>minket</word>
|
171
|
+
<word>most</word>
|
172
|
+
<word>nagy</word>
|
173
|
+
<word>nagyon</word>
|
174
|
+
<word>nála</word>
|
175
|
+
<word>nálad</word>
|
176
|
+
<word>nálam</word>
|
177
|
+
<word>nálatok</word>
|
178
|
+
<word>náluk</word>
|
179
|
+
<word>nálunk</word>
|
180
|
+
<word>ne</word>
|
181
|
+
<word>négy</word>
|
182
|
+
<word>neked</word>
|
183
|
+
<word>nekem</word>
|
184
|
+
<word>neki</word>
|
185
|
+
<word>nekik</word>
|
186
|
+
<word>nektek</word>
|
187
|
+
<word>nekünk</word>
|
188
|
+
<word>nem</word>
|
189
|
+
<word>ő</word>
|
190
|
+
<word>oda</word>
|
191
|
+
<word>ők</word>
|
192
|
+
<word>őket</word>
|
193
|
+
<word>olyan</word>
|
194
|
+
<word>ön</word>
|
195
|
+
<word>önbe</word>
|
196
|
+
<word>önben</word>
|
197
|
+
<word>önből</word>
|
198
|
+
<word>önért</word>
|
199
|
+
<word>önhöz</word>
|
200
|
+
<word>önnek</word>
|
201
|
+
<word>önnel</word>
|
202
|
+
<word>önnél</word>
|
203
|
+
<word>önök</word>
|
204
|
+
<word>önökbe</word>
|
205
|
+
<word>önökben</word>
|
206
|
+
<word>önökből</word>
|
207
|
+
<word>önökért</word>
|
208
|
+
<word>önöket</word>
|
209
|
+
<word>önökhöz</word>
|
210
|
+
<word>önökkel</word>
|
211
|
+
<word>önöknek</word>
|
212
|
+
<word>önöknél</word>
|
213
|
+
<word>önökön</word>
|
214
|
+
<word>önökre</word>
|
215
|
+
<word>önökről</word>
|
216
|
+
<word>önöktől</word>
|
217
|
+
<word>önön</word>
|
218
|
+
<word>önre</word>
|
219
|
+
<word>önről</word>
|
220
|
+
<word>önt</word>
|
221
|
+
<word>öntől</word>
|
222
|
+
<word>össze</word>
|
223
|
+
<word>őt</word>
|
224
|
+
<word>ott</word>
|
225
|
+
<word>rá</word>
|
226
|
+
<word>rád</word>
|
227
|
+
<word>rajta</word>
|
228
|
+
<word>rajtad</word>
|
229
|
+
<word>rajtam</word>
|
230
|
+
<word>rajtatok</word>
|
231
|
+
<word>rajtuk</word>
|
232
|
+
<word>rajtunk</word>
|
233
|
+
<word>rájuk</word>
|
234
|
+
<word>rám</word>
|
235
|
+
<word>ránk</word>
|
236
|
+
<word>rátok</word>
|
237
|
+
<word>róla</word>
|
238
|
+
<word>rólad</word>
|
239
|
+
<word>rólam</word>
|
240
|
+
<word>rólatok</word>
|
241
|
+
<word>róluk</word>
|
242
|
+
<word>rólunk</word>
|
243
|
+
<word>rossz</word>
|
244
|
+
<word>s</word>
|
245
|
+
<word>se</word>
|
246
|
+
<word>sem</word>
|
247
|
+
<word>semmi</word>
|
248
|
+
<word>senki</word>
|
249
|
+
<word>soha</word>
|
250
|
+
<word>sok</word>
|
251
|
+
<word>stb</word>
|
252
|
+
<word>szét</word>
|
253
|
+
<word>talán</word>
|
254
|
+
<word>te</word>
|
255
|
+
<word>téged</word>
|
256
|
+
<word>ti</word>
|
257
|
+
<word>titeket</word>
|
258
|
+
<word>tőle</word>
|
259
|
+
<word>tőled</word>
|
260
|
+
<word>tőlem</word>
|
261
|
+
<word>tőletek</word>
|
262
|
+
<word>tőlük</word>
|
263
|
+
<word>tőlünk</word>
|
264
|
+
<word>új</word>
|
265
|
+
<word>újra</word>
|
266
|
+
<word>vagy</word>
|
267
|
+
<word>van</word>
|
268
|
+
<word>vannak</word>
|
269
|
+
<word>vele</word>
|
270
|
+
<word>veled</word>
|
271
|
+
<word>velem</word>
|
272
|
+
<word>veletek</word>
|
273
|
+
<word>velük</word>
|
274
|
+
<word>velünk</word>
|
275
|
+
<word>vissza</word>
|
276
|
+
<word>volt</word>
|
277
|
+
<word>voltak</word>
|
278
|
+
<word></word>
|
279
|
+
</grader-tc>
|
280
|
+
</dictionary>
|
data/dictionaries/ia.xml
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="Interlingua">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>duo</word>
|
62
|
+
<word>e</word>
|
63
|
+
<word>es</word>
|
64
|
+
<word>esser</word>
|
65
|
+
<word>ha</word>
|
66
|
+
<word>haber</word>
|
67
|
+
<word>illa</word>
|
68
|
+
<word>illas</word>
|
69
|
+
<word>ille</word>
|
70
|
+
<word>illes</word>
|
71
|
+
<word>illo</word>
|
72
|
+
<word>illos</word>
|
73
|
+
<word>in</word>
|
74
|
+
<word>io</word>
|
75
|
+
<word>la</word>
|
76
|
+
<word>las</word>
|
77
|
+
<word>le</word>
|
78
|
+
<word>les</word>
|
79
|
+
<word>lo</word>
|
80
|
+
<word>los</word>
|
81
|
+
<word>me</word>
|
82
|
+
<word>minus</word>
|
83
|
+
<word>non</word>
|
84
|
+
<word>nos</word>
|
85
|
+
<word>ora</word>
|
86
|
+
<word>plus</word>
|
87
|
+
<word>quando</word>
|
88
|
+
<word>se</word>
|
89
|
+
<word>sed</word>
|
90
|
+
<word>te</word>
|
91
|
+
<word>tu</word>
|
92
|
+
<word>un</word>
|
93
|
+
<word>va</word>
|
94
|
+
<word>vader</word>
|
95
|
+
<word>vos</word>
|
96
|
+
</grader-tc>
|
97
|
+
</dictionary>
|
data/dictionaries/id.xml
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="Indonesian">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>aku</word>
|
62
|
+
<word>anda</word>
|
63
|
+
<word>bapak</word>
|
64
|
+
<word>dia</word>
|
65
|
+
<word>engkau</word>
|
66
|
+
<word>ibu</word>
|
67
|
+
<word>kalian</word>
|
68
|
+
<word>kami</word>
|
69
|
+
<word>kamu</word>
|
70
|
+
<word>kita</word>
|
71
|
+
<word>mereka</word>
|
72
|
+
<word>saudara</word>
|
73
|
+
<word>saya</word>
|
74
|
+
</grader-tc>
|
75
|
+
</dictionary>
|