ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,67 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="tagalog">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>ako</word>
62
+ <word>amin</word>
63
+ <word>atin</word>
64
+ <word>mo</word>
65
+ <word>nila</word>
66
+ </grader-tc>
67
+ </dictionary>
@@ -0,0 +1,65 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="turkish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>bir</word>
62
+ <word>bu</word>
63
+ <word>o</word>
64
+ </grader-tc>
65
+ </dictionary>
@@ -0,0 +1,98 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="ukrainian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>іноді</word>
62
+ <word>відкіля</word>
63
+ <word>вітаю</word>
64
+ <word>два</word>
65
+ <word>де</word>
66
+ <word>з</word>
67
+ <word>завжди</word>
68
+ <word>зараз</word>
69
+ <word>ким</word>
70
+ <word>коли</word>
71
+ <word>котрий</word>
72
+ <word>куди</word>
73
+ <word>ні</word>
74
+ <word>ніколи</word>
75
+ <word>нікуди</word>
76
+ <word>навіщо</word>
77
+ <word>нагорі</word>
78
+ <word>незабаром</word>
79
+ <word>нуль</word>
80
+ <word>один</word>
81
+ <word>позаду</word>
82
+ <word>скільки</word>
83
+ <word>сюди</word>
84
+ <word>так</word>
85
+ <word>там</word>
86
+ <word>тоді</word>
87
+ <word>туди</word>
88
+ <word>тут</word>
89
+ <word>унизу</word>
90
+ <word>усе</word>
91
+ <word>хто</word>
92
+ <word>часто</word>
93
+ <word>чому</word>
94
+ <word>що</word>
95
+ <word>як</word>
96
+ <word>який</word>
97
+ </grader-tc>
98
+ </dictionary>
@@ -0,0 +1,293 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="yiddish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ <rule>כ'|</rule>
8
+ <rule>מ'|</rule>
9
+ <rule>ס'|</rule>
10
+ <rule>כ׳|</rule>
11
+ <rule>מ׳|</rule>
12
+ <rule>ס׳|</rule>
13
+ </step1_pre>
14
+
15
+
16
+ <step1_post>
17
+ <rule>."|</rule>
18
+ <rule>,"|</rule>
19
+ <rule>.|</rule>
20
+ <rule>,|</rule>
21
+ <rule>"|</rule>
22
+ <rule>)|</rule>
23
+ <rule>?|</rule>
24
+ <rule>:|</rule>
25
+ <rule>;|</rule>
26
+ <rule>!|</rule>
27
+ </step1_post>
28
+
29
+
30
+ <manual>
31
+ <rule>געקומען|קום</rule>
32
+ <rule>געװען|זײַן</rule>
33
+ </manual>
34
+
35
+ <post>
36
+ <rule>ן|</rule>
37
+ <rule>ער|</rule>
38
+ <rule>ע|</rule>
39
+ <rule>ט|</rule>
40
+ </post>
41
+ <pre>
42
+ <rule>before1|1after</rule>
43
+ </pre>
44
+ </stemmer>
45
+ <parser>
46
+
47
+ <linebreak>
48
+ <rule>."</rule>
49
+ <rule>?"</rule>
50
+ <rule>!"</rule>
51
+ <rule>,"</rule>
52
+ <rule>.</rule>
53
+ <rule>?</rule>
54
+ <rule>;</rule>
55
+ <rule>|</rule>
56
+ <rule>!</rule>
57
+ </linebreak>
58
+
59
+ <linedontbreak>
60
+ <rule>Dr.</rule>
61
+ <rule>Mr.</rule>
62
+ <rule>Mrs.</rule>
63
+ <rule>U.S.</rule>
64
+ <rule>Rep.</rule>
65
+ <rule>Sen.</rule>
66
+ </linedontbreak>
67
+ </parser>
68
+ <grader-tc>
69
+ <word>!</word>
70
+ <word>'</word>
71
+ <word>,</word>
72
+ <word>*</word>
73
+ <word>-</word>
74
+ <word>--</word>
75
+ <word>.</word>
76
+ <word>000</word>
77
+ <word>?</word>
78
+ <word>|</word>
79
+ <word>אַ</word>
80
+ <word>אַװעק</word>
81
+ <word>אַז</word>
82
+ <word>אַזױ</word>
83
+ <word>אַלע</word>
84
+ <word>אַלעמאָל</word>
85
+ <word>אַן</word>
86
+ <word>אַנדער</word>
87
+ <word>אַנדערע</word>
88
+ <word>אַפֿילו</word>
89
+ <word>אַצינד</word>
90
+ <word>אַראָפּ</word>
91
+ <word>אַרױס</word>
92
+ <word>אַרױף</word>
93
+ <word>אַרײַן</word>
94
+ <word>אָבער</word>
95
+ <word>אָדער</word>
96
+ <word>אָט</word>
97
+ <word>אָן</word>
98
+ <word>אָפֿט</word>
99
+ <word>און</word>
100
+ <word>אונדזער</word>
101
+ <word>אונדזערע</word>
102
+ <word>אונטער</word>
103
+ <word>איבער</word>
104
+ <word>איז</word>
105
+ <word>איך</word>
106
+ <word>אים</word>
107
+ <word>אין</word>
108
+ <word>איצט</word>
109
+ <word>איר</word>
110
+ <word>אירע</word>
111
+ <word>אפֿשר</word>
112
+ <word>אױב</word>
113
+ <word>אױך</word>
114
+ <word>אױס</word>
115
+ <word>אױף</word>
116
+ <word>אױפֿן</word>
117
+ <word>אײַער</word>
118
+ <word>אײגן</word>
119
+ <word>אײגענע</word>
120
+ <word>אײגענער</word>
121
+ <word>אײדער</word>
122
+ <word>אײן</word>
123
+ <word>אײנמאָל</word>
124
+ <word>אײנס</word>
125
+ <word>אַלײן</word>
126
+ <word>באַקום</word>
127
+ <word>באַקומט</word>
128
+ <word>באַקומען</word>
129
+ <word>ביז</word>
130
+ <word>בין</word>
131
+ <word>בלױז</word>
132
+ <word>בעסער</word>
133
+ <word>בשעת</word>
134
+ <word>בײַ</word>
135
+ <word>בײדע</word>
136
+ <word>גוט</word>
137
+ <word>גוטע</word>
138
+ <word>גוטער</word>
139
+ <word>גלײַך</word>
140
+ <word>געדאַרפֿט</word>
141
+ <word>געהאַט</word>
142
+ <word>געזאָגט</word>
143
+ <word>געזאָלט</word>
144
+ <word>געטאָן</word>
145
+ <word>געמאַכט</word>
146
+ <word>געמוזט</word>
147
+ <word>געמעגט</word>
148
+ <word>געמײנט</word>
149
+ <word>גענוג</word>
150
+ <word>גענוצט</word>
151
+ <word>געשטעלט</word>
152
+ <word>געװאָלט</word>
153
+ <word>מאָל</word>
154
+ <word>מען</word>
155
+ <word>נאָר</word>
156
+ <word>אַז</word>
157
+ <word>געװעזן</word>
158
+ <word>נאָך</word>
159
+ <word>געװען</word>
160
+ <word>גײט</word>
161
+ <word>גײען</word>
162
+ <word>דאָ</word>
163
+ <word>דאַרף</word>
164
+ <word>דאָזיקע</word>
165
+ <word>דאָזיקער</word>
166
+ <word>דאָס</word>
167
+ <word>דאָך</word>
168
+ <word>דאָרט</word>
169
+ <word>דו</word>
170
+ <word>די</word>
171
+ <word>דיר</word>
172
+ <word>דיך</word>
173
+ <word>דײַן</word>
174
+ <word>דעם</word>
175
+ <word>דעמאָלט</word>
176
+ <word>דער</word>
177
+ <word>דערפֿאַר</word>
178
+ <word>דרײַ</word>
179
+ <word>האָב</word>
180
+ <word>האָבן</word>
181
+ <word>האָט</word>
182
+ <word>הער</word>
183
+ <word>הײסט</word>
184
+ <word>זאַך</word>
185
+ <word>זאַכן</word>
186
+ <word>זאָג</word>
187
+ <word>זאָגן</word>
188
+ <word>זאָל</word>
189
+ <word>זי</word>
190
+ <word>זיך</word>
191
+ <word>זעט</word>
192
+ <word>זעלביקע</word>
193
+ <word>זעלביקער</word>
194
+ <word>זען</word>
195
+ <word>זײ</word>
196
+ <word>זײַן</word>
197
+ <word>זײַנען</word>
198
+ <word>זענען</word>
199
+ <word>זײער</word>
200
+ <word>טאַקע</word>
201
+ <word>טוט</word>
202
+ <word>טאָן</word>
203
+ <word>יאָ</word>
204
+ <word>יעדער</word>
205
+ <word>יעצט</word>
206
+ <word>כּמעתּ</word>
207
+ <word>לאָז</word>
208
+ <word>לאָזט</word>
209
+ <word>לאָמיך</word>
210
+ <word>לאָמיר</word>
211
+ <word>לעצט</word>
212
+ <word>לעצטע</word>
213
+ <word>לעצטער</word>
214
+ <word>מאַכט</word>
215
+ <word>מוז</word>
216
+ <word>מיט</word>
217
+ <word>מיך</word>
218
+ <word>מיר</word>
219
+ <word>מילא</word>
220
+ <word>מעג</word>
221
+ <word>מײַן</word>
222
+ <word>מײנט</word>
223
+ <word>נאָך</word>
224
+ <word>נאָענט</word>
225
+ <word>נאָענטע</word>
226
+ <word>נאָר</word>
227
+ <word>נו</word>
228
+ <word>נוצט</word>
229
+ <word>נוצן</word>
230
+ <word>ניט</word>
231
+ <word>ניצט</word>
232
+ <word>ניצן</word>
233
+ <word>נישט</word>
234
+ <word>נײן</word>
235
+ <word>סך</word>
236
+ <word>סײַ</word>
237
+ <word>סײַדן</word>
238
+ <word>עטלעכע</word>
239
+ <word>עס</word>
240
+ <word>עפּעס</word>
241
+ <word>ער</word>
242
+ <word>ערשט</word>
243
+ <word>ערשטע</word>
244
+ <word>ערשטער</word>
245
+ <word>פֿאַר</word>
246
+ <word>פֿאַרשידן</word>
247
+ <word>פֿאַרשידענע</word>
248
+ <word>פֿאַרװאָס</word>
249
+ <word>פֿון</word>
250
+ <word>פֿיר</word>
251
+ <word>פֿרי</word>
252
+ <word>פֿריִערדיק</word>
253
+ <word>פֿריִערדיקע</word>
254
+ <word>פֿריִערדיקער</word>
255
+ <word>צו</word>
256
+ <word>צום</word>
257
+ <word>צי</word>
258
+ <word>צװישן</word>
259
+ <word>צװײ</word>
260
+ <word>קומט</word>
261
+ <word>קומעדיק</word>
262
+ <word>קומעדיקע</word>
263
+ <word>קומעדיקער</word>
264
+ <word>קען</word>
265
+ <word>קענען</word>
266
+ <word>קײן</word>
267
+ <word>רבֿ</word>
268
+ <word>שטעל</word>
269
+ <word>שטעלט</word>
270
+ <word>שױן</word>
271
+ <word>װאָלט</word>
272
+ <word>װאָס</word>
273
+ <word>װאָסער</word>
274
+ <word>װוּ</word>
275
+ <word>װי</word>
276
+ <word>װידער</word>
277
+ <word>װיל</word>
278
+ <word>װילט</word>
279
+ <word>װינציק</word>
280
+ <word>װינציקער</word>
281
+ <word>װײַטער</word>
282
+ <word>װעג</word>
283
+ <word>װעגן</word>
284
+ <word>װעט</word>
285
+ <word>װעלכער</word>
286
+ <word>װעלן</word>
287
+ <word>װעמען</word>
288
+ <word>װעמענס</word>
289
+ <word>װען</word>
290
+ <word>װער</word>
291
+ <word>װײַל</word>
292
+ </grader-tc>
293
+ </dictionary>