ots 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,67 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="tagalog">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>ako</word>
62
+ <word>amin</word>
63
+ <word>atin</word>
64
+ <word>mo</word>
65
+ <word>nila</word>
66
+ </grader-tc>
67
+ </dictionary>
@@ -0,0 +1,65 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="turkish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>bir</word>
62
+ <word>bu</word>
63
+ <word>o</word>
64
+ </grader-tc>
65
+ </dictionary>
@@ -0,0 +1,98 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="ukrainian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>іноді</word>
62
+ <word>відкіля</word>
63
+ <word>вітаю</word>
64
+ <word>два</word>
65
+ <word>де</word>
66
+ <word>з</word>
67
+ <word>завжди</word>
68
+ <word>зараз</word>
69
+ <word>ким</word>
70
+ <word>коли</word>
71
+ <word>котрий</word>
72
+ <word>куди</word>
73
+ <word>ні</word>
74
+ <word>ніколи</word>
75
+ <word>нікуди</word>
76
+ <word>навіщо</word>
77
+ <word>нагорі</word>
78
+ <word>незабаром</word>
79
+ <word>нуль</word>
80
+ <word>один</word>
81
+ <word>позаду</word>
82
+ <word>скільки</word>
83
+ <word>сюди</word>
84
+ <word>так</word>
85
+ <word>там</word>
86
+ <word>тоді</word>
87
+ <word>туди</word>
88
+ <word>тут</word>
89
+ <word>унизу</word>
90
+ <word>усе</word>
91
+ <word>хто</word>
92
+ <word>часто</word>
93
+ <word>чому</word>
94
+ <word>що</word>
95
+ <word>як</word>
96
+ <word>який</word>
97
+ </grader-tc>
98
+ </dictionary>
@@ -0,0 +1,293 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="yiddish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ <rule>כ'|</rule>
8
+ <rule>מ'|</rule>
9
+ <rule>ס'|</rule>
10
+ <rule>כ׳|</rule>
11
+ <rule>מ׳|</rule>
12
+ <rule>ס׳|</rule>
13
+ </step1_pre>
14
+
15
+
16
+ <step1_post>
17
+ <rule>."|</rule>
18
+ <rule>,"|</rule>
19
+ <rule>.|</rule>
20
+ <rule>,|</rule>
21
+ <rule>"|</rule>
22
+ <rule>)|</rule>
23
+ <rule>?|</rule>
24
+ <rule>:|</rule>
25
+ <rule>;|</rule>
26
+ <rule>!|</rule>
27
+ </step1_post>
28
+
29
+
30
+ <manual>
31
+ <rule>געקומען|קום</rule>
32
+ <rule>געװען|זײַן</rule>
33
+ </manual>
34
+
35
+ <post>
36
+ <rule>ן|</rule>
37
+ <rule>ער|</rule>
38
+ <rule>ע|</rule>
39
+ <rule>ט|</rule>
40
+ </post>
41
+ <pre>
42
+ <rule>before1|1after</rule>
43
+ </pre>
44
+ </stemmer>
45
+ <parser>
46
+
47
+ <linebreak>
48
+ <rule>."</rule>
49
+ <rule>?"</rule>
50
+ <rule>!"</rule>
51
+ <rule>,"</rule>
52
+ <rule>.</rule>
53
+ <rule>?</rule>
54
+ <rule>;</rule>
55
+ <rule>|</rule>
56
+ <rule>!</rule>
57
+ </linebreak>
58
+
59
+ <linedontbreak>
60
+ <rule>Dr.</rule>
61
+ <rule>Mr.</rule>
62
+ <rule>Mrs.</rule>
63
+ <rule>U.S.</rule>
64
+ <rule>Rep.</rule>
65
+ <rule>Sen.</rule>
66
+ </linedontbreak>
67
+ </parser>
68
+ <grader-tc>
69
+ <word>!</word>
70
+ <word>'</word>
71
+ <word>,</word>
72
+ <word>*</word>
73
+ <word>-</word>
74
+ <word>--</word>
75
+ <word>.</word>
76
+ <word>000</word>
77
+ <word>?</word>
78
+ <word>|</word>
79
+ <word>אַ</word>
80
+ <word>אַװעק</word>
81
+ <word>אַז</word>
82
+ <word>אַזױ</word>
83
+ <word>אַלע</word>
84
+ <word>אַלעמאָל</word>
85
+ <word>אַן</word>
86
+ <word>אַנדער</word>
87
+ <word>אַנדערע</word>
88
+ <word>אַפֿילו</word>
89
+ <word>אַצינד</word>
90
+ <word>אַראָפּ</word>
91
+ <word>אַרױס</word>
92
+ <word>אַרױף</word>
93
+ <word>אַרײַן</word>
94
+ <word>אָבער</word>
95
+ <word>אָדער</word>
96
+ <word>אָט</word>
97
+ <word>אָן</word>
98
+ <word>אָפֿט</word>
99
+ <word>און</word>
100
+ <word>אונדזער</word>
101
+ <word>אונדזערע</word>
102
+ <word>אונטער</word>
103
+ <word>איבער</word>
104
+ <word>איז</word>
105
+ <word>איך</word>
106
+ <word>אים</word>
107
+ <word>אין</word>
108
+ <word>איצט</word>
109
+ <word>איר</word>
110
+ <word>אירע</word>
111
+ <word>אפֿשר</word>
112
+ <word>אױב</word>
113
+ <word>אױך</word>
114
+ <word>אױס</word>
115
+ <word>אױף</word>
116
+ <word>אױפֿן</word>
117
+ <word>אײַער</word>
118
+ <word>אײגן</word>
119
+ <word>אײגענע</word>
120
+ <word>אײגענער</word>
121
+ <word>אײדער</word>
122
+ <word>אײן</word>
123
+ <word>אײנמאָל</word>
124
+ <word>אײנס</word>
125
+ <word>אַלײן</word>
126
+ <word>באַקום</word>
127
+ <word>באַקומט</word>
128
+ <word>באַקומען</word>
129
+ <word>ביז</word>
130
+ <word>בין</word>
131
+ <word>בלױז</word>
132
+ <word>בעסער</word>
133
+ <word>בשעת</word>
134
+ <word>בײַ</word>
135
+ <word>בײדע</word>
136
+ <word>גוט</word>
137
+ <word>גוטע</word>
138
+ <word>גוטער</word>
139
+ <word>גלײַך</word>
140
+ <word>געדאַרפֿט</word>
141
+ <word>געהאַט</word>
142
+ <word>געזאָגט</word>
143
+ <word>געזאָלט</word>
144
+ <word>געטאָן</word>
145
+ <word>געמאַכט</word>
146
+ <word>געמוזט</word>
147
+ <word>געמעגט</word>
148
+ <word>געמײנט</word>
149
+ <word>גענוג</word>
150
+ <word>גענוצט</word>
151
+ <word>געשטעלט</word>
152
+ <word>געװאָלט</word>
153
+ <word>מאָל</word>
154
+ <word>מען</word>
155
+ <word>נאָר</word>
156
+ <word>אַז</word>
157
+ <word>געװעזן</word>
158
+ <word>נאָך</word>
159
+ <word>געװען</word>
160
+ <word>גײט</word>
161
+ <word>גײען</word>
162
+ <word>דאָ</word>
163
+ <word>דאַרף</word>
164
+ <word>דאָזיקע</word>
165
+ <word>דאָזיקער</word>
166
+ <word>דאָס</word>
167
+ <word>דאָך</word>
168
+ <word>דאָרט</word>
169
+ <word>דו</word>
170
+ <word>די</word>
171
+ <word>דיר</word>
172
+ <word>דיך</word>
173
+ <word>דײַן</word>
174
+ <word>דעם</word>
175
+ <word>דעמאָלט</word>
176
+ <word>דער</word>
177
+ <word>דערפֿאַר</word>
178
+ <word>דרײַ</word>
179
+ <word>האָב</word>
180
+ <word>האָבן</word>
181
+ <word>האָט</word>
182
+ <word>הער</word>
183
+ <word>הײסט</word>
184
+ <word>זאַך</word>
185
+ <word>זאַכן</word>
186
+ <word>זאָג</word>
187
+ <word>זאָגן</word>
188
+ <word>זאָל</word>
189
+ <word>זי</word>
190
+ <word>זיך</word>
191
+ <word>זעט</word>
192
+ <word>זעלביקע</word>
193
+ <word>זעלביקער</word>
194
+ <word>זען</word>
195
+ <word>זײ</word>
196
+ <word>זײַן</word>
197
+ <word>זײַנען</word>
198
+ <word>זענען</word>
199
+ <word>זײער</word>
200
+ <word>טאַקע</word>
201
+ <word>טוט</word>
202
+ <word>טאָן</word>
203
+ <word>יאָ</word>
204
+ <word>יעדער</word>
205
+ <word>יעצט</word>
206
+ <word>כּמעתּ</word>
207
+ <word>לאָז</word>
208
+ <word>לאָזט</word>
209
+ <word>לאָמיך</word>
210
+ <word>לאָמיר</word>
211
+ <word>לעצט</word>
212
+ <word>לעצטע</word>
213
+ <word>לעצטער</word>
214
+ <word>מאַכט</word>
215
+ <word>מוז</word>
216
+ <word>מיט</word>
217
+ <word>מיך</word>
218
+ <word>מיר</word>
219
+ <word>מילא</word>
220
+ <word>מעג</word>
221
+ <word>מײַן</word>
222
+ <word>מײנט</word>
223
+ <word>נאָך</word>
224
+ <word>נאָענט</word>
225
+ <word>נאָענטע</word>
226
+ <word>נאָר</word>
227
+ <word>נו</word>
228
+ <word>נוצט</word>
229
+ <word>נוצן</word>
230
+ <word>ניט</word>
231
+ <word>ניצט</word>
232
+ <word>ניצן</word>
233
+ <word>נישט</word>
234
+ <word>נײן</word>
235
+ <word>סך</word>
236
+ <word>סײַ</word>
237
+ <word>סײַדן</word>
238
+ <word>עטלעכע</word>
239
+ <word>עס</word>
240
+ <word>עפּעס</word>
241
+ <word>ער</word>
242
+ <word>ערשט</word>
243
+ <word>ערשטע</word>
244
+ <word>ערשטער</word>
245
+ <word>פֿאַר</word>
246
+ <word>פֿאַרשידן</word>
247
+ <word>פֿאַרשידענע</word>
248
+ <word>פֿאַרװאָס</word>
249
+ <word>פֿון</word>
250
+ <word>פֿיר</word>
251
+ <word>פֿרי</word>
252
+ <word>פֿריִערדיק</word>
253
+ <word>פֿריִערדיקע</word>
254
+ <word>פֿריִערדיקער</word>
255
+ <word>צו</word>
256
+ <word>צום</word>
257
+ <word>צי</word>
258
+ <word>צװישן</word>
259
+ <word>צװײ</word>
260
+ <word>קומט</word>
261
+ <word>קומעדיק</word>
262
+ <word>קומעדיקע</word>
263
+ <word>קומעדיקער</word>
264
+ <word>קען</word>
265
+ <word>קענען</word>
266
+ <word>קײן</word>
267
+ <word>רבֿ</word>
268
+ <word>שטעל</word>
269
+ <word>שטעלט</word>
270
+ <word>שױן</word>
271
+ <word>װאָלט</word>
272
+ <word>װאָס</word>
273
+ <word>װאָסער</word>
274
+ <word>װוּ</word>
275
+ <word>װי</word>
276
+ <word>װידער</word>
277
+ <word>װיל</word>
278
+ <word>װילט</word>
279
+ <word>װינציק</word>
280
+ <word>װינציקער</word>
281
+ <word>װײַטער</word>
282
+ <word>װעג</word>
283
+ <word>װעגן</word>
284
+ <word>װעט</word>
285
+ <word>װעלכער</word>
286
+ <word>װעלן</word>
287
+ <word>װעמען</word>
288
+ <word>װעמענס</word>
289
+ <word>װען</word>
290
+ <word>װער</word>
291
+ <word>װײַל</word>
292
+ </grader-tc>
293
+ </dictionary>