ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,118 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="welsh">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>â</word>
63
+ <word>ac</word>
64
+ <word>achos</word>
65
+ <word>am</word>
66
+ <word>ar</word>
67
+ <word>at</word>
68
+ <word>chi</word>
69
+ <word>dau</word>
70
+ <word>dim</word>
71
+ <word>diolch</word>
72
+ <word>dwy</word>
73
+ <word>e</word>
74
+ <word>ei</word>
75
+ <word>eto</word>
76
+ <word>fe</word>
77
+ <word>fi</word>
78
+ <word>gan</word>
79
+ <word>ger</word>
80
+ <word>gyda</word>
81
+ <word>heb</word>
82
+ <word>heblaw</word>
83
+ <word>hefyd</word>
84
+ <word>hi</word>
85
+ <word>hon</word>
86
+ <word>hwn</word>
87
+ <word>i</word>
88
+ <word>iawn</word>
89
+ <word>mewn</word>
90
+ <word>na</word>
91
+ <word>neb</word>
92
+ <word>nes</word>
93
+ <word>nhw</word>
94
+ <word>ni</word>
95
+ <word>o</word>
96
+ <word>ond</word>
97
+ <word>os</word>
98
+ <word>paham</word>
99
+ <word>pam</word>
100
+ <word>pe</word>
101
+ <word>popeth</word>
102
+ <word>pwy</word>
103
+ <word>rhag</word>
104
+ <word>ti</word>
105
+ <word>trwy</word>
106
+ <word>un</word>
107
+ <word>unwaith</word>
108
+ <word>wedi</word>
109
+ <word>wedyn</word>
110
+ <word>weithiau</word>
111
+ <word>wrth</word>
112
+ <word>ychydig</word>
113
+ <word>ymhlith</word>
114
+ <word>ymlaen</word>
115
+ <word>yn</word>
116
+ <word>yrŵan</word>
117
+ </grader-tc>
118
+ </dictionary>
@@ -0,0 +1,129 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="danish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>aldrig</word>
62
+ <word>anden</word>
63
+ <word>at</word>
64
+ <word>burde</word>
65
+ <word>de</word>
66
+ <word>den</word>
67
+ <word>der</word>
68
+ <word>det</word>
69
+ <word>dig</word>
70
+ <word>du</word>
71
+ <word>eller</word>
72
+ <word>en</word>
73
+ <word>er</word>
74
+ <word>et</word>
75
+ <word>fordi</word>
76
+ <word>fra</word>
77
+ <word>før</word>
78
+ <word>første</word>
79
+ <word>gide</word>
80
+ <word>ham</word>
81
+ <word>han</word>
82
+ <word>har</word>
83
+ <word>hej</word>
84
+ <word>hende</word>
85
+ <word>hun</word>
86
+ <word>hvad</word>
87
+ <word>hvem</word>
88
+ <word>hvilke</word>
89
+ <word>hvilken</word>
90
+ <word>hvilket</word>
91
+ <word>hvis</word>
92
+ <word>hvor</word>
93
+ <word>hvordan</word>
94
+ <word>hvorfor</word>
95
+ <word>hvornår</word>
96
+ <word>i</word>
97
+ <word>ikke</word>
98
+ <word>ingen</word>
99
+ <word>ingenting</word>
100
+ <word>ja</word>
101
+ <word>jeg</word>
102
+ <word>kan</word>
103
+ <word>kunne</word>
104
+ <word>kunne</word>
105
+ <word>med</word>
106
+ <word>men</word>
107
+ <word>mens</word>
108
+ <word>mere</word>
109
+ <word>mest</word>
110
+ <word>mig</word>
111
+ <word>min</word>
112
+ <word>måtte</word>
113
+ <word>nej</word>
114
+ <word>nogen</word>
115
+ <word>noget</word>
116
+ <word>når</word>
117
+ <word>og</word>
118
+ <word>om</word>
119
+ <word>sig</word>
120
+ <word>skulle</word>
121
+ <word>som</word>
122
+ <word>så</word>
123
+ <word>tit</word>
124
+ <word>to</word>
125
+ <word>turde</word>
126
+ <word>vi</word>
127
+ <word>ville</word>
128
+ </grader-tc>
129
+ </dictionary>
@@ -0,0 +1,354 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="german">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>ab</word>
62
+ <word>aber</word>
63
+ <word>ähnlich</word>
64
+ <word>aehnlich</word>
65
+ <word>all</word>
66
+ <word>alle</word>
67
+ <word>allein</word>
68
+ <word>alles</word>
69
+ <word>als</word>
70
+ <word>also</word>
71
+ <word>am</word>
72
+ <word>an</word>
73
+ <word>andere</word>
74
+ <word>anderes</word>
75
+ <word>anstatt</word>
76
+ <word>auch</word>
77
+ <word>auf</word>
78
+ <word>aus</word>
79
+ <word>ausser</word>
80
+ <word>ausserhalb</word>
81
+ <word>bald</word>
82
+ <word>bei</word>
83
+ <word>beide</word>
84
+ <word>beim</word>
85
+ <word>bin</word>
86
+ <word>bis</word>
87
+ <word>bist</word>
88
+ <word>bitte</word>
89
+ <word>brauche</word>
90
+ <word>brauchen</word>
91
+ <word>braucht</word>
92
+ <word>co</word>
93
+ <word>da</word>
94
+ <word>damit</word>
95
+ <word>dann</word>
96
+ <word>darf</word>
97
+ <word>darüber</word>
98
+ <word>darueber</word>
99
+ <word>das</word>
100
+ <word>daß</word>
101
+ <word>dass</word>
102
+ <word>dein</word>
103
+ <word>deine</word>
104
+ <word>dem</word>
105
+ <word>den</word>
106
+ <word>denen</word>
107
+ <word>denke</word>
108
+ <word>denken</word>
109
+ <word>denkst</word>
110
+ <word>der</word>
111
+ <word>des</word>
112
+ <word>dich</word>
113
+ <word>die</word>
114
+ <word>diese</word>
115
+ <word>dieser</word>
116
+ <word>dir</word>
117
+ <word>doch</word>
118
+ <word>dort</word>
119
+ <word>drei</word>
120
+ <word>du</word>
121
+ <word>durch</word>
122
+ <word>dürfen</word>
123
+ <word>duerfen</word>
124
+ <word>ehemalig</word>
125
+ <word>eher</word>
126
+ <word>ein</word>
127
+ <word>eine</word>
128
+ <word>einem</word>
129
+ <word>einen</word>
130
+ <word>einer</word>
131
+ <word>eines</word>
132
+ <word>einmal</word>
133
+ <word>entlang</word>
134
+ <word>er</word>
135
+ <word>erhalt</word>
136
+ <word>erhalten</word>
137
+ <word>erste</word>
138
+ <word>es</word>
139
+ <word>etliche</word>
140
+ <word>etwa</word>
141
+ <word>etwas</word>
142
+ <word>fahre</word>
143
+ <word>fahren</word>
144
+ <word>fahrt</word>
145
+ <word>fast</word>
146
+ <word>frau</word>
147
+ <word>fuer</word>
148
+ <word>für</word>
149
+ <word>fuer</word>
150
+ <word>geben</word>
151
+ <word>gegen</word>
152
+ <word>gegenüber</word>
153
+ <word>gegenueber</word>
154
+ <word>geh</word>
155
+ <word>gehabt</word>
156
+ <word>gehen</word>
157
+ <word>geht</word>
158
+ <word>gekonnt</word>
159
+ <word>gelegen</word>
160
+ <word>gelasse</word>
161
+ <word>gelassen</word>
162
+ <word>gelasst</word>
163
+ <word>genug</word>
164
+ <word>gerade</word>
165
+ <word>gesagt</word>
166
+ <word>gesetzt</word>
167
+ <word>getan</word>
168
+ <word>gewesen</word>
169
+ <word>gibt</word>
170
+ <word>gmbh</word>
171
+ <word>gut</word>
172
+ <word>guten</word>
173
+ <word>gutes</word>
174
+ <word>hab</word>
175
+ <word>habe</word>
176
+ <word>haben</word>
177
+ <word>habt</word>
178
+ <word>hast</word>
179
+ <word>hat</word>
180
+ <word>hatte</word>
181
+ <word>häufig</word>
182
+ <word>haeufig</word>
183
+ <word>herr</word>
184
+ <word>heute</word>
185
+ <word>hier</word>
186
+ <word>ich</word>
187
+ <word>ihn</word>
188
+ <word>ihr</word>
189
+ <word>im</word>
190
+ <word>immer</word>
191
+ <word>in</word>
192
+ <word>initiale</word>
193
+ <word>irgend</word>
194
+ <word>irgendein</word>
195
+ <word>ist</word>
196
+ <word>ja</word>
197
+ <word>jede</word>
198
+ <word>jeden</word>
199
+ <word>jeder</word>
200
+ <word>jedes</word>
201
+ <word>jedoch</word>
202
+ <word>jemand</word>
203
+ <word>jetzt</word>
204
+ <word>kann</word>
205
+ <word>kein</word>
206
+ <word>keine</word>
207
+ <word>keinen</word>
208
+ <word>kenne</word>
209
+ <word>kennen</word>
210
+ <word>kennst</word>
211
+ <word>kennt</word>
212
+ <word>klein</word>
213
+ <word>kleiner</word>
214
+ <word>komm</word>
215
+ <word>kommen</word>
216
+ <word>kommt</word>
217
+ <word>können</word>
218
+ <word>koennen</word>
219
+ <word>konnte</word>
220
+ <word>lag</word>
221
+ <word>letztes</word>
222
+ <word>liegen</word>
223
+ <word>los</word>
224
+ <word>mache</word>
225
+ <word>machen</word>
226
+ <word>machst</word>
227
+ <word>macht</word>
228
+ <word>mag</word>
229
+ <word>man</word>
230
+ <word>manchmal</word>
231
+ <word>mann</word>
232
+ <word>mehr</word>
233
+ <word>mein</word>
234
+ <word>meisten</word>
235
+ <word>mich</word>
236
+ <word>mir</word>
237
+ <word>mit</word>
238
+ <word>möglicherweise</word>
239
+ <word>moeglicherweise</word>
240
+ <word>muss</word>
241
+ <word>müssen</word>
242
+ <word>muessen</word>
243
+ <word>musste</word>
244
+ <word>nach</word>
245
+ <word>nächst</word>
246
+ <word>naechst</word>
247
+ <word>nahe</word>
248
+ <word>nein</word>
249
+ <word>nicht</word>
250
+ <word>nichts</word>
251
+ <word>nie</word>
252
+ <word>niemand</word>
253
+ <word>noch</word>
254
+ <word>nur</word>
255
+ <word>oberhalb</word>
256
+ <word>oder</word>
257
+ <word>oft</word>
258
+ <word>ohne</word>
259
+ <word>ok</word>
260
+ <word>okay</word>
261
+ <word>per</word>
262
+ <word>sache</word>
263
+ <word>sachen</word>
264
+ <word>sagen</word>
265
+ <word>sagt</word>
266
+ <word>satz</word>
267
+ <word>schon</word>
268
+ <word>sehe</word>
269
+ <word>sehen</word>
270
+ <word>sehr</word>
271
+ <word>seid</word>
272
+ <word>seiht</word>
273
+ <word>sein</word>
274
+ <word>seine</word>
275
+ <word>seiner</word>
276
+ <word>seit</word>
277
+ <word>selbar</word>
278
+ <word>selben</word>
279
+ <word>selbst</word>
280
+ <word>selten</word>
281
+ <word>sich</word>
282
+ <word>sie</word>
283
+ <word>sind</word>
284
+ <word>sitzen</word>
285
+ <word>so</word>
286
+ <word>sobald</word>
287
+ <word>sollt</word>
288
+ <word>sollte</word>
289
+ <word>sollten</word>
290
+ <word>sowie</word>
291
+ <word>tat</word>
292
+ <word>trotz</word>
293
+ <word>tue</word>
294
+ <word>tun</word>
295
+ <word>tust</word>
296
+ <word>tut</word>
297
+ <word>über</word>
298
+ <word>ueber</word>
299
+ <word>um</word>
300
+ <word>und</word>
301
+ <word>uns</word>
302
+ <word>unser</word>
303
+ <word>unten</word>
304
+ <word>unter</word>
305
+ <word>unterhalb</word>
306
+ <word>unterschiedlich</word>
307
+ <word>viel</word>
308
+ <word>viele</word>
309
+ <word>vier</word>
310
+ <word>von</word>
311
+ <word>vor</word>
312
+ <word>vorher</word>
313
+ <word>während</word>
314
+ <word>waehrend</word>
315
+ <word>wann</word>
316
+ <word>war</word>
317
+ <word>warum</word>
318
+ <word>was</word>
319
+ <word>wegen</word>
320
+ <word>weil</word>
321
+ <word>weise</word>
322
+ <word>welche</word>
323
+ <word>welchem</word>
324
+ <word>wem</word>
325
+ <word>wen</word>
326
+ <word>wenige</word>
327
+ <word>wenn</word>
328
+ <word>wer</word>
329
+ <word>werde</word>
330
+ <word>werden</word>
331
+ <word>wessen</word>
332
+ <word>wie</word>
333
+ <word>wieder</word>
334
+ <word>will</word>
335
+ <word>willst</word>
336
+ <word>wir</word>
337
+ <word>wird</word>
338
+ <word>wirklich</word>
339
+ <word>wirst</word>
340
+ <word>wissen</word>
341
+ <word>wo</word>
342
+ <word>wollen</word>
343
+ <word>wurde</word>
344
+ <word>z.b.</word>
345
+ <word>zu</word>
346
+ <word>zuerst</word>
347
+ <word>zum</word>
348
+ <word>zur</word>
349
+ <word>zurück</word>
350
+ <word>zurueck</word>
351
+ <word>zwei</word>
352
+ <word>zwischen</word>
353
+ </grader-tc>
354
+ </dictionary>