ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,163 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="romanian">
3
+ <stemmer>
4
+
5
+ <step1_pre>
6
+ <rule>"|</rule>
7
+ <rule>(|</rule>
8
+ </step1_pre>
9
+
10
+
11
+ <step1_post>
12
+ <rule>."|</rule>
13
+ <rule>,"|</rule>
14
+ <rule>.|</rule>
15
+ <rule>,|</rule>
16
+ <rule>"|</rule>
17
+ <rule>)|</rule>
18
+ <rule>?|</rule>
19
+ <rule>:|</rule>
20
+ <rule>;|</rule>
21
+ <rule>!|</rule>
22
+ </step1_post>
23
+
24
+
25
+ <manual>
26
+ <rule>wrote|write</rule>
27
+ <rule>came|come</rule>
28
+ <rule>went|go</rule>
29
+ </manual>
30
+
31
+ <post>
32
+ <rule>before1|1after</rule>
33
+ </post>
34
+ <pre>
35
+ <rule>before1|1after</rule>
36
+ </pre>
37
+ </stemmer>
38
+ <parser>
39
+
40
+ <linebreak>
41
+ <rule>."</rule>
42
+ <rule>?"</rule>
43
+ <rule>!"</rule>
44
+ <rule>,"</rule>
45
+ <rule>.</rule>
46
+ <rule>?</rule>
47
+ <rule>;</rule>
48
+ <rule>|</rule>
49
+ <rule>!</rule>
50
+ </linebreak>
51
+
52
+ <linedontbreak>
53
+ <rule>Dr.</rule>
54
+ <rule>Mr.</rule>
55
+ <rule>Mrs.</rule>
56
+ <rule>U.S.</rule>
57
+ <rule>Rep.</rule>
58
+ <rule>Sen.</rule>
59
+ </linedontbreak>
60
+ </parser>
61
+ <grader-tc>
62
+ <word>acasă</word>
63
+ <word>acest</word>
64
+ <word>acolo</word>
65
+ <word>acum</word>
66
+ <word>acuma</word>
67
+ <word>ai</word>
68
+ <word>aicea</word>
69
+ <word>aici</word>
70
+ <word>alt</word>
71
+ <word>am</word>
72
+ <word>apoi</word>
73
+ <word>aproape</word>
74
+ <word>apropro</word>
75
+ <word>are</word>
76
+ <word>aşa</word>
77
+ <word>au</word>
78
+ <word>avea</word>
79
+ <word>avem</word>
80
+ <word>aveţi</word>
81
+ <word>ca</word>
82
+ <word>că</word>
83
+ <word>când</word>
84
+ <word>ce</word>
85
+ <word>cine</word>
86
+ <word>cît</word>
87
+ <word>cîtă</word>
88
+ <word>cîte</word>
89
+ <word>cîţi</word>
90
+ <word>cu</word>
91
+ <word>da</word>
92
+ <word>deci</word>
93
+ <word>decît</word>
94
+ <word>deja</word>
95
+ <word>doamna</word>
96
+ <word>doi</word>
97
+ <word>domnişoara</word>
98
+ <word>domnul</word>
99
+ <word>două</word>
100
+ <word>dumneaei</word>
101
+ <word>dumnealor</word>
102
+ <word>dumnealui</word>
103
+ <word>dumneata</word>
104
+ <word>dumneavoastră</word>
105
+ <word>după</word>
106
+ <word>ea</word>
107
+ <word>ei</word>
108
+ <word>el</word>
109
+ <word>ele</word>
110
+ <word>este</word>
111
+ <word>eşti</word>
112
+ <word>eu</word>
113
+ <word>face</word>
114
+ <word>fi</word>
115
+ <word>fiindcă</word>
116
+ <word>iar</word>
117
+ <word>ieri</word>
118
+ <word>în</word>
119
+ <word>încă</word>
120
+ <word>într</word>
121
+ <word>între</word>
122
+ <word>la</word>
123
+ <word>lîngă</word>
124
+ <word>lor</word>
125
+ <word>lui</word>
126
+ <word>mai</word>
127
+ <word>merge</word>
128
+ <word>meu</word>
129
+ <word>mîine</word>
130
+ <word>mult</word>
131
+ <word>nicăieri</word>
132
+ <word>nici</word>
133
+ <word>niciodată</word>
134
+ <word>nimeni</word>
135
+ <word>nimic</word>
136
+ <word>nişte</word>
137
+ <word>noi</word>
138
+ <word>nostru</word>
139
+ <word>nu</word>
140
+ <word>o</word>
141
+ <word>pe</word>
142
+ <word>pentru</word>
143
+ <word>puţin</word>
144
+ <word>sînt</word>
145
+ <word>sînt</word>
146
+ <word>sîntem</word>
147
+ <word>sînteţi</word>
148
+ <word>spre</word>
149
+ <word>sub</word>
150
+ <word>şi</word>
151
+ <word>tot</word>
152
+ <word>tu</word>
153
+ <word>un</word>
154
+ <word>una</word>
155
+ <word>unde</word>
156
+ <word>unei</word>
157
+ <word>unor</word>
158
+ <word>unu</word>
159
+ <word>unui</word>
160
+ <word>unul</word>
161
+ <word>voi</word>
162
+ </grader-tc>
163
+ </dictionary>
@@ -0,0 +1,150 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="russian">
3
+ <stemmer>
4
+
5
+ <step1_pre>
6
+ <rule>"|</rule>
7
+ <rule>(|</rule>
8
+ </step1_pre>
9
+
10
+
11
+ <step1_post>
12
+ <rule>."|</rule>
13
+ <rule>,"|</rule>
14
+ <rule>.|</rule>
15
+ <rule>,|</rule>
16
+ <rule>"|</rule>
17
+ <rule>)|</rule>
18
+ <rule>?|</rule>
19
+ <rule>:|</rule>
20
+ <rule>;|</rule>
21
+ <rule>!|</rule>
22
+ </step1_post>
23
+
24
+
25
+ <manual>
26
+ <rule>wrote|write</rule>
27
+ <rule>came|come</rule>
28
+ <rule>went|go</rule>
29
+ </manual>
30
+
31
+ <post>
32
+ <rule>before1|1after</rule>
33
+ </post>
34
+ <pre>
35
+ <rule>before1|1after</rule>
36
+ </pre>
37
+ </stemmer>
38
+ <parser>
39
+
40
+ <linebreak>
41
+ <rule>."</rule>
42
+ <rule>?"</rule>
43
+ <rule>!"</rule>
44
+ <rule>,"</rule>
45
+ <rule>.</rule>
46
+ <rule>?</rule>
47
+ <rule>;</rule>
48
+ <rule>|</rule>
49
+ <rule>!</rule>
50
+ </linebreak>
51
+
52
+ <linedontbreak>
53
+ <rule>Dr.</rule>
54
+ <rule>Mr.</rule>
55
+ <rule>Mrs.</rule>
56
+ <rule>U.S.</rule>
57
+ <rule>Rep.</rule>
58
+ <rule>Sen.</rule>
59
+ </linedontbreak>
60
+ </parser>
61
+ <grader-tc>
62
+ <word>а</word>
63
+ <word>без</word>
64
+ <word>бытовать</word>
65
+ <word>быть</word>
66
+ <word>в</word>
67
+ <word>вещь</word>
68
+ <word>вниз</word>
69
+ <word>внизу</word>
70
+ <word>во</word>
71
+ <word>все</word>
72
+ <word>всегда</word>
73
+ <word>всё</word>
74
+ <word>где</word>
75
+ <word>да</word>
76
+ <word>даже</word>
77
+ <word>два</word>
78
+ <word>две</word>
79
+ <word>для</word>
80
+ <word>должен</word>
81
+ <word>друго</word>
82
+ <word>его</word>
83
+ <word>её</word>
84
+ <word>ей</word>
85
+ <word>ему</word>
86
+ <word>если</word>
87
+ <word>же</word>
88
+ <word>за</word>
89
+ <word>и</word>
90
+ <word>из</word>
91
+ <word>из-за</word>
92
+ <word>или</word>
93
+ <word>им</word>
94
+ <word>к</word>
95
+ <word>каждый</word>
96
+ <word>как</word>
97
+ <word>меня</word>
98
+ <word>мне</word>
99
+ <word>мной</word>
100
+ <word>может</word>
101
+ <word>на</word>
102
+ <word>наверх</word>
103
+ <word>наверху</word>
104
+ <word>над</word>
105
+ <word>не</word>
106
+ <word>ней</word>
107
+ <word>нет</word>
108
+ <word>нём</word>
109
+ <word>нигде</word>
110
+ <word>никто</word>
111
+ <word>ноль</word>
112
+ <word>о</word>
113
+ <word>оба</word>
114
+ <word>обе</word>
115
+ <word>одна</word>
116
+ <word>одно</word>
117
+ <word>около</word>
118
+ <word>он</word>
119
+ <word>она</word>
120
+ <word>оно</word>
121
+ <word>от</word>
122
+ <word>по</word>
123
+ <word>пока</word>
124
+ <word>поперёк</word>
125
+ <word>после</word>
126
+ <word>потом</word>
127
+ <word>почему</word>
128
+ <word>при</word>
129
+ <word>с</word>
130
+ <word>скоро</word>
131
+ <word>сначала</word>
132
+ <word>так</word>
133
+ <word>также</word>
134
+ <word>тебе</word>
135
+ <word>тебя</word>
136
+ <word>теперь</word>
137
+ <word>тобой</word>
138
+ <word>тогда</word>
139
+ <word>тоже</word>
140
+ <word>только</word>
141
+ <word>ты</word>
142
+ <word>у</word>
143
+ <word>уже</word>
144
+ <word>что</word>
145
+ <word>чтобы</word>
146
+ <word>это</word>
147
+ <word>я</word>
148
+ </grader-tc>
149
+ </dictionary>
150
+
@@ -0,0 +1,255 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="swedish">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>all</word>
62
+ <word>alla</word>
63
+ <word>allestädes</word>
64
+ <word>allra</word>
65
+ <word>alls</word>
66
+ <word>allt</word>
67
+ <word>alltför</word>
68
+ <word>alltid</word>
69
+ <word>allting</word>
70
+ <word>alltjämt</word>
71
+ <word>alltmer</word>
72
+ <word>alltnog</word>
73
+ <word>alltsammans</word>
74
+ <word>alltså</word>
75
+ <word>annorlunda</word>
76
+ <word>ar</word>
77
+ <word>att</word>
78
+ <word>av</word>
79
+ <word>bara</word>
80
+ <word>bland</word>
81
+ <word>blev</word>
82
+ <word>bli</word>
83
+ <word>blir</word>
84
+ <word>blivit</word>
85
+ <word>de</word>
86
+ <word>dem</word>
87
+ <word>den</word>
88
+ <word>denna</word>
89
+ <word>densamme</word>
90
+ <word>dess</word>
91
+ <word>dessa</word>
92
+ <word>dessförinnan</word>
93
+ <word>det</word>
94
+ <word>detta</word>
95
+ <word>dig</word>
96
+ <word>dit</word>
97
+ <word>dittills</word>
98
+ <word>dock</word>
99
+ <word>du</word>
100
+ <word>då</word>
101
+ <word>där</word>
102
+ <word>däremot</word>
103
+ <word>därför</word>
104
+ <word>eftersom</word>
105
+ <word>ej</word>
106
+ <word>eller</word>
107
+ <word>emedan</word>
108
+ <word>emellan</word>
109
+ <word>emellanåt</word>
110
+ <word>emellertid</word>
111
+ <word>en</word>
112
+ <word>endast</word>
113
+ <word>endera</word>
114
+ <word>envar</word>
115
+ <word>enär</word>
116
+ <word>er</word>
117
+ <word>ett</word>
118
+ <word>fast</word>
119
+ <word>fastän</word>
120
+ <word>fick</word>
121
+ <word>finnas</word>
122
+ <word>flera</word>
123
+ <word>flesta</word>
124
+ <word>från</word>
125
+ <word>få</word>
126
+ <word>får</word>
127
+ <word>fåt</word>
128
+ <word>förrän</word>
129
+ <word>ha</word>
130
+ <word>han</word>
131
+ <word>heller</word>
132
+ <word>henne</word>
133
+ <word>hit</word>
134
+ <word>hittills</word>
135
+ <word>hitåt</word>
136
+ <word>hon</word>
137
+ <word>honom</word>
138
+ <word>hur</word>
139
+ <word>här</word>
140
+ <word>i</word>
141
+ <word>icke</word>
142
+ <word>ifall</word>
143
+ <word>ifrån</word>
144
+ <word>igen</word>
145
+ <word>igenom</word>
146
+ <word>in</word>
147
+ <word>ingen</word>
148
+ <word>ingendera</word>
149
+ <word>inget</word>
150
+ <word>innan</word>
151
+ <word>innanför</word>
152
+ <word>inne</word>
153
+ <word>ja</word>
154
+ <word>jag</word>
155
+ <word>jo</word>
156
+ <word>kan</word>
157
+ <word>kunde</word>
158
+ <word>kunna</word>
159
+ <word>kunnat</word>
160
+ <word>man</word>
161
+ <word>med</word>
162
+ <word>medan</word>
163
+ <word>mellan</word>
164
+ <word>men</word>
165
+ <word>mer</word>
166
+ <word>mest</word>
167
+ <word>mig</word>
168
+ <word>mycket</word>
169
+ <word>många</word>
170
+ <word>måst</word>
171
+ <word>måsta</word>
172
+ <word>ned</word>
173
+ <word>nedanför</word>
174
+ <word>nedåt</word>
175
+ <word>nej</word>
176
+ <word>ni</word>
177
+ <word>nu</word>
178
+ <word>nyss</word>
179
+ <word>någon</word>
180
+ <word>någondera</word>
181
+ <word>någonsin</word>
182
+ <word>någonstans</word>
183
+ <word>någonting</word>
184
+ <word>några</word>
185
+ <word>när</word>
186
+ <word>och</word>
187
+ <word>också</word>
188
+ <word>om</word>
189
+ <word>oss</word>
190
+ <word>ovan</word>
191
+ <word>ovanför</word>
192
+ <word>ovanpå</word>
193
+ <word>på</word>
194
+ <word>sedan</word>
195
+ <word>senare</word>
196
+ <word>sin</word>
197
+ <word>själv</word>
198
+ <word>ska</word>
199
+ <word>skall</word>
200
+ <word>skulle</word>
201
+ <word>slags</word>
202
+ <word>snart</word>
203
+ <word>som</word>
204
+ <word>somliga</word>
205
+ <word>stundom</word>
206
+ <word>så</word>
207
+ <word>sådan</word>
208
+ <word>således</word>
209
+ <word>sålunda</word>
210
+ <word>såsom</word>
211
+ <word>såvida</word>
212
+ <word>såvitt</word>
213
+ <word>sällan</word>
214
+ <word>tack</word>
215
+ <word>tillbaka</word>
216
+ <word>tills</word>
217
+ <word>upp</word>
218
+ <word>ur</word>
219
+ <word>ut</word>
220
+ <word>utan</word>
221
+ <word>va</word>
222
+ <word>vad</word>
223
+ <word>var</word>
224
+ <word>vara</word>
225
+ <word>varandra</word>
226
+ <word>varav</word>
227
+ <word>vardera</word>
228
+ <word>varenda</word>
229
+ <word>varför</word>
230
+ <word>varifrån</word>
231
+ <word>varje</word>
232
+ <word>vart</word>
233
+ <word>vem</word>
234
+ <word>vi</word>
235
+ <word>vid</word>
236
+ <word>vilja</word>
237
+ <word>vilka</word>
238
+ <word>vilken</word>
239
+ <word>vilket</word>
240
+ <word>vill</word>
241
+ <word>åt</word>
242
+ <word>åtskillig</word>
243
+ <word>åtskilligt</word>
244
+ <word>än</word>
245
+ <word>ändå</word>
246
+ <word>ännu</word>
247
+ <word>äntligen</word>
248
+ <word>är</word>
249
+ <word>även</word>
250
+ <word>ävensom</word>
251
+ <word>ömsom</word>
252
+ <word>över</word>
253
+ <word>överallt</word>
254
+ </grader-tc>
255
+ </dictionary>