ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/dictionaries/is.xml
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="icelandic">
|
3
|
+
<stemmer>
|
4
|
+
|
5
|
+
<step1_pre>
|
6
|
+
<rule>"|</rule>
|
7
|
+
<rule>(|</rule>
|
8
|
+
</step1_pre>
|
9
|
+
|
10
|
+
|
11
|
+
<step1_post>
|
12
|
+
<rule>."|</rule>
|
13
|
+
<rule>,"|</rule>
|
14
|
+
<rule>.|</rule>
|
15
|
+
<rule>,|</rule>
|
16
|
+
<rule>"|</rule>
|
17
|
+
<rule>)|</rule>
|
18
|
+
<rule>?|</rule>
|
19
|
+
<rule>:|</rule>
|
20
|
+
<rule>;|</rule>
|
21
|
+
<rule>!|</rule>
|
22
|
+
</step1_post>
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
<manual>
|
27
|
+
<rule>wrote|write</rule>
|
28
|
+
<rule>came|come</rule>
|
29
|
+
<rule>went|go</rule>
|
30
|
+
</manual>
|
31
|
+
|
32
|
+
<post>
|
33
|
+
<rule>before1|1after</rule>
|
34
|
+
</post>
|
35
|
+
<pre>
|
36
|
+
<rule>before1|1after</rule>
|
37
|
+
</pre>
|
38
|
+
</stemmer>
|
39
|
+
<parser>
|
40
|
+
|
41
|
+
<linebreak>
|
42
|
+
<rule>."</rule>
|
43
|
+
<rule>?"</rule>
|
44
|
+
<rule>!"</rule>
|
45
|
+
<rule>,"</rule>
|
46
|
+
<rule>.</rule>
|
47
|
+
<rule>?</rule>
|
48
|
+
<rule>;</rule>
|
49
|
+
<rule>|</rule>
|
50
|
+
<rule>!</rule>
|
51
|
+
</linebreak>
|
52
|
+
|
53
|
+
<linedontbreak>
|
54
|
+
<rule>Dr.</rule>
|
55
|
+
<rule>Mr.</rule>
|
56
|
+
<rule>Mrs.</rule>
|
57
|
+
<rule>U.S.</rule>
|
58
|
+
<rule>Rep.</rule>
|
59
|
+
<rule>Sen.</rule>
|
60
|
+
</linedontbreak>
|
61
|
+
</parser>
|
62
|
+
<grader-tc>
|
63
|
+
<word>að</word>
|
64
|
+
<word>af</word>
|
65
|
+
<word>andspænis</word>
|
66
|
+
<word>annaðhvort</word>
|
67
|
+
<word>auk</word>
|
68
|
+
<word>austan</word>
|
69
|
+
<word>á</word>
|
70
|
+
<word>án</word>
|
71
|
+
<word>ásamt</word>
|
72
|
+
<word>bæði</word>
|
73
|
+
<word>eða</word>
|
74
|
+
<word>ef</word>
|
75
|
+
<word>eftir</word>
|
76
|
+
<word>eiga</word>
|
77
|
+
<word>en</word>
|
78
|
+
<word>er</word>
|
79
|
+
<word>ert</word>
|
80
|
+
<word>eru</word>
|
81
|
+
<word>eruð</word>
|
82
|
+
<word>erum</word>
|
83
|
+
<word>ég</word>
|
84
|
+
<word>fara</word>
|
85
|
+
<word>fá</word>
|
86
|
+
<word>frá</word>
|
87
|
+
<word>fyrir</word>
|
88
|
+
<word>fyrst</word>
|
89
|
+
<word>gagn</word>
|
90
|
+
<word>gagnvart</word>
|
91
|
+
<word>gegnt</word>
|
92
|
+
<word>gegnum</word>
|
93
|
+
<word>geta</word>
|
94
|
+
<word>hafa</word>
|
95
|
+
<word>hana</word>
|
96
|
+
<word>handa</word>
|
97
|
+
<word>hann</word>
|
98
|
+
<word>hans</word>
|
99
|
+
<word>hennar</word>
|
100
|
+
<word>henni</word>
|
101
|
+
<word>hið</word>
|
102
|
+
<word>hin</word>
|
103
|
+
<word>hina</word>
|
104
|
+
<word>hinar</word>
|
105
|
+
<word>hinir</word>
|
106
|
+
<word>hinn</word>
|
107
|
+
<word>hinna</word>
|
108
|
+
<word>hinnar</word>
|
109
|
+
<word>hinni</word>
|
110
|
+
<word>hins</word>
|
111
|
+
<word>hinu</word>
|
112
|
+
<word>hinum</word>
|
113
|
+
<word>hjá</word>
|
114
|
+
<word>honum</word>
|
115
|
+
<word>hún</word>
|
116
|
+
<word>hver</word>
|
117
|
+
<word>hverjum</word>
|
118
|
+
<word>hvorki</word>
|
119
|
+
<word>hvort</word>
|
120
|
+
<word>innan</word>
|
121
|
+
<word>í</word>
|
122
|
+
<word>kringum</word>
|
123
|
+
<word>með</word>
|
124
|
+
<word>meðal</word>
|
125
|
+
<word>meðfram</word>
|
126
|
+
<word>mega</word>
|
127
|
+
<word>megin</word>
|
128
|
+
<word>mér</word>
|
129
|
+
<word>mig</word>
|
130
|
+
<word>milli</word>
|
131
|
+
<word>millum</word>
|
132
|
+
<word>mín</word>
|
133
|
+
<word>mót</word>
|
134
|
+
<word>móti</word>
|
135
|
+
<word>munu</word>
|
136
|
+
<word>nálægt</word>
|
137
|
+
<word>neðan</word>
|
138
|
+
<word>nema</word>
|
139
|
+
<word>né</word>
|
140
|
+
<word>norðan</word>
|
141
|
+
<word>ofan</word>
|
142
|
+
<word>og</word>
|
143
|
+
<word>okkur</word>
|
144
|
+
<word>pro</word>
|
145
|
+
<word>sakir</word>
|
146
|
+
<word>sem</word>
|
147
|
+
<word>sé</word>
|
148
|
+
<word>sért</word>
|
149
|
+
<word>séu</word>
|
150
|
+
<word>séuð</word>
|
151
|
+
<word>séum</word>
|
152
|
+
<word>síðan</word>
|
153
|
+
<word>skulu</word>
|
154
|
+
<word>sunnan</word>
|
155
|
+
<word>sökum</word>
|
156
|
+
<word>til</word>
|
157
|
+
<word>um</word>
|
158
|
+
<word>umfram</word>
|
159
|
+
<word>umhverfis</word>
|
160
|
+
<word>undan</word>
|
161
|
+
<word>undir</word>
|
162
|
+
<word>utan</word>
|
163
|
+
<word>úr</word>
|
164
|
+
<word>var</word>
|
165
|
+
<word>varst</word>
|
166
|
+
<word>vegna</word>
|
167
|
+
<word>vera</word>
|
168
|
+
<word>verandi</word>
|
169
|
+
<word>vestan</word>
|
170
|
+
<word>við</word>
|
171
|
+
<word>voru</word>
|
172
|
+
<word>voruð</word>
|
173
|
+
<word>vorum</word>
|
174
|
+
<word>væri</word>
|
175
|
+
<word>værir</word>
|
176
|
+
<word>væru</word>
|
177
|
+
<word>væruð</word>
|
178
|
+
<word>værum</word>
|
179
|
+
<word>yðar</word>
|
180
|
+
<word>yður</word>
|
181
|
+
<word>yfir</word>
|
182
|
+
<word>ykkar</word>
|
183
|
+
<word>ykkur</word>
|
184
|
+
<word>það</word>
|
185
|
+
<word>þau</word>
|
186
|
+
<word>þá</word>
|
187
|
+
<word>þegar</word>
|
188
|
+
<word>þeim</word>
|
189
|
+
<word>þeir</word>
|
190
|
+
<word>þeirra</word>
|
191
|
+
<word>þess</word>
|
192
|
+
<word>þér</word>
|
193
|
+
<word>þið</word>
|
194
|
+
<word>þig</word>
|
195
|
+
<word>þín</word>
|
196
|
+
<word>þótt</word>
|
197
|
+
<word>þú</word>
|
198
|
+
<word>því</word>
|
199
|
+
<word>þær</word>
|
200
|
+
</grader-tc>
|
201
|
+
</dictionary>
|
data/dictionaries/it.xml
ADDED
@@ -0,0 +1,206 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="italian">
|
3
|
+
<stemmer>
|
4
|
+
|
5
|
+
<step1_pre>
|
6
|
+
<rule>"|</rule>
|
7
|
+
<rule>(|</rule>
|
8
|
+
</step1_pre>
|
9
|
+
|
10
|
+
|
11
|
+
<step1_post>
|
12
|
+
<rule>."|</rule>
|
13
|
+
<rule>,"|</rule>
|
14
|
+
<rule>.|</rule>
|
15
|
+
<rule>,|</rule>
|
16
|
+
<rule>"|</rule>
|
17
|
+
<rule>)|</rule>
|
18
|
+
<rule>?|</rule>
|
19
|
+
<rule>:|</rule>
|
20
|
+
<rule>;|</rule>
|
21
|
+
<rule>!|</rule>
|
22
|
+
</step1_post>
|
23
|
+
|
24
|
+
|
25
|
+
<manual>
|
26
|
+
<rule>wrote|write</rule>
|
27
|
+
<rule>came|come</rule>
|
28
|
+
<rule>went|go</rule>
|
29
|
+
</manual>
|
30
|
+
|
31
|
+
<post>
|
32
|
+
<rule>before1|1after</rule>
|
33
|
+
</post>
|
34
|
+
<pre>
|
35
|
+
<rule>before1|1after</rule>
|
36
|
+
</pre>
|
37
|
+
</stemmer>
|
38
|
+
<parser>
|
39
|
+
|
40
|
+
<linebreak>
|
41
|
+
<rule>."</rule>
|
42
|
+
<rule>?"</rule>
|
43
|
+
<rule>!"</rule>
|
44
|
+
<rule>,"</rule>
|
45
|
+
<rule>.</rule>
|
46
|
+
<rule>?</rule>
|
47
|
+
<rule>;</rule>
|
48
|
+
<rule>|</rule>
|
49
|
+
<rule>!</rule>
|
50
|
+
</linebreak>
|
51
|
+
|
52
|
+
<linedontbreak>
|
53
|
+
<rule>Dr.</rule>
|
54
|
+
<rule>Mr.</rule>
|
55
|
+
<rule>Mrs.</rule>
|
56
|
+
<rule>U.S.</rule>
|
57
|
+
<rule>Rep.</rule>
|
58
|
+
<rule>Sen.</rule>
|
59
|
+
</linedontbreak>
|
60
|
+
</parser>
|
61
|
+
<grader-tc>
|
62
|
+
<word>il</word>
|
63
|
+
<word>lo</word>
|
64
|
+
<word>l'</word>
|
65
|
+
<word>i</word>
|
66
|
+
<word>gli</word>
|
67
|
+
<word>gl'</word>
|
68
|
+
<word>la</word>
|
69
|
+
<word>le</word>
|
70
|
+
<word>un</word>
|
71
|
+
<word>uno</word>
|
72
|
+
<word>una</word>
|
73
|
+
<word>un'</word>
|
74
|
+
<word>io</word>
|
75
|
+
<word>noi</word>
|
76
|
+
<word>mio</word>
|
77
|
+
<word>tu</word>
|
78
|
+
<word>voi</word>
|
79
|
+
<word>vostro</word>
|
80
|
+
<word>lui</word>
|
81
|
+
<word>lei</word>
|
82
|
+
<word>egli</word>
|
83
|
+
<word>ella</word>
|
84
|
+
<word>esso</word>
|
85
|
+
<word>essa</word>
|
86
|
+
<word>loro</word>
|
87
|
+
<word>essi</word>
|
88
|
+
<word>esse</word>
|
89
|
+
<word>suo</word>
|
90
|
+
<word>sé</word>
|
91
|
+
<word>si</word>
|
92
|
+
<word>c'è</word>
|
93
|
+
<word>a</word>
|
94
|
+
<word>ad</word>
|
95
|
+
<word>alcuno</word>
|
96
|
+
<word>che</word>
|
97
|
+
<word>come</word>
|
98
|
+
<word>con</word>
|
99
|
+
<word>così</word>
|
100
|
+
<word>da</word>
|
101
|
+
<word>di</word>
|
102
|
+
<word>domani</word>
|
103
|
+
<word>e</word>
|
104
|
+
<word>ed</word>
|
105
|
+
<word>in</word>
|
106
|
+
<word>infine</word>
|
107
|
+
<word>ma</word>
|
108
|
+
<word>mai</word>
|
109
|
+
<word>mentre</word>
|
110
|
+
<word>molto</word>
|
111
|
+
<word>né</word>
|
112
|
+
<word>nessuno</word>
|
113
|
+
<word>nessun</word>
|
114
|
+
<word>nessuna</word>
|
115
|
+
<word>nessun'</word>
|
116
|
+
<word>niente</word>
|
117
|
+
<word>no</word>
|
118
|
+
<word>non</word>
|
119
|
+
<word>nulla</word>
|
120
|
+
<word>o</word>
|
121
|
+
<word>oggi</word>
|
122
|
+
<word>ora</word>
|
123
|
+
<word>per</word>
|
124
|
+
<word>poi</word>
|
125
|
+
<word>poiché</word>
|
126
|
+
<word>qualche</word>
|
127
|
+
<word>qualcuno</word>
|
128
|
+
<word>quando</word>
|
129
|
+
<word>questo</word>
|
130
|
+
<word>qui</word>
|
131
|
+
<word>se</word>
|
132
|
+
<word>su</word>
|
133
|
+
<word>troppo</word>
|
134
|
+
<word>tutto</word>
|
135
|
+
<word>al</word>
|
136
|
+
<word>ai</word>
|
137
|
+
<word>allo</word>
|
138
|
+
<word>agli</word>
|
139
|
+
<word>alla</word>
|
140
|
+
<word>alle</word>
|
141
|
+
<word>all'</word>
|
142
|
+
<word>col</word>
|
143
|
+
<word>coi</word>
|
144
|
+
<word>collo</word>
|
145
|
+
<word>cogli</word>
|
146
|
+
<word>colla</word>
|
147
|
+
<word>colle</word>
|
148
|
+
<word>coll'</word>
|
149
|
+
<word>dal</word>
|
150
|
+
<word>dai</word>
|
151
|
+
<word>dallo</word>
|
152
|
+
<word>dagli</word>
|
153
|
+
<word>dalla</word>
|
154
|
+
<word>dalle</word>
|
155
|
+
<word>dall'</word>
|
156
|
+
<word>del</word>
|
157
|
+
<word>dei</word>
|
158
|
+
<word>dello</word>
|
159
|
+
<word>degli</word>
|
160
|
+
<word>della</word>
|
161
|
+
<word>delle</word>
|
162
|
+
<word>dell'</word>
|
163
|
+
<word>nel</word>
|
164
|
+
<word>nei</word>
|
165
|
+
<word>nello</word>
|
166
|
+
<word>negli</word>
|
167
|
+
<word>nella</word>
|
168
|
+
<word>nelle</word>
|
169
|
+
<word>nell'</word>
|
170
|
+
<word>pel</word>
|
171
|
+
<word>pei</word>
|
172
|
+
<word>sul</word>
|
173
|
+
<word>sui</word>
|
174
|
+
<word>sullo</word>
|
175
|
+
<word>sugli</word>
|
176
|
+
<word>sulla</word>
|
177
|
+
<word>sulle</word>
|
178
|
+
<word>sull'</word>
|
179
|
+
<word>primo</word>
|
180
|
+
<word>essere</word>
|
181
|
+
<word>sono</word>
|
182
|
+
<word>sei</word>
|
183
|
+
<word>è</word>
|
184
|
+
<word>siamo</word>
|
185
|
+
<word>siete</word>
|
186
|
+
<word>stare</word>
|
187
|
+
<word>sto</word>
|
188
|
+
<word>stai</word>
|
189
|
+
<word>sta</word>
|
190
|
+
<word>stiamo</word>
|
191
|
+
<word>stano</word>
|
192
|
+
<word></word>
|
193
|
+
<word>avere</word>
|
194
|
+
<word>ho</word>
|
195
|
+
<word>hai</word>
|
196
|
+
<word>ha</word>
|
197
|
+
<word>abbiamo</word>
|
198
|
+
<word>avete</word>
|
199
|
+
<word>hanno</word>
|
200
|
+
<word>dovere</word>
|
201
|
+
<word>potere</word>
|
202
|
+
<word>andare</word>
|
203
|
+
<word>va</word>
|
204
|
+
<word></word>
|
205
|
+
</grader-tc>
|
206
|
+
</dictionary>
|
data/dictionaries/lv.xml
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="latvian">
|
3
|
+
<stemmer>
|
4
|
+
|
5
|
+
<step1_pre>
|
6
|
+
<rule>"|</rule>
|
7
|
+
<rule>(|</rule>
|
8
|
+
</step1_pre>
|
9
|
+
|
10
|
+
|
11
|
+
<step1_post>
|
12
|
+
<rule>."|</rule>
|
13
|
+
<rule>,"|</rule>
|
14
|
+
<rule>.|</rule>
|
15
|
+
<rule>,|</rule>
|
16
|
+
<rule>"|</rule>
|
17
|
+
<rule>)|</rule>
|
18
|
+
<rule>?|</rule>
|
19
|
+
<rule>:|</rule>
|
20
|
+
<rule>;|</rule>
|
21
|
+
<rule>!|</rule>
|
22
|
+
</step1_post>
|
23
|
+
|
24
|
+
|
25
|
+
<manual>
|
26
|
+
<rule>wrote|write</rule>
|
27
|
+
<rule>came|come</rule>
|
28
|
+
<rule>went|go</rule>
|
29
|
+
</manual>
|
30
|
+
|
31
|
+
<post>
|
32
|
+
<rule>before1|1after</rule>
|
33
|
+
</post>
|
34
|
+
<pre>
|
35
|
+
<rule>before1|1after</rule>
|
36
|
+
</pre>
|
37
|
+
</stemmer>
|
38
|
+
<parser>
|
39
|
+
|
40
|
+
<linebreak>
|
41
|
+
<rule>."</rule>
|
42
|
+
<rule>?"</rule>
|
43
|
+
<rule>!"</rule>
|
44
|
+
<rule>,"</rule>
|
45
|
+
<rule>.</rule>
|
46
|
+
<rule>?</rule>
|
47
|
+
<rule>;</rule>
|
48
|
+
<rule>|</rule>
|
49
|
+
<rule>!</rule>
|
50
|
+
</linebreak>
|
51
|
+
|
52
|
+
<linedontbreak>
|
53
|
+
<rule>Dr.</rule>
|
54
|
+
<rule>Mr.</rule>
|
55
|
+
<rule>Mrs.</rule>
|
56
|
+
<rule>U.S.</rule>
|
57
|
+
<rule>Rep.</rule>
|
58
|
+
<rule>Sen.</rule>
|
59
|
+
</linedontbreak>
|
60
|
+
</parser>
|
61
|
+
<grader-tc>
|
62
|
+
<word>pa</word>
|
63
|
+
<word>par</word>
|
64
|
+
<word>pat</word>
|
65
|
+
<word>pats</word>
|
66
|
+
<word>pār</word>
|
67
|
+
<word>pārāk</word>
|
68
|
+
<word>pārējais</word>
|
69
|
+
<word>pāri</word>
|
70
|
+
<word>pēc</word>
|
71
|
+
<word>pie</word>
|
72
|
+
<word>pirms</word>
|
73
|
+
<word>pret</word>
|
74
|
+
<word>priekšu</word>
|
75
|
+
<word>projām</word>
|
76
|
+
</grader-tc>
|
77
|
+
</dictionary>
|