ots 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/ext/article.c
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
* article.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
extern void ots_free_TF_wordlist (GList * aList);
|
29
|
+
|
30
|
+
#define MAX_WORD_LENGTH 35
|
31
|
+
|
32
|
+
/*Data structure related functions*/
|
33
|
+
|
34
|
+
OtsSentence *
|
35
|
+
ots_new_sentence (void)
|
36
|
+
{
|
37
|
+
OtsSentence *aLine = g_new0 (OtsSentence, 1);
|
38
|
+
aLine->words = NULL;
|
39
|
+
aLine->wc = 0;
|
40
|
+
aLine->selected = 0;
|
41
|
+
aLine->score = 0;
|
42
|
+
return aLine;
|
43
|
+
}
|
44
|
+
|
45
|
+
void
|
46
|
+
ots_free_sentence (OtsSentence * sen)
|
47
|
+
{
|
48
|
+
if (sen != NULL)
|
49
|
+
{
|
50
|
+
g_list_foreach (sen->words, (GFunc) g_free, NULL);
|
51
|
+
g_list_free (sen->words);
|
52
|
+
g_free (sen);
|
53
|
+
}
|
54
|
+
sen=NULL;
|
55
|
+
}
|
56
|
+
|
57
|
+
OtsArticle *
|
58
|
+
ots_new_article (void)
|
59
|
+
{
|
60
|
+
OtsArticle *Doc;
|
61
|
+
Doc = g_new0 (OtsArticle, 1);
|
62
|
+
Doc->lineCount = 0;
|
63
|
+
Doc->title = NULL;
|
64
|
+
Doc->stem=new_stem_rule ();
|
65
|
+
Doc->lines=NULL;
|
66
|
+
Doc->dict = NULL;
|
67
|
+
Doc->ImpWords = NULL;
|
68
|
+
Doc->wordStat = NULL;
|
69
|
+
|
70
|
+
Doc->tf_terms=NULL;
|
71
|
+
return Doc;
|
72
|
+
}
|
73
|
+
|
74
|
+
void
|
75
|
+
ots_free_article (OtsArticle * art)
|
76
|
+
{
|
77
|
+
if (NULL != art)
|
78
|
+
{
|
79
|
+
free_stem_rule (art->stem);
|
80
|
+
ots_free_wordlist (art->dict);
|
81
|
+
ots_free_wordlist (art->ImpWords);
|
82
|
+
ots_free_wordlist (art->wordStat);
|
83
|
+
|
84
|
+
ots_free_TF_wordlist(art->tf_terms);
|
85
|
+
|
86
|
+
g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
|
87
|
+
g_list_free (art->lines);
|
88
|
+
|
89
|
+
if (art->title != NULL) g_free (art->title);
|
90
|
+
g_free (art);
|
91
|
+
}
|
92
|
+
art=NULL;
|
93
|
+
}
|
94
|
+
|
95
|
+
OtsSentence *
|
96
|
+
ots_append_line (OtsArticle * Doc)
|
97
|
+
{
|
98
|
+
OtsSentence *aLine = ots_new_sentence ();
|
99
|
+
Doc->lineCount++;
|
100
|
+
Doc->lines = g_list_append (Doc->lines, aLine);
|
101
|
+
return aLine;
|
102
|
+
}
|
103
|
+
|
104
|
+
void
|
105
|
+
ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
|
106
|
+
{
|
107
|
+
if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
|
108
|
+
aLine->wc++;
|
109
|
+
aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
|
110
|
+
return;
|
111
|
+
}
|
112
|
+
|
113
|
+
|
114
|
+
gboolean
|
115
|
+
ots_is_line_selected(const OtsSentence *aLine)
|
116
|
+
{
|
117
|
+
if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
|
118
|
+
return (aLine->selected);
|
119
|
+
}
|
data/ext/dictionary.c
ADDED
@@ -0,0 +1,335 @@
|
|
1
|
+
/*
|
2
|
+
* dictionary.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
#include <libxml/xmlmemory.h>
|
29
|
+
#include <libxml/parser.h>
|
30
|
+
|
31
|
+
|
32
|
+
/* loads the xml dictionary file to memory*/
|
33
|
+
|
34
|
+
gboolean
|
35
|
+
ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
|
36
|
+
{
|
37
|
+
|
38
|
+
xmlDocPtr doc=NULL;
|
39
|
+
xmlNodePtr head=NULL;
|
40
|
+
xmlNodePtr stem=NULL;
|
41
|
+
xmlNodePtr pre=NULL;
|
42
|
+
xmlNodePtr post=NULL;
|
43
|
+
xmlNodePtr syno=NULL; /* synonyms */
|
44
|
+
xmlNodePtr manual=NULL; /* manual */
|
45
|
+
xmlNodePtr step1_pre=NULL; /* step1 */
|
46
|
+
xmlNodePtr step1_post=NULL; /* step1 */
|
47
|
+
|
48
|
+
xmlNodePtr parse=NULL; /* parser rules */
|
49
|
+
xmlNodePtr pbreak=NULL;
|
50
|
+
xmlNodePtr pdbreak=NULL;
|
51
|
+
|
52
|
+
xmlNodePtr tc_words=NULL; /* term count dictionary */
|
53
|
+
xmlNodePtr tf_words=NULL; /* term frequency dictionary */
|
54
|
+
|
55
|
+
|
56
|
+
OtsStemRule * rule=Doc->stem;
|
57
|
+
|
58
|
+
char *dict_name;
|
59
|
+
char *local_dict_name;
|
60
|
+
|
61
|
+
dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
|
62
|
+
local_dict_name = g_strdup_printf ("%s.xml", name);
|
63
|
+
|
64
|
+
|
65
|
+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
66
|
+
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
|
67
|
+
if (doc == NULL) doc = xmlParseFile (dict_name);
|
68
|
+
if (doc == NULL) return (FALSE);
|
69
|
+
|
70
|
+
head = xmlDocGetRootElement (doc);
|
71
|
+
if (head == NULL)
|
72
|
+
{
|
73
|
+
fprintf (stderr, "empty document\n");
|
74
|
+
xmlFreeDoc (doc);
|
75
|
+
return (FALSE);
|
76
|
+
}
|
77
|
+
|
78
|
+
if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
|
79
|
+
{
|
80
|
+
fprintf (stderr, "%s", head->name);
|
81
|
+
xmlFreeDoc (doc);
|
82
|
+
return (FALSE);
|
83
|
+
}
|
84
|
+
|
85
|
+
if (head != NULL)
|
86
|
+
stem = head->xmlChildrenNode;
|
87
|
+
while ((stem != NULL)
|
88
|
+
&& (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
|
89
|
+
{
|
90
|
+
stem = stem->next;
|
91
|
+
}
|
92
|
+
|
93
|
+
if (head != NULL)
|
94
|
+
parse = head->xmlChildrenNode;
|
95
|
+
while ((parse != NULL)
|
96
|
+
&& (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
|
97
|
+
{
|
98
|
+
parse = parse->next;
|
99
|
+
}
|
100
|
+
|
101
|
+
if (head != NULL)
|
102
|
+
tc_words = head->xmlChildrenNode;
|
103
|
+
while ((tc_words != NULL)
|
104
|
+
&& (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
|
105
|
+
{
|
106
|
+
tc_words = tc_words->next;
|
107
|
+
}
|
108
|
+
|
109
|
+
|
110
|
+
if (head != NULL)
|
111
|
+
tf_words = head->xmlChildrenNode;
|
112
|
+
while ((tf_words != NULL)
|
113
|
+
&& (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
|
114
|
+
{
|
115
|
+
tf_words = tf_words->next;
|
116
|
+
}
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
if (stem != NULL)
|
121
|
+
pre = stem->xmlChildrenNode;
|
122
|
+
while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
|
123
|
+
{
|
124
|
+
pre = pre->next;
|
125
|
+
}
|
126
|
+
|
127
|
+
if (stem != NULL)
|
128
|
+
post = stem->xmlChildrenNode;
|
129
|
+
while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
|
130
|
+
{
|
131
|
+
post = post->next;
|
132
|
+
}
|
133
|
+
|
134
|
+
|
135
|
+
if (stem != NULL)
|
136
|
+
syno = stem->xmlChildrenNode;
|
137
|
+
while ((syno != NULL)
|
138
|
+
&& (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
|
139
|
+
{
|
140
|
+
syno = syno->next;
|
141
|
+
}
|
142
|
+
|
143
|
+
if (stem != NULL)
|
144
|
+
manual = stem->xmlChildrenNode;
|
145
|
+
while ((manual != NULL)
|
146
|
+
&& (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
|
147
|
+
{
|
148
|
+
manual = manual->next;
|
149
|
+
}
|
150
|
+
|
151
|
+
|
152
|
+
if (stem != NULL)
|
153
|
+
step1_pre = stem->xmlChildrenNode;
|
154
|
+
while ((step1_pre != NULL)
|
155
|
+
&& (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
|
156
|
+
{
|
157
|
+
step1_pre = step1_pre->next;
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
if (stem != NULL)
|
163
|
+
step1_post = stem->xmlChildrenNode;
|
164
|
+
while ((step1_post != NULL)
|
165
|
+
&& (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
|
166
|
+
{
|
167
|
+
step1_post = step1_post->next;
|
168
|
+
}
|
169
|
+
|
170
|
+
|
171
|
+
if (pre != NULL)
|
172
|
+
pre = pre->xmlChildrenNode; /*point to first word */
|
173
|
+
while (pre != NULL)
|
174
|
+
{
|
175
|
+
if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
|
176
|
+
rule->RemovePre =
|
177
|
+
g_list_append (rule->RemovePre,
|
178
|
+
(xmlNodeListGetString
|
179
|
+
(doc, pre->xmlChildrenNode, 1)));
|
180
|
+
pre = pre->next;
|
181
|
+
}
|
182
|
+
|
183
|
+
|
184
|
+
if (post != NULL)
|
185
|
+
post = post->xmlChildrenNode;
|
186
|
+
while (post != NULL)
|
187
|
+
{
|
188
|
+
if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
|
189
|
+
rule->RemovePost =
|
190
|
+
g_list_append (rule->RemovePost,
|
191
|
+
(xmlNodeListGetString
|
192
|
+
(doc, post->xmlChildrenNode, 1)));
|
193
|
+
post = post->next;
|
194
|
+
}
|
195
|
+
|
196
|
+
if (syno != NULL)
|
197
|
+
syno = syno->xmlChildrenNode;
|
198
|
+
while (syno != NULL)
|
199
|
+
{
|
200
|
+
if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
|
201
|
+
rule->synonyms =
|
202
|
+
g_list_append (rule->synonyms,
|
203
|
+
(xmlNodeListGetString
|
204
|
+
(doc, syno->xmlChildrenNode, 1)));
|
205
|
+
syno = syno->next;
|
206
|
+
}
|
207
|
+
|
208
|
+
if (manual != NULL)
|
209
|
+
manual = manual->xmlChildrenNode;
|
210
|
+
while (manual != NULL)
|
211
|
+
{
|
212
|
+
if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
|
213
|
+
rule->manual =
|
214
|
+
g_list_append (rule->manual,
|
215
|
+
(xmlNodeListGetString
|
216
|
+
(doc, manual->xmlChildrenNode, 1)));
|
217
|
+
manual = manual->next;
|
218
|
+
}
|
219
|
+
|
220
|
+
|
221
|
+
|
222
|
+
|
223
|
+
if (step1_pre != NULL)
|
224
|
+
step1_pre = step1_pre->xmlChildrenNode;
|
225
|
+
while (step1_pre != NULL)
|
226
|
+
{
|
227
|
+
if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
|
228
|
+
rule->step1_pre =
|
229
|
+
g_list_append (rule->step1_pre,
|
230
|
+
(xmlNodeListGetString
|
231
|
+
(doc, step1_pre->xmlChildrenNode, 1)));
|
232
|
+
step1_pre = step1_pre->next;
|
233
|
+
}
|
234
|
+
|
235
|
+
|
236
|
+
|
237
|
+
if (step1_post != NULL)
|
238
|
+
step1_post = step1_post->xmlChildrenNode;
|
239
|
+
while (step1_post != NULL)
|
240
|
+
{
|
241
|
+
if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
|
242
|
+
rule->step1_post =
|
243
|
+
g_list_append (rule->step1_post,
|
244
|
+
(xmlNodeListGetString
|
245
|
+
(doc, step1_post->xmlChildrenNode, 1)));
|
246
|
+
step1_post = step1_post->next;
|
247
|
+
}
|
248
|
+
|
249
|
+
if (parse != NULL)
|
250
|
+
pbreak = parse->xmlChildrenNode;
|
251
|
+
while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
|
252
|
+
{
|
253
|
+
pbreak = pbreak->next;
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
|
258
|
+
if (parse != NULL)
|
259
|
+
pdbreak = parse->xmlChildrenNode;
|
260
|
+
while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
|
261
|
+
{
|
262
|
+
pdbreak = pdbreak->next;
|
263
|
+
}
|
264
|
+
|
265
|
+
|
266
|
+
/*Parser break*/
|
267
|
+
if (pbreak != NULL)
|
268
|
+
pbreak = pbreak->xmlChildrenNode;
|
269
|
+
while (pbreak != NULL)
|
270
|
+
{
|
271
|
+
if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
|
272
|
+
rule->ParserBreak =
|
273
|
+
g_list_append (rule->ParserBreak,
|
274
|
+
(xmlNodeListGetString
|
275
|
+
(doc, pbreak->xmlChildrenNode, 1)));
|
276
|
+
pbreak = pbreak->next;
|
277
|
+
}
|
278
|
+
|
279
|
+
/*Parser Don't break*/
|
280
|
+
if (pdbreak != NULL)
|
281
|
+
pdbreak = pdbreak->xmlChildrenNode;
|
282
|
+
while (pdbreak != NULL)
|
283
|
+
{
|
284
|
+
if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
|
285
|
+
rule->ParserDontBreak =
|
286
|
+
g_list_append (rule->ParserDontBreak,
|
287
|
+
(xmlNodeListGetString
|
288
|
+
(doc, pdbreak->xmlChildrenNode, 1)));
|
289
|
+
pdbreak = pdbreak->next;
|
290
|
+
}
|
291
|
+
|
292
|
+
/*Term Count load dict*/
|
293
|
+
|
294
|
+
if (tc_words != NULL)
|
295
|
+
tc_words = tc_words->xmlChildrenNode;
|
296
|
+
while (tc_words != NULL)
|
297
|
+
{
|
298
|
+
if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
|
299
|
+
{
|
300
|
+
xmlChar *key;
|
301
|
+
key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
|
302
|
+
Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
|
303
|
+
xmlFree(key);
|
304
|
+
}
|
305
|
+
tc_words = tc_words->next;
|
306
|
+
}
|
307
|
+
|
308
|
+
|
309
|
+
/*Term Frequency load dict*/
|
310
|
+
|
311
|
+
if (tf_words != NULL)
|
312
|
+
tf_words = tf_words->xmlChildrenNode;
|
313
|
+
while (tf_words != NULL)
|
314
|
+
{
|
315
|
+
if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
|
316
|
+
{
|
317
|
+
xmlChar *key;
|
318
|
+
xmlChar *idf_key;
|
319
|
+
key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
|
320
|
+
|
321
|
+
idf_key=xmlGetProp(tf_words,"idf");
|
322
|
+
Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
|
323
|
+
xmlFree(key);
|
324
|
+
xmlFree(idf_key);
|
325
|
+
}
|
326
|
+
tf_words = tf_words->next;
|
327
|
+
}
|
328
|
+
|
329
|
+
|
330
|
+
xmlFreeDoc(doc);
|
331
|
+
//xmlCleanupParser ();
|
332
|
+
g_free(dict_name);
|
333
|
+
g_free(local_dict_name);
|
334
|
+
return (TRUE);
|
335
|
+
}
|