ots 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +80 -0
- data/dictionaries/bg.xml +101 -0
- data/dictionaries/ca.xml +141 -0
- data/dictionaries/cs.xml +161 -0
- data/dictionaries/cy.xml +118 -0
- data/dictionaries/da.xml +129 -0
- data/dictionaries/de.xml +354 -0
- data/dictionaries/el.xml +80 -0
- data/dictionaries/en.xml +606 -0
- data/dictionaries/eo.xml +171 -0
- data/dictionaries/es.xml +369 -0
- data/dictionaries/et.xml +172 -0
- data/dictionaries/eu.xml +77 -0
- data/dictionaries/fi.xml +105 -0
- data/dictionaries/fr.xml +199 -0
- data/dictionaries/ga.xml +124 -0
- data/dictionaries/gl.xml +290 -0
- data/dictionaries/he.xml +334 -0
- data/dictionaries/hu.xml +280 -0
- data/dictionaries/ia.xml +97 -0
- data/dictionaries/id.xml +75 -0
- data/dictionaries/is.xml +201 -0
- data/dictionaries/it.xml +206 -0
- data/dictionaries/lv.xml +77 -0
- data/dictionaries/mi.xml +76 -0
- data/dictionaries/ms.xml +160 -0
- data/dictionaries/mt.xml +73 -0
- data/dictionaries/nl.xml +245 -0
- data/dictionaries/nn.xml +264 -0
- data/dictionaries/pl.xml +92 -0
- data/dictionaries/pt.xml +365 -0
- data/dictionaries/ro.xml +163 -0
- data/dictionaries/ru.xml +150 -0
- data/dictionaries/sv.xml +255 -0
- data/dictionaries/tl.xml +67 -0
- data/dictionaries/tr.xml +65 -0
- data/dictionaries/uk.xml +98 -0
- data/dictionaries/yi.xml +293 -0
- data/ext/article.c +119 -0
- data/ext/dictionary.c +335 -0
- data/ext/extconf.rb +13 -14
- data/ext/grader-tc.c +185 -0
- data/ext/grader-tc.h +64 -0
- data/ext/grader-tf.c +116 -0
- data/ext/grader.c +85 -0
- data/ext/highlighter.c +128 -0
- data/ext/html.c +131 -0
- data/ext/libots.h +158 -0
- data/ext/ots.c +130 -151
- data/ext/ots.h +15 -0
- data/ext/parser.c +173 -0
- data/ext/relations.c +163 -0
- data/ext/stemmer.c +332 -0
- data/ext/text.c +98 -0
- data/ext/version.h +2 -0
- data/ext/wordlist.c +220 -0
- data/test/helper.rb +3 -0
- data/test/test_article.rb +52 -0
- data/test/test_ots.rb +23 -0
- metadata +122 -38
- data/README +0 -25
- data/VERSION +0 -1
- data/lib/ots.rb +0 -1
- data/test/ots_test.rb +0 -62
data/ext/article.c
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
* article.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
extern void ots_free_TF_wordlist (GList * aList);
|
29
|
+
|
30
|
+
#define MAX_WORD_LENGTH 35
|
31
|
+
|
32
|
+
/*Data structure related functions*/
|
33
|
+
|
34
|
+
OtsSentence *
|
35
|
+
ots_new_sentence (void)
|
36
|
+
{
|
37
|
+
OtsSentence *aLine = g_new0 (OtsSentence, 1);
|
38
|
+
aLine->words = NULL;
|
39
|
+
aLine->wc = 0;
|
40
|
+
aLine->selected = 0;
|
41
|
+
aLine->score = 0;
|
42
|
+
return aLine;
|
43
|
+
}
|
44
|
+
|
45
|
+
void
|
46
|
+
ots_free_sentence (OtsSentence * sen)
|
47
|
+
{
|
48
|
+
if (sen != NULL)
|
49
|
+
{
|
50
|
+
g_list_foreach (sen->words, (GFunc) g_free, NULL);
|
51
|
+
g_list_free (sen->words);
|
52
|
+
g_free (sen);
|
53
|
+
}
|
54
|
+
sen=NULL;
|
55
|
+
}
|
56
|
+
|
57
|
+
OtsArticle *
|
58
|
+
ots_new_article (void)
|
59
|
+
{
|
60
|
+
OtsArticle *Doc;
|
61
|
+
Doc = g_new0 (OtsArticle, 1);
|
62
|
+
Doc->lineCount = 0;
|
63
|
+
Doc->title = NULL;
|
64
|
+
Doc->stem=new_stem_rule ();
|
65
|
+
Doc->lines=NULL;
|
66
|
+
Doc->dict = NULL;
|
67
|
+
Doc->ImpWords = NULL;
|
68
|
+
Doc->wordStat = NULL;
|
69
|
+
|
70
|
+
Doc->tf_terms=NULL;
|
71
|
+
return Doc;
|
72
|
+
}
|
73
|
+
|
74
|
+
void
|
75
|
+
ots_free_article (OtsArticle * art)
|
76
|
+
{
|
77
|
+
if (NULL != art)
|
78
|
+
{
|
79
|
+
free_stem_rule (art->stem);
|
80
|
+
ots_free_wordlist (art->dict);
|
81
|
+
ots_free_wordlist (art->ImpWords);
|
82
|
+
ots_free_wordlist (art->wordStat);
|
83
|
+
|
84
|
+
ots_free_TF_wordlist(art->tf_terms);
|
85
|
+
|
86
|
+
g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
|
87
|
+
g_list_free (art->lines);
|
88
|
+
|
89
|
+
if (art->title != NULL) g_free (art->title);
|
90
|
+
g_free (art);
|
91
|
+
}
|
92
|
+
art=NULL;
|
93
|
+
}
|
94
|
+
|
95
|
+
OtsSentence *
|
96
|
+
ots_append_line (OtsArticle * Doc)
|
97
|
+
{
|
98
|
+
OtsSentence *aLine = ots_new_sentence ();
|
99
|
+
Doc->lineCount++;
|
100
|
+
Doc->lines = g_list_append (Doc->lines, aLine);
|
101
|
+
return aLine;
|
102
|
+
}
|
103
|
+
|
104
|
+
void
|
105
|
+
ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
|
106
|
+
{
|
107
|
+
if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
|
108
|
+
aLine->wc++;
|
109
|
+
aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
|
110
|
+
return;
|
111
|
+
}
|
112
|
+
|
113
|
+
|
114
|
+
gboolean
|
115
|
+
ots_is_line_selected(const OtsSentence *aLine)
|
116
|
+
{
|
117
|
+
if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
|
118
|
+
return (aLine->selected);
|
119
|
+
}
|
data/ext/dictionary.c
ADDED
@@ -0,0 +1,335 @@
|
|
1
|
+
/*
|
2
|
+
* dictionary.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
#include <libxml/xmlmemory.h>
|
29
|
+
#include <libxml/parser.h>
|
30
|
+
|
31
|
+
|
32
|
+
/* loads the xml dictionary file to memory*/
|
33
|
+
|
34
|
+
gboolean
|
35
|
+
ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
|
36
|
+
{
|
37
|
+
|
38
|
+
xmlDocPtr doc=NULL;
|
39
|
+
xmlNodePtr head=NULL;
|
40
|
+
xmlNodePtr stem=NULL;
|
41
|
+
xmlNodePtr pre=NULL;
|
42
|
+
xmlNodePtr post=NULL;
|
43
|
+
xmlNodePtr syno=NULL; /* synonyms */
|
44
|
+
xmlNodePtr manual=NULL; /* manual */
|
45
|
+
xmlNodePtr step1_pre=NULL; /* step1 */
|
46
|
+
xmlNodePtr step1_post=NULL; /* step1 */
|
47
|
+
|
48
|
+
xmlNodePtr parse=NULL; /* parser rules */
|
49
|
+
xmlNodePtr pbreak=NULL;
|
50
|
+
xmlNodePtr pdbreak=NULL;
|
51
|
+
|
52
|
+
xmlNodePtr tc_words=NULL; /* term count dictionary */
|
53
|
+
xmlNodePtr tf_words=NULL; /* term frequency dictionary */
|
54
|
+
|
55
|
+
|
56
|
+
OtsStemRule * rule=Doc->stem;
|
57
|
+
|
58
|
+
char *dict_name;
|
59
|
+
char *local_dict_name;
|
60
|
+
|
61
|
+
dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
|
62
|
+
local_dict_name = g_strdup_printf ("%s.xml", name);
|
63
|
+
|
64
|
+
|
65
|
+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
66
|
+
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
|
67
|
+
if (doc == NULL) doc = xmlParseFile (dict_name);
|
68
|
+
if (doc == NULL) return (FALSE);
|
69
|
+
|
70
|
+
head = xmlDocGetRootElement (doc);
|
71
|
+
if (head == NULL)
|
72
|
+
{
|
73
|
+
fprintf (stderr, "empty document\n");
|
74
|
+
xmlFreeDoc (doc);
|
75
|
+
return (FALSE);
|
76
|
+
}
|
77
|
+
|
78
|
+
if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
|
79
|
+
{
|
80
|
+
fprintf (stderr, "%s", head->name);
|
81
|
+
xmlFreeDoc (doc);
|
82
|
+
return (FALSE);
|
83
|
+
}
|
84
|
+
|
85
|
+
if (head != NULL)
|
86
|
+
stem = head->xmlChildrenNode;
|
87
|
+
while ((stem != NULL)
|
88
|
+
&& (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
|
89
|
+
{
|
90
|
+
stem = stem->next;
|
91
|
+
}
|
92
|
+
|
93
|
+
if (head != NULL)
|
94
|
+
parse = head->xmlChildrenNode;
|
95
|
+
while ((parse != NULL)
|
96
|
+
&& (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
|
97
|
+
{
|
98
|
+
parse = parse->next;
|
99
|
+
}
|
100
|
+
|
101
|
+
if (head != NULL)
|
102
|
+
tc_words = head->xmlChildrenNode;
|
103
|
+
while ((tc_words != NULL)
|
104
|
+
&& (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
|
105
|
+
{
|
106
|
+
tc_words = tc_words->next;
|
107
|
+
}
|
108
|
+
|
109
|
+
|
110
|
+
if (head != NULL)
|
111
|
+
tf_words = head->xmlChildrenNode;
|
112
|
+
while ((tf_words != NULL)
|
113
|
+
&& (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
|
114
|
+
{
|
115
|
+
tf_words = tf_words->next;
|
116
|
+
}
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
if (stem != NULL)
|
121
|
+
pre = stem->xmlChildrenNode;
|
122
|
+
while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
|
123
|
+
{
|
124
|
+
pre = pre->next;
|
125
|
+
}
|
126
|
+
|
127
|
+
if (stem != NULL)
|
128
|
+
post = stem->xmlChildrenNode;
|
129
|
+
while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
|
130
|
+
{
|
131
|
+
post = post->next;
|
132
|
+
}
|
133
|
+
|
134
|
+
|
135
|
+
if (stem != NULL)
|
136
|
+
syno = stem->xmlChildrenNode;
|
137
|
+
while ((syno != NULL)
|
138
|
+
&& (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
|
139
|
+
{
|
140
|
+
syno = syno->next;
|
141
|
+
}
|
142
|
+
|
143
|
+
if (stem != NULL)
|
144
|
+
manual = stem->xmlChildrenNode;
|
145
|
+
while ((manual != NULL)
|
146
|
+
&& (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
|
147
|
+
{
|
148
|
+
manual = manual->next;
|
149
|
+
}
|
150
|
+
|
151
|
+
|
152
|
+
if (stem != NULL)
|
153
|
+
step1_pre = stem->xmlChildrenNode;
|
154
|
+
while ((step1_pre != NULL)
|
155
|
+
&& (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
|
156
|
+
{
|
157
|
+
step1_pre = step1_pre->next;
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
if (stem != NULL)
|
163
|
+
step1_post = stem->xmlChildrenNode;
|
164
|
+
while ((step1_post != NULL)
|
165
|
+
&& (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
|
166
|
+
{
|
167
|
+
step1_post = step1_post->next;
|
168
|
+
}
|
169
|
+
|
170
|
+
|
171
|
+
if (pre != NULL)
|
172
|
+
pre = pre->xmlChildrenNode; /*point to first word */
|
173
|
+
while (pre != NULL)
|
174
|
+
{
|
175
|
+
if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
|
176
|
+
rule->RemovePre =
|
177
|
+
g_list_append (rule->RemovePre,
|
178
|
+
(xmlNodeListGetString
|
179
|
+
(doc, pre->xmlChildrenNode, 1)));
|
180
|
+
pre = pre->next;
|
181
|
+
}
|
182
|
+
|
183
|
+
|
184
|
+
if (post != NULL)
|
185
|
+
post = post->xmlChildrenNode;
|
186
|
+
while (post != NULL)
|
187
|
+
{
|
188
|
+
if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
|
189
|
+
rule->RemovePost =
|
190
|
+
g_list_append (rule->RemovePost,
|
191
|
+
(xmlNodeListGetString
|
192
|
+
(doc, post->xmlChildrenNode, 1)));
|
193
|
+
post = post->next;
|
194
|
+
}
|
195
|
+
|
196
|
+
if (syno != NULL)
|
197
|
+
syno = syno->xmlChildrenNode;
|
198
|
+
while (syno != NULL)
|
199
|
+
{
|
200
|
+
if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
|
201
|
+
rule->synonyms =
|
202
|
+
g_list_append (rule->synonyms,
|
203
|
+
(xmlNodeListGetString
|
204
|
+
(doc, syno->xmlChildrenNode, 1)));
|
205
|
+
syno = syno->next;
|
206
|
+
}
|
207
|
+
|
208
|
+
if (manual != NULL)
|
209
|
+
manual = manual->xmlChildrenNode;
|
210
|
+
while (manual != NULL)
|
211
|
+
{
|
212
|
+
if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
|
213
|
+
rule->manual =
|
214
|
+
g_list_append (rule->manual,
|
215
|
+
(xmlNodeListGetString
|
216
|
+
(doc, manual->xmlChildrenNode, 1)));
|
217
|
+
manual = manual->next;
|
218
|
+
}
|
219
|
+
|
220
|
+
|
221
|
+
|
222
|
+
|
223
|
+
if (step1_pre != NULL)
|
224
|
+
step1_pre = step1_pre->xmlChildrenNode;
|
225
|
+
while (step1_pre != NULL)
|
226
|
+
{
|
227
|
+
if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
|
228
|
+
rule->step1_pre =
|
229
|
+
g_list_append (rule->step1_pre,
|
230
|
+
(xmlNodeListGetString
|
231
|
+
(doc, step1_pre->xmlChildrenNode, 1)));
|
232
|
+
step1_pre = step1_pre->next;
|
233
|
+
}
|
234
|
+
|
235
|
+
|
236
|
+
|
237
|
+
if (step1_post != NULL)
|
238
|
+
step1_post = step1_post->xmlChildrenNode;
|
239
|
+
while (step1_post != NULL)
|
240
|
+
{
|
241
|
+
if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
|
242
|
+
rule->step1_post =
|
243
|
+
g_list_append (rule->step1_post,
|
244
|
+
(xmlNodeListGetString
|
245
|
+
(doc, step1_post->xmlChildrenNode, 1)));
|
246
|
+
step1_post = step1_post->next;
|
247
|
+
}
|
248
|
+
|
249
|
+
if (parse != NULL)
|
250
|
+
pbreak = parse->xmlChildrenNode;
|
251
|
+
while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
|
252
|
+
{
|
253
|
+
pbreak = pbreak->next;
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
|
258
|
+
if (parse != NULL)
|
259
|
+
pdbreak = parse->xmlChildrenNode;
|
260
|
+
while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
|
261
|
+
{
|
262
|
+
pdbreak = pdbreak->next;
|
263
|
+
}
|
264
|
+
|
265
|
+
|
266
|
+
/*Parser break*/
|
267
|
+
if (pbreak != NULL)
|
268
|
+
pbreak = pbreak->xmlChildrenNode;
|
269
|
+
while (pbreak != NULL)
|
270
|
+
{
|
271
|
+
if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
|
272
|
+
rule->ParserBreak =
|
273
|
+
g_list_append (rule->ParserBreak,
|
274
|
+
(xmlNodeListGetString
|
275
|
+
(doc, pbreak->xmlChildrenNode, 1)));
|
276
|
+
pbreak = pbreak->next;
|
277
|
+
}
|
278
|
+
|
279
|
+
/*Parser Don't break*/
|
280
|
+
if (pdbreak != NULL)
|
281
|
+
pdbreak = pdbreak->xmlChildrenNode;
|
282
|
+
while (pdbreak != NULL)
|
283
|
+
{
|
284
|
+
if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
|
285
|
+
rule->ParserDontBreak =
|
286
|
+
g_list_append (rule->ParserDontBreak,
|
287
|
+
(xmlNodeListGetString
|
288
|
+
(doc, pdbreak->xmlChildrenNode, 1)));
|
289
|
+
pdbreak = pdbreak->next;
|
290
|
+
}
|
291
|
+
|
292
|
+
/*Term Count load dict*/
|
293
|
+
|
294
|
+
if (tc_words != NULL)
|
295
|
+
tc_words = tc_words->xmlChildrenNode;
|
296
|
+
while (tc_words != NULL)
|
297
|
+
{
|
298
|
+
if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
|
299
|
+
{
|
300
|
+
xmlChar *key;
|
301
|
+
key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
|
302
|
+
Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
|
303
|
+
xmlFree(key);
|
304
|
+
}
|
305
|
+
tc_words = tc_words->next;
|
306
|
+
}
|
307
|
+
|
308
|
+
|
309
|
+
/*Term Frequency load dict*/
|
310
|
+
|
311
|
+
if (tf_words != NULL)
|
312
|
+
tf_words = tf_words->xmlChildrenNode;
|
313
|
+
while (tf_words != NULL)
|
314
|
+
{
|
315
|
+
if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
|
316
|
+
{
|
317
|
+
xmlChar *key;
|
318
|
+
xmlChar *idf_key;
|
319
|
+
key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
|
320
|
+
|
321
|
+
idf_key=xmlGetProp(tf_words,"idf");
|
322
|
+
Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
|
323
|
+
xmlFree(key);
|
324
|
+
xmlFree(idf_key);
|
325
|
+
}
|
326
|
+
tf_words = tf_words->next;
|
327
|
+
}
|
328
|
+
|
329
|
+
|
330
|
+
xmlFreeDoc(doc);
|
331
|
+
//xmlCleanupParser ();
|
332
|
+
g_free(dict_name);
|
333
|
+
g_free(local_dict_name);
|
334
|
+
return (TRUE);
|
335
|
+
}
|