ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,119 @@
1
+ /*
2
+ * article.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ extern void ots_free_TF_wordlist (GList * aList);
29
+
30
+ #define MAX_WORD_LENGTH 35
31
+
32
+ /*Data structure related functions*/
33
+
34
+ OtsSentence *
35
+ ots_new_sentence (void)
36
+ {
37
+ OtsSentence *aLine = g_new0 (OtsSentence, 1);
38
+ aLine->words = NULL;
39
+ aLine->wc = 0;
40
+ aLine->selected = 0;
41
+ aLine->score = 0;
42
+ return aLine;
43
+ }
44
+
45
+ void
46
+ ots_free_sentence (OtsSentence * sen)
47
+ {
48
+ if (sen != NULL)
49
+ {
50
+ g_list_foreach (sen->words, (GFunc) g_free, NULL);
51
+ g_list_free (sen->words);
52
+ g_free (sen);
53
+ }
54
+ sen=NULL;
55
+ }
56
+
57
+ OtsArticle *
58
+ ots_new_article (void)
59
+ {
60
+ OtsArticle *Doc;
61
+ Doc = g_new0 (OtsArticle, 1);
62
+ Doc->lineCount = 0;
63
+ Doc->title = NULL;
64
+ Doc->stem=new_stem_rule ();
65
+ Doc->lines=NULL;
66
+ Doc->dict = NULL;
67
+ Doc->ImpWords = NULL;
68
+ Doc->wordStat = NULL;
69
+
70
+ Doc->tf_terms=NULL;
71
+ return Doc;
72
+ }
73
+
74
+ void
75
+ ots_free_article (OtsArticle * art)
76
+ {
77
+ if (NULL != art)
78
+ {
79
+ free_stem_rule (art->stem);
80
+ ots_free_wordlist (art->dict);
81
+ ots_free_wordlist (art->ImpWords);
82
+ ots_free_wordlist (art->wordStat);
83
+
84
+ ots_free_TF_wordlist(art->tf_terms);
85
+
86
+ g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
87
+ g_list_free (art->lines);
88
+
89
+ if (art->title != NULL) g_free (art->title);
90
+ g_free (art);
91
+ }
92
+ art=NULL;
93
+ }
94
+
95
+ OtsSentence *
96
+ ots_append_line (OtsArticle * Doc)
97
+ {
98
+ OtsSentence *aLine = ots_new_sentence ();
99
+ Doc->lineCount++;
100
+ Doc->lines = g_list_append (Doc->lines, aLine);
101
+ return aLine;
102
+ }
103
+
104
+ void
105
+ ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
106
+ {
107
+ if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
108
+ aLine->wc++;
109
+ aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
110
+ return;
111
+ }
112
+
113
+
114
+ gboolean
115
+ ots_is_line_selected(const OtsSentence *aLine)
116
+ {
117
+ if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
118
+ return (aLine->selected);
119
+ }
@@ -0,0 +1,335 @@
1
+ /*
2
+ * dictionary.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ #include <libxml/xmlmemory.h>
29
+ #include <libxml/parser.h>
30
+
31
+
32
+ /* loads the xml dictionary file to memory*/
33
+
34
+ gboolean
35
+ ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
36
+ {
37
+
38
+ xmlDocPtr doc=NULL;
39
+ xmlNodePtr head=NULL;
40
+ xmlNodePtr stem=NULL;
41
+ xmlNodePtr pre=NULL;
42
+ xmlNodePtr post=NULL;
43
+ xmlNodePtr syno=NULL; /* synonyms */
44
+ xmlNodePtr manual=NULL; /* manual */
45
+ xmlNodePtr step1_pre=NULL; /* step1 */
46
+ xmlNodePtr step1_post=NULL; /* step1 */
47
+
48
+ xmlNodePtr parse=NULL; /* parser rules */
49
+ xmlNodePtr pbreak=NULL;
50
+ xmlNodePtr pdbreak=NULL;
51
+
52
+ xmlNodePtr tc_words=NULL; /* term count dictionary */
53
+ xmlNodePtr tf_words=NULL; /* term frequency dictionary */
54
+
55
+
56
+ OtsStemRule * rule=Doc->stem;
57
+
58
+ char *dict_name;
59
+ char *local_dict_name;
60
+
61
+ dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
62
+ local_dict_name = g_strdup_printf ("%s.xml", name);
63
+
64
+
65
+ if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
66
+ doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
67
+ if (doc == NULL) doc = xmlParseFile (dict_name);
68
+ if (doc == NULL) return (FALSE);
69
+
70
+ head = xmlDocGetRootElement (doc);
71
+ if (head == NULL)
72
+ {
73
+ fprintf (stderr, "empty document\n");
74
+ xmlFreeDoc (doc);
75
+ return (FALSE);
76
+ }
77
+
78
+ if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
79
+ {
80
+ fprintf (stderr, "%s", head->name);
81
+ xmlFreeDoc (doc);
82
+ return (FALSE);
83
+ }
84
+
85
+ if (head != NULL)
86
+ stem = head->xmlChildrenNode;
87
+ while ((stem != NULL)
88
+ && (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
89
+ {
90
+ stem = stem->next;
91
+ }
92
+
93
+ if (head != NULL)
94
+ parse = head->xmlChildrenNode;
95
+ while ((parse != NULL)
96
+ && (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
97
+ {
98
+ parse = parse->next;
99
+ }
100
+
101
+ if (head != NULL)
102
+ tc_words = head->xmlChildrenNode;
103
+ while ((tc_words != NULL)
104
+ && (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
105
+ {
106
+ tc_words = tc_words->next;
107
+ }
108
+
109
+
110
+ if (head != NULL)
111
+ tf_words = head->xmlChildrenNode;
112
+ while ((tf_words != NULL)
113
+ && (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
114
+ {
115
+ tf_words = tf_words->next;
116
+ }
117
+
118
+
119
+
120
+ if (stem != NULL)
121
+ pre = stem->xmlChildrenNode;
122
+ while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
123
+ {
124
+ pre = pre->next;
125
+ }
126
+
127
+ if (stem != NULL)
128
+ post = stem->xmlChildrenNode;
129
+ while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
130
+ {
131
+ post = post->next;
132
+ }
133
+
134
+
135
+ if (stem != NULL)
136
+ syno = stem->xmlChildrenNode;
137
+ while ((syno != NULL)
138
+ && (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
139
+ {
140
+ syno = syno->next;
141
+ }
142
+
143
+ if (stem != NULL)
144
+ manual = stem->xmlChildrenNode;
145
+ while ((manual != NULL)
146
+ && (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
147
+ {
148
+ manual = manual->next;
149
+ }
150
+
151
+
152
+ if (stem != NULL)
153
+ step1_pre = stem->xmlChildrenNode;
154
+ while ((step1_pre != NULL)
155
+ && (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
156
+ {
157
+ step1_pre = step1_pre->next;
158
+ }
159
+
160
+
161
+
162
+ if (stem != NULL)
163
+ step1_post = stem->xmlChildrenNode;
164
+ while ((step1_post != NULL)
165
+ && (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
166
+ {
167
+ step1_post = step1_post->next;
168
+ }
169
+
170
+
171
+ if (pre != NULL)
172
+ pre = pre->xmlChildrenNode; /*point to first word */
173
+ while (pre != NULL)
174
+ {
175
+ if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
176
+ rule->RemovePre =
177
+ g_list_append (rule->RemovePre,
178
+ (xmlNodeListGetString
179
+ (doc, pre->xmlChildrenNode, 1)));
180
+ pre = pre->next;
181
+ }
182
+
183
+
184
+ if (post != NULL)
185
+ post = post->xmlChildrenNode;
186
+ while (post != NULL)
187
+ {
188
+ if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
189
+ rule->RemovePost =
190
+ g_list_append (rule->RemovePost,
191
+ (xmlNodeListGetString
192
+ (doc, post->xmlChildrenNode, 1)));
193
+ post = post->next;
194
+ }
195
+
196
+ if (syno != NULL)
197
+ syno = syno->xmlChildrenNode;
198
+ while (syno != NULL)
199
+ {
200
+ if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
201
+ rule->synonyms =
202
+ g_list_append (rule->synonyms,
203
+ (xmlNodeListGetString
204
+ (doc, syno->xmlChildrenNode, 1)));
205
+ syno = syno->next;
206
+ }
207
+
208
+ if (manual != NULL)
209
+ manual = manual->xmlChildrenNode;
210
+ while (manual != NULL)
211
+ {
212
+ if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
213
+ rule->manual =
214
+ g_list_append (rule->manual,
215
+ (xmlNodeListGetString
216
+ (doc, manual->xmlChildrenNode, 1)));
217
+ manual = manual->next;
218
+ }
219
+
220
+
221
+
222
+
223
+ if (step1_pre != NULL)
224
+ step1_pre = step1_pre->xmlChildrenNode;
225
+ while (step1_pre != NULL)
226
+ {
227
+ if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
228
+ rule->step1_pre =
229
+ g_list_append (rule->step1_pre,
230
+ (xmlNodeListGetString
231
+ (doc, step1_pre->xmlChildrenNode, 1)));
232
+ step1_pre = step1_pre->next;
233
+ }
234
+
235
+
236
+
237
+ if (step1_post != NULL)
238
+ step1_post = step1_post->xmlChildrenNode;
239
+ while (step1_post != NULL)
240
+ {
241
+ if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
242
+ rule->step1_post =
243
+ g_list_append (rule->step1_post,
244
+ (xmlNodeListGetString
245
+ (doc, step1_post->xmlChildrenNode, 1)));
246
+ step1_post = step1_post->next;
247
+ }
248
+
249
+ if (parse != NULL)
250
+ pbreak = parse->xmlChildrenNode;
251
+ while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
252
+ {
253
+ pbreak = pbreak->next;
254
+ }
255
+
256
+
257
+
258
+ if (parse != NULL)
259
+ pdbreak = parse->xmlChildrenNode;
260
+ while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
261
+ {
262
+ pdbreak = pdbreak->next;
263
+ }
264
+
265
+
266
+ /*Parser break*/
267
+ if (pbreak != NULL)
268
+ pbreak = pbreak->xmlChildrenNode;
269
+ while (pbreak != NULL)
270
+ {
271
+ if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
272
+ rule->ParserBreak =
273
+ g_list_append (rule->ParserBreak,
274
+ (xmlNodeListGetString
275
+ (doc, pbreak->xmlChildrenNode, 1)));
276
+ pbreak = pbreak->next;
277
+ }
278
+
279
+ /*Parser Don't break*/
280
+ if (pdbreak != NULL)
281
+ pdbreak = pdbreak->xmlChildrenNode;
282
+ while (pdbreak != NULL)
283
+ {
284
+ if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
285
+ rule->ParserDontBreak =
286
+ g_list_append (rule->ParserDontBreak,
287
+ (xmlNodeListGetString
288
+ (doc, pdbreak->xmlChildrenNode, 1)));
289
+ pdbreak = pdbreak->next;
290
+ }
291
+
292
+ /*Term Count load dict*/
293
+
294
+ if (tc_words != NULL)
295
+ tc_words = tc_words->xmlChildrenNode;
296
+ while (tc_words != NULL)
297
+ {
298
+ if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
299
+ {
300
+ xmlChar *key;
301
+ key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
302
+ Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
303
+ xmlFree(key);
304
+ }
305
+ tc_words = tc_words->next;
306
+ }
307
+
308
+
309
+ /*Term Frequency load dict*/
310
+
311
+ if (tf_words != NULL)
312
+ tf_words = tf_words->xmlChildrenNode;
313
+ while (tf_words != NULL)
314
+ {
315
+ if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
316
+ {
317
+ xmlChar *key;
318
+ xmlChar *idf_key;
319
+ key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
320
+
321
+ idf_key=xmlGetProp(tf_words,"idf");
322
+ Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
323
+ xmlFree(key);
324
+ xmlFree(idf_key);
325
+ }
326
+ tf_words = tf_words->next;
327
+ }
328
+
329
+
330
+ xmlFreeDoc(doc);
331
+ //xmlCleanupParser ();
332
+ g_free(dict_name);
333
+ g_free(local_dict_name);
334
+ return (TRUE);
335
+ }