summarize 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/README.markdown +42 -0
- data/Rakefile +49 -0
- data/ext/summarize/article.c +119 -0
- data/ext/summarize/dic/bg.xml +101 -0
- data/ext/summarize/dic/ca.xml +141 -0
- data/ext/summarize/dic/cs.xml +161 -0
- data/ext/summarize/dic/cy.xml +118 -0
- data/ext/summarize/dic/da.xml +129 -0
- data/ext/summarize/dic/de.xml +354 -0
- data/ext/summarize/dic/el.xml +80 -0
- data/ext/summarize/dic/en.xml +606 -0
- data/ext/summarize/dic/eo.xml +171 -0
- data/ext/summarize/dic/es.xml +369 -0
- data/ext/summarize/dic/et.xml +172 -0
- data/ext/summarize/dic/eu.xml +77 -0
- data/ext/summarize/dic/fi.xml +105 -0
- data/ext/summarize/dic/fr.xml +199 -0
- data/ext/summarize/dic/ga.xml +124 -0
- data/ext/summarize/dic/gl.xml +290 -0
- data/ext/summarize/dic/he.xml +334 -0
- data/ext/summarize/dic/hu.xml +280 -0
- data/ext/summarize/dic/ia.xml +97 -0
- data/ext/summarize/dic/id.xml +75 -0
- data/ext/summarize/dic/is.xml +201 -0
- data/ext/summarize/dic/it.xml +206 -0
- data/ext/summarize/dic/lv.xml +77 -0
- data/ext/summarize/dic/mi.xml +76 -0
- data/ext/summarize/dic/ms.xml +160 -0
- data/ext/summarize/dic/mt.xml +73 -0
- data/ext/summarize/dic/nl.xml +245 -0
- data/ext/summarize/dic/nn.xml +264 -0
- data/ext/summarize/dic/pl.xml +92 -0
- data/ext/summarize/dic/pt.xml +365 -0
- data/ext/summarize/dic/ro.xml +163 -0
- data/ext/summarize/dic/ru.xml +150 -0
- data/ext/summarize/dic/sv.xml +255 -0
- data/ext/summarize/dic/tl.xml +67 -0
- data/ext/summarize/dic/tr.xml +65 -0
- data/ext/summarize/dic/uk.xml +98 -0
- data/ext/summarize/dic/yi.xml +293 -0
- data/ext/summarize/dictionary.c +331 -0
- data/ext/summarize/extconf.rb +6 -0
- data/ext/summarize/grader-tc.c +185 -0
- data/ext/summarize/grader-tc.h +64 -0
- data/ext/summarize/grader-tf.c +116 -0
- data/ext/summarize/grader.c +85 -0
- data/ext/summarize/highlighter.c +128 -0
- data/ext/summarize/html.c +131 -0
- data/ext/summarize/libots.h +158 -0
- data/ext/summarize/parser.c +173 -0
- data/ext/summarize/relations.c +163 -0
- data/ext/summarize/stemmer.c +332 -0
- data/ext/summarize/summarize.c +43 -0
- data/ext/summarize/summarize.h +12 -0
- data/ext/summarize/text.c +98 -0
- data/ext/summarize/wordlist.c +220 -0
- data/lib/summarize.rb +91 -0
- data/lib/summarize/summarize.bundle +0 -0
- data/sample_data/jupiter.txt +15 -0
- data/summarize.gemspec +21 -0
- metadata +140 -0
@@ -0,0 +1,331 @@
|
|
1
|
+
/*
|
2
|
+
* dictionary.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
#include <libxml/xmlmemory.h>
|
29
|
+
#include <libxml/parser.h>
|
30
|
+
|
31
|
+
|
32
|
+
/* loads the xml dictionary file to memory*/
|
33
|
+
|
34
|
+
gboolean
|
35
|
+
ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
|
36
|
+
{
|
37
|
+
|
38
|
+
xmlDocPtr doc=NULL;
|
39
|
+
xmlNodePtr head=NULL;
|
40
|
+
xmlNodePtr stem=NULL;
|
41
|
+
xmlNodePtr pre=NULL;
|
42
|
+
xmlNodePtr post=NULL;
|
43
|
+
xmlNodePtr syno=NULL; /* synonyms */
|
44
|
+
xmlNodePtr manual=NULL; /* manual */
|
45
|
+
xmlNodePtr step1_pre=NULL; /* step1 */
|
46
|
+
xmlNodePtr step1_post=NULL; /* step1 */
|
47
|
+
|
48
|
+
xmlNodePtr parse=NULL; /* parser rules */
|
49
|
+
xmlNodePtr pbreak=NULL;
|
50
|
+
xmlNodePtr pdbreak=NULL;
|
51
|
+
|
52
|
+
xmlNodePtr tc_words=NULL; /* term count dictionary */
|
53
|
+
xmlNodePtr tf_words=NULL; /* term frequency dictionary */
|
54
|
+
|
55
|
+
|
56
|
+
OtsStemRule * rule=Doc->stem;
|
57
|
+
|
58
|
+
char *local_dict_name;
|
59
|
+
|
60
|
+
local_dict_name = g_strdup_printf ("%s.xml", name);
|
61
|
+
|
62
|
+
|
63
|
+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
64
|
+
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
|
65
|
+
if (doc == NULL) return (FALSE);
|
66
|
+
|
67
|
+
head = xmlDocGetRootElement (doc);
|
68
|
+
if (head == NULL)
|
69
|
+
{
|
70
|
+
fprintf (stderr, "empty document\n");
|
71
|
+
xmlFreeDoc (doc);
|
72
|
+
return (FALSE);
|
73
|
+
}
|
74
|
+
|
75
|
+
if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
|
76
|
+
{
|
77
|
+
fprintf (stderr, "%s", head->name);
|
78
|
+
xmlFreeDoc (doc);
|
79
|
+
return (FALSE);
|
80
|
+
}
|
81
|
+
|
82
|
+
if (head != NULL)
|
83
|
+
stem = head->xmlChildrenNode;
|
84
|
+
while ((stem != NULL)
|
85
|
+
&& (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
|
86
|
+
{
|
87
|
+
stem = stem->next;
|
88
|
+
}
|
89
|
+
|
90
|
+
if (head != NULL)
|
91
|
+
parse = head->xmlChildrenNode;
|
92
|
+
while ((parse != NULL)
|
93
|
+
&& (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
|
94
|
+
{
|
95
|
+
parse = parse->next;
|
96
|
+
}
|
97
|
+
|
98
|
+
if (head != NULL)
|
99
|
+
tc_words = head->xmlChildrenNode;
|
100
|
+
while ((tc_words != NULL)
|
101
|
+
&& (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
|
102
|
+
{
|
103
|
+
tc_words = tc_words->next;
|
104
|
+
}
|
105
|
+
|
106
|
+
|
107
|
+
if (head != NULL)
|
108
|
+
tf_words = head->xmlChildrenNode;
|
109
|
+
while ((tf_words != NULL)
|
110
|
+
&& (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
|
111
|
+
{
|
112
|
+
tf_words = tf_words->next;
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
|
117
|
+
if (stem != NULL)
|
118
|
+
pre = stem->xmlChildrenNode;
|
119
|
+
while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
|
120
|
+
{
|
121
|
+
pre = pre->next;
|
122
|
+
}
|
123
|
+
|
124
|
+
if (stem != NULL)
|
125
|
+
post = stem->xmlChildrenNode;
|
126
|
+
while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
|
127
|
+
{
|
128
|
+
post = post->next;
|
129
|
+
}
|
130
|
+
|
131
|
+
|
132
|
+
if (stem != NULL)
|
133
|
+
syno = stem->xmlChildrenNode;
|
134
|
+
while ((syno != NULL)
|
135
|
+
&& (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
|
136
|
+
{
|
137
|
+
syno = syno->next;
|
138
|
+
}
|
139
|
+
|
140
|
+
if (stem != NULL)
|
141
|
+
manual = stem->xmlChildrenNode;
|
142
|
+
while ((manual != NULL)
|
143
|
+
&& (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
|
144
|
+
{
|
145
|
+
manual = manual->next;
|
146
|
+
}
|
147
|
+
|
148
|
+
|
149
|
+
if (stem != NULL)
|
150
|
+
step1_pre = stem->xmlChildrenNode;
|
151
|
+
while ((step1_pre != NULL)
|
152
|
+
&& (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
|
153
|
+
{
|
154
|
+
step1_pre = step1_pre->next;
|
155
|
+
}
|
156
|
+
|
157
|
+
|
158
|
+
|
159
|
+
if (stem != NULL)
|
160
|
+
step1_post = stem->xmlChildrenNode;
|
161
|
+
while ((step1_post != NULL)
|
162
|
+
&& (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
|
163
|
+
{
|
164
|
+
step1_post = step1_post->next;
|
165
|
+
}
|
166
|
+
|
167
|
+
|
168
|
+
if (pre != NULL)
|
169
|
+
pre = pre->xmlChildrenNode; /*point to first word */
|
170
|
+
while (pre != NULL)
|
171
|
+
{
|
172
|
+
if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
|
173
|
+
rule->RemovePre =
|
174
|
+
g_list_append (rule->RemovePre,
|
175
|
+
(xmlNodeListGetString
|
176
|
+
(doc, pre->xmlChildrenNode, 1)));
|
177
|
+
pre = pre->next;
|
178
|
+
}
|
179
|
+
|
180
|
+
|
181
|
+
if (post != NULL)
|
182
|
+
post = post->xmlChildrenNode;
|
183
|
+
while (post != NULL)
|
184
|
+
{
|
185
|
+
if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
|
186
|
+
rule->RemovePost =
|
187
|
+
g_list_append (rule->RemovePost,
|
188
|
+
(xmlNodeListGetString
|
189
|
+
(doc, post->xmlChildrenNode, 1)));
|
190
|
+
post = post->next;
|
191
|
+
}
|
192
|
+
|
193
|
+
if (syno != NULL)
|
194
|
+
syno = syno->xmlChildrenNode;
|
195
|
+
while (syno != NULL)
|
196
|
+
{
|
197
|
+
if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
|
198
|
+
rule->synonyms =
|
199
|
+
g_list_append (rule->synonyms,
|
200
|
+
(xmlNodeListGetString
|
201
|
+
(doc, syno->xmlChildrenNode, 1)));
|
202
|
+
syno = syno->next;
|
203
|
+
}
|
204
|
+
|
205
|
+
if (manual != NULL)
|
206
|
+
manual = manual->xmlChildrenNode;
|
207
|
+
while (manual != NULL)
|
208
|
+
{
|
209
|
+
if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
|
210
|
+
rule->manual =
|
211
|
+
g_list_append (rule->manual,
|
212
|
+
(xmlNodeListGetString
|
213
|
+
(doc, manual->xmlChildrenNode, 1)));
|
214
|
+
manual = manual->next;
|
215
|
+
}
|
216
|
+
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
if (step1_pre != NULL)
|
221
|
+
step1_pre = step1_pre->xmlChildrenNode;
|
222
|
+
while (step1_pre != NULL)
|
223
|
+
{
|
224
|
+
if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
|
225
|
+
rule->step1_pre =
|
226
|
+
g_list_append (rule->step1_pre,
|
227
|
+
(xmlNodeListGetString
|
228
|
+
(doc, step1_pre->xmlChildrenNode, 1)));
|
229
|
+
step1_pre = step1_pre->next;
|
230
|
+
}
|
231
|
+
|
232
|
+
|
233
|
+
|
234
|
+
if (step1_post != NULL)
|
235
|
+
step1_post = step1_post->xmlChildrenNode;
|
236
|
+
while (step1_post != NULL)
|
237
|
+
{
|
238
|
+
if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
|
239
|
+
rule->step1_post =
|
240
|
+
g_list_append (rule->step1_post,
|
241
|
+
(xmlNodeListGetString
|
242
|
+
(doc, step1_post->xmlChildrenNode, 1)));
|
243
|
+
step1_post = step1_post->next;
|
244
|
+
}
|
245
|
+
|
246
|
+
if (parse != NULL)
|
247
|
+
pbreak = parse->xmlChildrenNode;
|
248
|
+
while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
|
249
|
+
{
|
250
|
+
pbreak = pbreak->next;
|
251
|
+
}
|
252
|
+
|
253
|
+
|
254
|
+
|
255
|
+
if (parse != NULL)
|
256
|
+
pdbreak = parse->xmlChildrenNode;
|
257
|
+
while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
|
258
|
+
{
|
259
|
+
pdbreak = pdbreak->next;
|
260
|
+
}
|
261
|
+
|
262
|
+
|
263
|
+
/*Parser break*/
|
264
|
+
if (pbreak != NULL)
|
265
|
+
pbreak = pbreak->xmlChildrenNode;
|
266
|
+
while (pbreak != NULL)
|
267
|
+
{
|
268
|
+
if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
|
269
|
+
rule->ParserBreak =
|
270
|
+
g_list_append (rule->ParserBreak,
|
271
|
+
(xmlNodeListGetString
|
272
|
+
(doc, pbreak->xmlChildrenNode, 1)));
|
273
|
+
pbreak = pbreak->next;
|
274
|
+
}
|
275
|
+
|
276
|
+
/*Parser Don't break*/
|
277
|
+
if (pdbreak != NULL)
|
278
|
+
pdbreak = pdbreak->xmlChildrenNode;
|
279
|
+
while (pdbreak != NULL)
|
280
|
+
{
|
281
|
+
if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
|
282
|
+
rule->ParserDontBreak =
|
283
|
+
g_list_append (rule->ParserDontBreak,
|
284
|
+
(xmlNodeListGetString
|
285
|
+
(doc, pdbreak->xmlChildrenNode, 1)));
|
286
|
+
pdbreak = pdbreak->next;
|
287
|
+
}
|
288
|
+
|
289
|
+
/*Term Count load dict*/
|
290
|
+
|
291
|
+
if (tc_words != NULL)
|
292
|
+
tc_words = tc_words->xmlChildrenNode;
|
293
|
+
while (tc_words != NULL)
|
294
|
+
{
|
295
|
+
if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
|
296
|
+
{
|
297
|
+
xmlChar *key;
|
298
|
+
key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
|
299
|
+
Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
|
300
|
+
xmlFree(key);
|
301
|
+
}
|
302
|
+
tc_words = tc_words->next;
|
303
|
+
}
|
304
|
+
|
305
|
+
|
306
|
+
/*Term Frequency load dict*/
|
307
|
+
|
308
|
+
if (tf_words != NULL)
|
309
|
+
tf_words = tf_words->xmlChildrenNode;
|
310
|
+
while (tf_words != NULL)
|
311
|
+
{
|
312
|
+
if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
|
313
|
+
{
|
314
|
+
xmlChar *key;
|
315
|
+
xmlChar *idf_key;
|
316
|
+
key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
|
317
|
+
|
318
|
+
idf_key=xmlGetProp(tf_words,"idf");
|
319
|
+
Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
|
320
|
+
xmlFree(key);
|
321
|
+
xmlFree(idf_key);
|
322
|
+
}
|
323
|
+
tf_words = tf_words->next;
|
324
|
+
}
|
325
|
+
|
326
|
+
|
327
|
+
xmlFreeDoc(doc);
|
328
|
+
xmlCleanupParser ();
|
329
|
+
g_free(local_dict_name);
|
330
|
+
return (TRUE);
|
331
|
+
}
|
@@ -0,0 +1,185 @@
|
|
1
|
+
/*
|
2
|
+
* grader-tc.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
#include "libots.h"
|
25
|
+
|
26
|
+
|
27
|
+
#include "grader-tc.h"
|
28
|
+
|
29
|
+
|
30
|
+
/*Grader - Term count algorithm*/
|
31
|
+
/* This is a non-normalized term frequency algorithm that does not use an inverse document frequency database. */
|
32
|
+
|
33
|
+
#define NUM_KEY_WORDS 100 /* use first n key words only */
|
34
|
+
|
35
|
+
int
|
36
|
+
ots_get_article_word_count (const OtsArticle * Doc)
|
37
|
+
{
|
38
|
+
GList *li;
|
39
|
+
int articleWC;
|
40
|
+
articleWC = 0;
|
41
|
+
|
42
|
+
if (Doc==NULL) return 0;
|
43
|
+
|
44
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
45
|
+
{
|
46
|
+
articleWC += ((OtsSentence *) li->data)->wc;
|
47
|
+
}
|
48
|
+
|
49
|
+
return articleWC;
|
50
|
+
}
|
51
|
+
|
52
|
+
|
53
|
+
/*take this line and add each word to the "wordStat" list
|
54
|
+
* this list will hold all of the words in the article and the number
|
55
|
+
* of times they appeared in the article.
|
56
|
+
*/
|
57
|
+
|
58
|
+
static void
|
59
|
+
ots_line_add_wordlist(OtsArticle * Doc,const OtsSentence * aLine)
|
60
|
+
{
|
61
|
+
GList *li;
|
62
|
+
if ((aLine==NULL) ||(NULL==Doc)) { return;}
|
63
|
+
|
64
|
+
for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
|
65
|
+
if (li->data && strlen (li->data)) ots_add_wordstat (Doc, (char *)li->data);
|
66
|
+
|
67
|
+
return;
|
68
|
+
}
|
69
|
+
|
70
|
+
static void
|
71
|
+
ots_create_wordlist(OtsArticle * Doc)
|
72
|
+
{
|
73
|
+
GList *line;
|
74
|
+
if (Doc==NULL) return;
|
75
|
+
|
76
|
+
for (line = (GList *) Doc->lines; line != NULL; line = line->next)
|
77
|
+
{
|
78
|
+
OtsSentence * aLine=line->data;
|
79
|
+
if (aLine)
|
80
|
+
ots_line_add_wordlist(Doc,aLine);
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
/*
 * Weight applied to a keyword according to its 1-based rank `n`:
 * rank 1 weighs 3, ranks 2 through 4 weigh 2, everything else weighs 1.
 */
static int
keyVal (const int n)
{
  switch (n)
    {
    case 1:
      return 3;
    case 2:
    case 3:
    case 4:
      return 2;
    default:
      return 1;
    }
}
|
96
|
+
|
97
|
+
|
98
|
+
static void
|
99
|
+
ots_grade_line (GList *impList, OtsSentence * aLine,
|
100
|
+
OtsStemRule * rule)
|
101
|
+
{
|
102
|
+
GList *li;
|
103
|
+
GList *di;
|
104
|
+
int n;
|
105
|
+
char *tmp_stem;
|
106
|
+
|
107
|
+
if ((aLine==NULL)||(rule==NULL)||(impList==NULL)) return;
|
108
|
+
|
109
|
+
for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word */
|
110
|
+
{
|
111
|
+
n = 0;
|
112
|
+
tmp_stem = ots_stem_strip ((unsigned char *) li->data, rule);
|
113
|
+
|
114
|
+
for (di = (GList *) impList;
|
115
|
+
((di != NULL) && (n < NUM_KEY_WORDS)); di = di->next)
|
116
|
+
{
|
117
|
+
n++;
|
118
|
+
if ((NULL!=((OtsWordEntery *) di->data)->stem) && (NULL!=tmp_stem))
|
119
|
+
if (0 == strcmp ((((OtsWordEntery *) di->data)->stem), tmp_stem))
|
120
|
+
{
|
121
|
+
/* debug:
|
122
|
+
if (0!=strcmp((((OtsWordEntery *) di->data)->word),li->data))
|
123
|
+
printf("[%s][%s] stem[%s]\n",(((OtsWordEntery *) di->data)->word),li->data,tmp);*/
|
124
|
+
|
125
|
+
aLine->score += (((OtsWordEntery *) di->data)->occ) * keyVal (n);
|
126
|
+
}
|
127
|
+
|
128
|
+
}
|
129
|
+
|
130
|
+
g_free (tmp_stem);
|
131
|
+
}
|
132
|
+
|
133
|
+
}
|
134
|
+
|
135
|
+
|
136
|
+
void
|
137
|
+
ots_create_title_tc(OtsArticle * Doc)
|
138
|
+
{
|
139
|
+
|
140
|
+
char *tmp;
|
141
|
+
char *word;
|
142
|
+
int i;
|
143
|
+
GString *title;
|
144
|
+
if (NULL==Doc) return;
|
145
|
+
|
146
|
+
title=g_string_new(NULL);
|
147
|
+
|
148
|
+
for (i=0;i<5;i++)
|
149
|
+
{
|
150
|
+
word = ots_word_in_list(Doc->ImpWords,i);
|
151
|
+
if (word) g_string_append(title,word); else break;
|
152
|
+
if (i<4) g_string_append(title,",");
|
153
|
+
}
|
154
|
+
|
155
|
+
tmp=title->str;
|
156
|
+
if (NULL!=title) g_string_free(title,FALSE);
|
157
|
+
Doc->title=tmp;
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
void
|
162
|
+
ots_grade_doc_tc (OtsArticle * Doc)
|
163
|
+
{
|
164
|
+
|
165
|
+
GList *li;
|
166
|
+
if (NULL==Doc) return;
|
167
|
+
ots_create_wordlist(Doc);
|
168
|
+
|
169
|
+
|
170
|
+
Doc->ImpWords=ots_union_list (Doc->wordStat, Doc->dict); /* subtract from the Article wordlist all the words in the dic file (on , the , is...) */
|
171
|
+
Doc->ImpWords=ots_sort_list (Doc->ImpWords); /* sort the list , top 3 is what the article talks about (SARS , virus , cure ... ) */
|
172
|
+
|
173
|
+
/*to print wordlist: ots_print_wordlist (stdout, Doc->ImpWords);*/
|
174
|
+
|
175
|
+
if (0 == Doc->lineCount) return;
|
176
|
+
|
177
|
+
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
178
|
+
{
|
179
|
+
if (li->data)
|
180
|
+
ots_grade_line (Doc->ImpWords, (OtsSentence *) li->data, Doc->stem);
|
181
|
+
}
|
182
|
+
|
183
|
+
|
184
|
+
ots_create_title_tc(Doc);
|
185
|
+
}
|