summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,331 @@
1
+ /*
2
+ * dictionary.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ #include <libxml/xmlmemory.h>
29
+ #include <libxml/parser.h>
30
+
31
+
32
+ /* loads the xml dictionary file to memory*/
33
+
34
+ gboolean
35
+ ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
36
+ {
37
+
38
+ xmlDocPtr doc=NULL;
39
+ xmlNodePtr head=NULL;
40
+ xmlNodePtr stem=NULL;
41
+ xmlNodePtr pre=NULL;
42
+ xmlNodePtr post=NULL;
43
+ xmlNodePtr syno=NULL; /* synonyms */
44
+ xmlNodePtr manual=NULL; /* manual */
45
+ xmlNodePtr step1_pre=NULL; /* step1 */
46
+ xmlNodePtr step1_post=NULL; /* step1 */
47
+
48
+ xmlNodePtr parse=NULL; /* parser rules */
49
+ xmlNodePtr pbreak=NULL;
50
+ xmlNodePtr pdbreak=NULL;
51
+
52
+ xmlNodePtr tc_words=NULL; /* term count dictionary */
53
+ xmlNodePtr tf_words=NULL; /* term frequency dictionary */
54
+
55
+
56
+ OtsStemRule * rule=Doc->stem;
57
+
58
+ char *local_dict_name;
59
+
60
+ local_dict_name = g_strdup_printf ("%s.xml", name);
61
+
62
+
63
+ if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
64
+ doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
65
+ if (doc == NULL) return (FALSE);
66
+
67
+ head = xmlDocGetRootElement (doc);
68
+ if (head == NULL)
69
+ {
70
+ fprintf (stderr, "empty document\n");
71
+ xmlFreeDoc (doc);
72
+ return (FALSE);
73
+ }
74
+
75
+ if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
76
+ {
77
+ fprintf (stderr, "%s", head->name);
78
+ xmlFreeDoc (doc);
79
+ return (FALSE);
80
+ }
81
+
82
+ if (head != NULL)
83
+ stem = head->xmlChildrenNode;
84
+ while ((stem != NULL)
85
+ && (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
86
+ {
87
+ stem = stem->next;
88
+ }
89
+
90
+ if (head != NULL)
91
+ parse = head->xmlChildrenNode;
92
+ while ((parse != NULL)
93
+ && (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
94
+ {
95
+ parse = parse->next;
96
+ }
97
+
98
+ if (head != NULL)
99
+ tc_words = head->xmlChildrenNode;
100
+ while ((tc_words != NULL)
101
+ && (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
102
+ {
103
+ tc_words = tc_words->next;
104
+ }
105
+
106
+
107
+ if (head != NULL)
108
+ tf_words = head->xmlChildrenNode;
109
+ while ((tf_words != NULL)
110
+ && (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
111
+ {
112
+ tf_words = tf_words->next;
113
+ }
114
+
115
+
116
+
117
+ if (stem != NULL)
118
+ pre = stem->xmlChildrenNode;
119
+ while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
120
+ {
121
+ pre = pre->next;
122
+ }
123
+
124
+ if (stem != NULL)
125
+ post = stem->xmlChildrenNode;
126
+ while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
127
+ {
128
+ post = post->next;
129
+ }
130
+
131
+
132
+ if (stem != NULL)
133
+ syno = stem->xmlChildrenNode;
134
+ while ((syno != NULL)
135
+ && (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
136
+ {
137
+ syno = syno->next;
138
+ }
139
+
140
+ if (stem != NULL)
141
+ manual = stem->xmlChildrenNode;
142
+ while ((manual != NULL)
143
+ && (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
144
+ {
145
+ manual = manual->next;
146
+ }
147
+
148
+
149
+ if (stem != NULL)
150
+ step1_pre = stem->xmlChildrenNode;
151
+ while ((step1_pre != NULL)
152
+ && (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
153
+ {
154
+ step1_pre = step1_pre->next;
155
+ }
156
+
157
+
158
+
159
+ if (stem != NULL)
160
+ step1_post = stem->xmlChildrenNode;
161
+ while ((step1_post != NULL)
162
+ && (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
163
+ {
164
+ step1_post = step1_post->next;
165
+ }
166
+
167
+
168
+ if (pre != NULL)
169
+ pre = pre->xmlChildrenNode; /*point to first word */
170
+ while (pre != NULL)
171
+ {
172
+ if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
173
+ rule->RemovePre =
174
+ g_list_append (rule->RemovePre,
175
+ (xmlNodeListGetString
176
+ (doc, pre->xmlChildrenNode, 1)));
177
+ pre = pre->next;
178
+ }
179
+
180
+
181
+ if (post != NULL)
182
+ post = post->xmlChildrenNode;
183
+ while (post != NULL)
184
+ {
185
+ if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
186
+ rule->RemovePost =
187
+ g_list_append (rule->RemovePost,
188
+ (xmlNodeListGetString
189
+ (doc, post->xmlChildrenNode, 1)));
190
+ post = post->next;
191
+ }
192
+
193
+ if (syno != NULL)
194
+ syno = syno->xmlChildrenNode;
195
+ while (syno != NULL)
196
+ {
197
+ if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
198
+ rule->synonyms =
199
+ g_list_append (rule->synonyms,
200
+ (xmlNodeListGetString
201
+ (doc, syno->xmlChildrenNode, 1)));
202
+ syno = syno->next;
203
+ }
204
+
205
+ if (manual != NULL)
206
+ manual = manual->xmlChildrenNode;
207
+ while (manual != NULL)
208
+ {
209
+ if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
210
+ rule->manual =
211
+ g_list_append (rule->manual,
212
+ (xmlNodeListGetString
213
+ (doc, manual->xmlChildrenNode, 1)));
214
+ manual = manual->next;
215
+ }
216
+
217
+
218
+
219
+
220
+ if (step1_pre != NULL)
221
+ step1_pre = step1_pre->xmlChildrenNode;
222
+ while (step1_pre != NULL)
223
+ {
224
+ if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
225
+ rule->step1_pre =
226
+ g_list_append (rule->step1_pre,
227
+ (xmlNodeListGetString
228
+ (doc, step1_pre->xmlChildrenNode, 1)));
229
+ step1_pre = step1_pre->next;
230
+ }
231
+
232
+
233
+
234
+ if (step1_post != NULL)
235
+ step1_post = step1_post->xmlChildrenNode;
236
+ while (step1_post != NULL)
237
+ {
238
+ if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
239
+ rule->step1_post =
240
+ g_list_append (rule->step1_post,
241
+ (xmlNodeListGetString
242
+ (doc, step1_post->xmlChildrenNode, 1)));
243
+ step1_post = step1_post->next;
244
+ }
245
+
246
+ if (parse != NULL)
247
+ pbreak = parse->xmlChildrenNode;
248
+ while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
249
+ {
250
+ pbreak = pbreak->next;
251
+ }
252
+
253
+
254
+
255
+ if (parse != NULL)
256
+ pdbreak = parse->xmlChildrenNode;
257
+ while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
258
+ {
259
+ pdbreak = pdbreak->next;
260
+ }
261
+
262
+
263
+ /*Parser break*/
264
+ if (pbreak != NULL)
265
+ pbreak = pbreak->xmlChildrenNode;
266
+ while (pbreak != NULL)
267
+ {
268
+ if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
269
+ rule->ParserBreak =
270
+ g_list_append (rule->ParserBreak,
271
+ (xmlNodeListGetString
272
+ (doc, pbreak->xmlChildrenNode, 1)));
273
+ pbreak = pbreak->next;
274
+ }
275
+
276
+ /*Parser Don't break*/
277
+ if (pdbreak != NULL)
278
+ pdbreak = pdbreak->xmlChildrenNode;
279
+ while (pdbreak != NULL)
280
+ {
281
+ if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
282
+ rule->ParserDontBreak =
283
+ g_list_append (rule->ParserDontBreak,
284
+ (xmlNodeListGetString
285
+ (doc, pdbreak->xmlChildrenNode, 1)));
286
+ pdbreak = pdbreak->next;
287
+ }
288
+
289
+ /*Term Count load dict*/
290
+
291
+ if (tc_words != NULL)
292
+ tc_words = tc_words->xmlChildrenNode;
293
+ while (tc_words != NULL)
294
+ {
295
+ if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
296
+ {
297
+ xmlChar *key;
298
+ key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
299
+ Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
300
+ xmlFree(key);
301
+ }
302
+ tc_words = tc_words->next;
303
+ }
304
+
305
+
306
+ /*Term Frequency load dict*/
307
+
308
+ if (tf_words != NULL)
309
+ tf_words = tf_words->xmlChildrenNode;
310
+ while (tf_words != NULL)
311
+ {
312
+ if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
313
+ {
314
+ xmlChar *key;
315
+ xmlChar *idf_key;
316
+ key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
317
+
318
+ idf_key=xmlGetProp(tf_words,"idf");
319
+ Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
320
+ xmlFree(key);
321
+ xmlFree(idf_key);
322
+ }
323
+ tf_words = tf_words->next;
324
+ }
325
+
326
+
327
+ xmlFreeDoc(doc);
328
+ xmlCleanupParser ();
329
+ g_free(local_dict_name);
330
+ return (TRUE);
331
+ }
@@ -0,0 +1,6 @@
1
require 'mkmf'

# Build flags: whatever the environment supplies, followed by the flags
# pkg-config reports for the two libraries the extension is compiled
# against (glib-2.0 and libxml-2.0).
pkg_cflags = `pkg-config --cflags glib-2.0 libxml-2.0`.chomp
pkg_libs   = `pkg-config --libs glib-2.0 libxml-2.0`.chomp

$CFLAGS  = "#{ENV["CFLAGS"]} #{pkg_cflags}"
$LDFLAGS = "#{ENV["LDFLAGS"]} #{pkg_libs}"

create_makefile('summarize/summarize')
@@ -0,0 +1,185 @@
1
+ /*
2
+ * grader-tc.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+
27
+ #include "grader-tc.h"
28
+
29
+
30
+ /*Grader - Term count algorithm*/
31
+ /* This is a non-normalized term-frequency algorithm that does not use an inverse document frequency database. */
32
+
33
+ #define NUM_KEY_WORDS 100 /* use first n key words only */
34
+
35
+ int
36
+ ots_get_article_word_count (const OtsArticle * Doc)
37
+ {
38
+ GList *li;
39
+ int articleWC;
40
+ articleWC = 0;
41
+
42
+ if (Doc==NULL) return 0;
43
+
44
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
45
+ {
46
+ articleWC += ((OtsSentence *) li->data)->wc;
47
+ }
48
+
49
+ return articleWC;
50
+ }
51
+
52
+
53
+ /*take this line and add each word to the "wordStat" list
54
+ * this list will hold all of the words in the article and the number
55
+ * of times they appeared in the article.
56
+ */
57
+
58
+ static void
59
+ ots_line_add_wordlist(OtsArticle * Doc,const OtsSentence * aLine)
60
+ {
61
+ GList *li;
62
+ if ((aLine==NULL) ||(NULL==Doc)) { return;}
63
+
64
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
65
+ if (li->data && strlen (li->data)) ots_add_wordstat (Doc, (char *)li->data);
66
+
67
+ return;
68
+ }
69
+
70
+ static void
71
+ ots_create_wordlist(OtsArticle * Doc)
72
+ {
73
+ GList *line;
74
+ if (Doc==NULL) return;
75
+
76
+ for (line = (GList *) Doc->lines; line != NULL; line = line->next)
77
+ {
78
+ OtsSentence * aLine=line->data;
79
+ if (aLine)
80
+ ots_line_add_wordlist(Doc,aLine);
81
+ }
82
+ }
83
+
84
+
85
+
86
+
87
/*
 * Weight applied to a key word according to its 1-based rank `n`:
 * rank 1 weighs 3, ranks 2 through 4 weigh 2, every other value weighs 1.
 */
static int
keyVal (const int n)
{
  switch (n)
    {
    case 1:
      return 3;
    case 2:
    case 3:
    case 4:
      return 2;
    default:
      return 1;
    }
}
96
+
97
+
98
+ static void
99
+ ots_grade_line (GList *impList, OtsSentence * aLine,
100
+ OtsStemRule * rule)
101
+ {
102
+ GList *li;
103
+ GList *di;
104
+ int n;
105
+ char *tmp_stem;
106
+
107
+ if ((aLine==NULL)||(rule==NULL)||(impList==NULL)) return;
108
+
109
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word */
110
+ {
111
+ n = 0;
112
+ tmp_stem = ots_stem_strip ((unsigned char *) li->data, rule);
113
+
114
+ for (di = (GList *) impList;
115
+ ((di != NULL) && (n < NUM_KEY_WORDS)); di = di->next)
116
+ {
117
+ n++;
118
+ if ((NULL!=((OtsWordEntery *) di->data)->stem) && (NULL!=tmp_stem))
119
+ if (0 == strcmp ((((OtsWordEntery *) di->data)->stem), tmp_stem))
120
+ {
121
+ /* debug:
122
+ if (0!=strcmp((((OtsWordEntery *) di->data)->word),li->data))
123
+ printf("[%s][%s] stem[%s]\n",(((OtsWordEntery *) di->data)->word),li->data,tmp);*/
124
+
125
+ aLine->score += (((OtsWordEntery *) di->data)->occ) * keyVal (n);
126
+ }
127
+
128
+ }
129
+
130
+ g_free (tmp_stem);
131
+ }
132
+
133
+ }
134
+
135
+
136
+ void
137
+ ots_create_title_tc(OtsArticle * Doc)
138
+ {
139
+
140
+ char *tmp;
141
+ char *word;
142
+ int i;
143
+ GString *title;
144
+ if (NULL==Doc) return;
145
+
146
+ title=g_string_new(NULL);
147
+
148
+ for (i=0;i<5;i++)
149
+ {
150
+ word = ots_word_in_list(Doc->ImpWords,i);
151
+ if (word) g_string_append(title,word); else break;
152
+ if (i<4) g_string_append(title,",");
153
+ }
154
+
155
+ tmp=title->str;
156
+ if (NULL!=title) g_string_free(title,FALSE);
157
+ Doc->title=tmp;
158
+ }
159
+
160
+
161
+ void
162
+ ots_grade_doc_tc (OtsArticle * Doc)
163
+ {
164
+
165
+ GList *li;
166
+ if (NULL==Doc) return;
167
+ ots_create_wordlist(Doc);
168
+
169
+
170
+ Doc->ImpWords=ots_union_list (Doc->wordStat, Doc->dict); /* subtract from the Article wordlist all the words in the dic file (on , the , is...) */
171
+ Doc->ImpWords=ots_sort_list (Doc->ImpWords); /* sort the list , top 3 is what the article talks about (SARS , virus , cure ... ) */
172
+
173
+ /*to print wordlist: ots_print_wordlist (stdout, Doc->ImpWords);*/
174
+
175
+ if (0 == Doc->lineCount) return;
176
+
177
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
178
+ {
179
+ if (li->data)
180
+ ots_grade_line (Doc->ImpWords, (OtsSentence *) li->data, Doc->stem);
181
+ }
182
+
183
+
184
+ ots_create_title_tc(Doc);
185
+ }