summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,64 @@
1
+ /*
2
+ * grader-tc.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_GRADERTC_H
22
+ #define HAVE_GRADERTC_H
23
+
24
+
25
+ #include <glib.h>
26
+ #include "libots.h"
27
+
28
+ G_BEGIN_DECLS
29
+
30
+
31
+ typedef struct
32
+ {
33
+ gchar *word; /* the word */
34
+ gchar *stem; /*stem of the word*/
35
+ gint occ; /* how many times have we seen this word in the text? */
36
+ } OtsWordEntery;
37
+
38
+ /*Word list manipulations*/
39
+ void ots_free_wordlist (GList *aList);
40
+
41
+
42
+
43
+ OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj);
44
+ OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString);
45
+ OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule);
46
+ void ots_free_wordEntery (OtsWordEntery * WC);
47
+
48
+ GList *ots_sort_list (GList* aList);
49
+ GList *ots_union_list (const GList *aLst, const GList * bLst);
50
+
51
+ char *ots_word_in_list (const GList *aList,const int index);
52
+ char *ots_stem_in_list (const GList *aList,const int index);
53
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
54
+
55
+
56
+ /*grader*/
57
+
58
+ void ots_grade_doc_tc (OtsArticle * Doc);
59
+
60
+ G_END_DECLS
61
+
62
+
63
+
64
+ #endif /* HAVE_GRADERTC_H */
@@ -0,0 +1,116 @@
1
+ /*
2
+ * grader-tf.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <math.h>
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*Grader - using the Term frequency algorithm. Will give each line a score*/
27
+
28
+
29
+
30
+ OtsWordTF*
31
+ ots_new_OtsWordTF(const char* word,const double tf)
32
+ {
33
+ OtsWordTF* obj=g_new0(OtsWordTF,1);
34
+ if (word!=NULL) obj->word=g_strdup(word);
35
+ obj->tf=tf;
36
+ return obj;
37
+ }
38
+
39
+ void
40
+ ots_free_OtsWordTF(OtsWordTF *obj)
41
+ {
42
+ if (obj!=NULL)
43
+ {
44
+ if (obj->word!=NULL) g_free(obj->word);
45
+ g_free(obj);
46
+ }
47
+ }
48
+
49
+ void
50
+ ots_free_TF_wordlist (GList * aList)
51
+ {
52
+ if (aList != NULL)
53
+ {
54
+ g_list_foreach(aList,(GFunc)ots_free_OtsWordTF, NULL);
55
+ g_list_free(aList);
56
+ }
57
+ }
58
+
59
+
60
+ void
61
+ ots_grade_line_tf (OtsSentence * aLine)
62
+ {
63
+
64
+ return;
65
+ }
66
+
67
+
68
+
69
+ void
70
+ ots_grade_doc_tf (OtsArticle * Doc)
71
+ {
72
+
73
+ GList *li;
74
+
75
+ /*Load tf list*/
76
+ /*Load idf list*/
77
+
78
+ if (0 == Doc->lineCount) return;
79
+
80
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
81
+ {
82
+ ots_grade_line_tf ((OtsSentence *) li->data /* , tf list , idf list*/);
83
+ }
84
+
85
+ return;
86
+ }
87
+
88
+
89
/*
 * Score of a word as the product of its two weights.
 *
 * tf:  how often the word occurs in the document
 * idf: how rare the word is across the collection
 *
 * Returns tf * idf.
 */
double
ots_tf_word_score (const double tf, const double idf)
{
  const double score = tf * idf;

  return score;
}
97
+
98
+ /*
99
+ Determine frequency of query words
100
+ n = (number of sentences the word appears in)
101
+ N = (total-number-of-sentences)
102
+ f = n/N
103
+ */
104
+
105
+ double
106
+ ots_calc_idf (const int term_count,const int doc_word_count)
107
+ {
108
+ return -log(doc_word_count/term_count);
109
+ }
110
+
111
/*
 * Compute 0.5 + 0.5 * (doc_word_count / term_count) using floating-point
 * division.
 *
 * term_count:     number of occurrences of the term
 * doc_word_count: number of words in the document
 *
 * Returns 0.0 when term_count is zero. The quotient is formed in double
 * precision so its fractional part is kept (the two int operands used to be
 * divided as integers, which truncated the ratio before it was scaled).
 */
double
ots_calc_tf (const int term_count, const int doc_word_count)
{
  if (term_count == 0)
    return 0.0;

  return 0.5 + 0.5 * ((double) doc_word_count / (double) term_count);
}
@@ -0,0 +1,85 @@
1
+ /*
2
+ * grader.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ extern void ots_grade_doc_tc (OtsArticle * Doc);
27
+
28
+ /*Grader driver - will call one of the grading algorithm*/
29
+
30
+
31
+
32
+ void
33
+ ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/
34
+ {
35
+ GList *li;
36
+ GList *first;
37
+ GList *second;
38
+ OtsSentence *first_line=NULL;
39
+
40
+ first = NULL;
41
+ second = NULL;
42
+
43
+ if (Doc==NULL) return;
44
+
45
+ if (Doc->lines!=NULL)
46
+ first_line= ((OtsSentence *) (Doc->lines->data));
47
+ if (NULL!=first_line) first_line->score *= 2; /*first line/title is very important so we increase its score */
48
+
49
+ /*This loop will *1.6 the score of each line that
50
+ starts with \n \n , in other words a new paragraph*/
51
+
52
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
53
+ {
54
+ OtsSentence *aLine = (li->data);
55
+ if (NULL != aLine) /*line is there */
56
+ {
57
+ first = aLine->words; /*first word? */
58
+ if (NULL != first)
59
+ second = first->next; /*second word? */
60
+ if ((NULL != first) && (NULL != second)) /*have content? */
61
+ if (strcmp (first->data, "\n") && strcmp (second->data, "\n")) /*new paragraph? */
62
+ aLine->score *= 1.6;
63
+ }
64
+
65
+ }
66
+
67
+ }
68
+
69
+ /**
70
+ Each grader needs to do:
71
+ 1.give a ->score to each line
72
+ 2.Set the ->title of the document
73
+ **/
74
+
75
+ void
76
+ ots_grade_doc (OtsArticle * Doc)
77
+ {
78
+
79
+ if (Doc==NULL) return;
80
+ ots_grade_doc_tc(Doc); /*Term count*/
81
+
82
+ /* or ots_grade_doc_fc (Doc); Term Frequency */
83
+
84
+ ots_grade_structure (Doc);
85
+ }
@@ -0,0 +1,128 @@
1
+ /*
2
+ * highlighter
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*After the grader has graded the article and each
27
+ sentence has a score the highlighter will select
28
+ some of the sentences*/
29
+
30
+ static int
31
+ ots_highlight_max_line (OtsArticle * Doc)
32
+ {
33
+ GList *li;
34
+ int max = 0;
35
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
36
+ {
37
+ if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
38
+ max = MAX (((OtsSentence *) li->data)->score, max);
39
+
40
+ }
41
+
42
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
43
+ {
44
+
45
+ if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
46
+ {
47
+ ((OtsSentence *) li->data)->selected = 1;
48
+ return ((OtsSentence *) li->data)->wc;
49
+ }
50
+ }
51
+
52
+ return 0;
53
+ }
54
+
55
+
56
+ /* todo: implement this
57
+
58
+ void
59
+ ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
60
+
61
+ void
62
+ ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
63
+
64
+
65
+
66
+ void
67
+ ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of nearby sentences, will mark blocks
68
+ */
69
+
70
+ void
71
+ ots_highlight_doc (OtsArticle * Doc, int percent)
72
+ {
73
+ int i;
74
+ double ratio;
75
+ int wordCount;
76
+
77
+ if (0 == Doc->lineCount)
78
+ return;
79
+
80
+ if (percent > 100)
81
+ percent = 100;
82
+ else if (percent < 0)
83
+ percent = 0;
84
+
85
+ ratio = ((double) (percent)) / (100.0);
86
+
87
+ wordCount = ots_get_article_word_count (Doc);
88
+
89
+ for (i = 0; i < (ratio * (double) wordCount);)
90
+ {
91
+ i += ots_highlight_max_line (Doc);
92
+ }
93
+ }
94
+
95
+ void
96
+ ots_highlight_doc_lines (OtsArticle * Doc, int lines)
97
+ {
98
+ int i;
99
+ int lineCount;
100
+ int tmp;
101
+
102
+ if (0 == Doc->lineCount) return;
103
+
104
+ lineCount = Doc->lineCount;
105
+ i=0;
106
+ while ((i<lines)&&(i<lineCount))
107
+ {
108
+ i++;
109
+ tmp=ots_highlight_max_line (Doc);
110
+ }
111
+
112
+ }
113
+
114
+ void ots_highlight_doc_words (OtsArticle * Doc, int words)
115
+ {
116
+ int i;
117
+ int docWordCount;
118
+
119
+ if (0 == Doc->lineCount) return;
120
+
121
+ docWordCount = ots_get_article_word_count (Doc);
122
+
123
+ i=0;
124
+ while ((i < docWordCount) && (i <= words))
125
+ {
126
+ i += ots_highlight_max_line (Doc);
127
+ }
128
+ }
@@ -0,0 +1,131 @@
1
+ /*
2
+ * html.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ static unsigned char *
27
+ ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
28
+ {
29
+ GList *li;
30
+ GString *text;
31
+ unsigned char *utf8_data;
32
+ char *score_str;
33
+ text = g_string_new (NULL);
34
+
35
+ score_str=g_new0(char,32);
36
+ sprintf(score_str,"<!--(%ld)-->",aLine->score);
37
+ g_string_append (text,score_str);
38
+ g_free(score_str);
39
+
40
+ if ((aLine->selected))
41
+ {
42
+ g_string_append (text,
43
+ "<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
44
+ }
45
+ else
46
+ {
47
+ g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
48
+ }
49
+
50
+ for (li = (GList *) aLine->words; li != NULL; li = li->next)
51
+ {
52
+ if (0 == strcmp ((char *) li->data, "\n"))
53
+ g_string_append (text, "<br>");
54
+ else
55
+ g_string_append (text, (char *) li->data);
56
+ }
57
+ g_string_append (text,"</span></FONT>\n");
58
+
59
+ if (out_size)
60
+ *out_size = text->len;
61
+
62
+ utf8_data = text->str;
63
+ g_string_free (text, FALSE);
64
+
65
+ return utf8_data;
66
+ }
67
+
68
+
69
#if 0
/*
 * Disabled helper: writes the HTML fragment of a single sentence to stream
 * and frees the temporary buffer.
 */
static void
ots_print_line_HTML (FILE *stream, const OtsSentence *aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
81
+
82
+
83
+ unsigned char *
84
+ ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
85
+ {
86
+ GList *li;
87
+ GString *text;
88
+ unsigned char *utf8_data;
89
+ size_t line_len;
90
+
91
+ text = g_string_new (NULL);
92
+
93
+
94
+ g_string_append (text,
95
+ "<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
96
+ g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
97
+ g_string_append (text, "<!--");
98
+ g_string_append (text, Doc->title);
99
+ g_string_append (text, "-->\n");
100
+
101
+
102
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
103
+ {
104
+ utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
105
+ g_string_append_len (text, utf8_data, line_len);
106
+ g_free (utf8_data);
107
+ }
108
+ g_string_append (text, "</body></html>\n");
109
+
110
+ if (out_len)
111
+ *out_len = text->len;
112
+ utf8_data = text->str;
113
+
114
+ g_string_free (text, FALSE);
115
+ return utf8_data;
116
+
117
+ }
118
+
119
+
120
+
121
+ void
122
+ ots_print_HTML (FILE * stream, const OtsArticle * Doc)
123
+ {
124
+ unsigned char *utf8_txt;
125
+ size_t len;
126
+
127
+ utf8_txt = ots_get_doc_HTML (Doc, &len);
128
+ fwrite (utf8_txt, 1, len, stream);
129
+ g_free (utf8_txt);
130
+
131
+ }