summarize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,64 @@
1
+ /*
2
+ * grader-tc.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_GRADERTC_H
22
+ #define HAVE_GRADERTC_H
23
+
24
+
25
+ #include <glib.h>
26
+ #include "libots.h"
27
+
28
+ G_BEGIN_DECLS
29
+
30
+
31
+ typedef struct
32
+ {
33
+ gchar *word; /* the word itself */
34
+ gchar *stem; /* stem of the word */
35
+ gint occ; /* number of times this word has been seen in the text */
36
+ } OtsWordEntery;
37
+
38
+ /*Word list manipulations*/
39
+ void ots_free_wordlist (GList *aList);
40
+
41
+
42
+
43
+ OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj);
44
+ OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString);
45
+ OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule);
46
+ void ots_free_wordEntery (OtsWordEntery * WC);
47
+
48
+ GList *ots_sort_list (GList* aList);
49
+ GList *ots_union_list (const GList *aLst, const GList * bLst);
50
+
51
+ char *ots_word_in_list (const GList *aList,const int index);
52
+ char *ots_stem_in_list (const GList *aList,const int index);
53
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
54
+
55
+
56
+ /*grader*/
57
+
58
+ void ots_grade_doc_tc (OtsArticle * Doc);
59
+
60
+ G_END_DECLS
61
+
62
+
63
+
64
+ #endif /* HAVE_GRADERTC_H */
@@ -0,0 +1,116 @@
1
+ /*
2
+ * grader-tf.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*Grader - using the Term frequency algorithm. Will give each line a score*/
27
+
28
+
29
+
30
+ OtsWordTF*
31
+ ots_new_OtsWordTF(const char* word,const double tf)
32
+ {
33
+ OtsWordTF* obj=g_new0(OtsWordTF,1);
34
+ if (word!=NULL) obj->word=g_strdup(word);
35
+ obj->tf=tf;
36
+ return obj;
37
+ }
38
+
39
+ void
40
+ ots_free_OtsWordTF(OtsWordTF *obj)
41
+ {
42
+ if (obj!=NULL)
43
+ {
44
+ if (obj->word!=NULL) g_free(obj->word);
45
+ g_free(obj);
46
+ }
47
+ }
48
+
49
+ void
50
+ ots_free_TF_wordlist (GList * aList)
51
+ {
52
+ if (aList != NULL)
53
+ {
54
+ g_list_foreach(aList,(GFunc)ots_free_OtsWordTF, NULL);
55
+ g_list_free(aList);
56
+ }
57
+ }
58
+
59
+
60
+ void
61
+ ots_grade_line_tf (OtsSentence * aLine)
62
+ {
63
+
64
+ return;
65
+ }
66
+
67
+
68
+
69
+ void
70
+ ots_grade_doc_tf (OtsArticle * Doc)
71
+ {
72
+
73
+ GList *li;
74
+
75
+ /*Load tf list*/
76
+ /*Load idf list*/
77
+
78
+ if (0 == Doc->lineCount) return;
79
+
80
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
81
+ {
82
+ ots_grade_line_tf ((OtsSentence *) li->data /* , tf list , idf list*/);
83
+ }
84
+
85
+ return;
86
+ }
87
+
88
+
89
/*
 * Combine the two weights of a word into one score.
 *   tf  - how often the word occurs in the document
 *   idf - how rare the word is across the collection
 * Returns the product tf * idf.
 */
double
ots_tf_word_score (const double tf, const double idf)
{
  const double score = tf * idf;

  return score;
}
97
+
98
+ /*
99
+ Determine frequency of query words
100
+ n = (num-of-sentences words appears in)
101
+ N = (total-number-of-sentences)
102
+ f = n/N
103
+ */
104
+
105
+ double
106
+ ots_calc_idf (const int term_count,const int doc_word_count)
107
+ {
108
+ return -log(doc_word_count/term_count);
109
+ }
110
+
111
/*
 * Term-frequency weight of a term.
 *
 *   term_count     - number of occurrences of the term
 *   doc_word_count - total number of words considered
 *
 * Returns 0 when term_count is 0, otherwise
 * 0.5 + 0.5 * (doc_word_count / term_count), with the ratio evaluated in
 * floating point (the integer division used previously discarded the
 * fractional part).
 */
double
ots_calc_tf (const int term_count, const int doc_word_count)
{
  if (term_count == 0)
    return 0.0;

  return 0.5 + 0.5 * ((double) doc_word_count / (double) term_count);
}
@@ -0,0 +1,85 @@
1
+ /*
2
+ * grader.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ extern void ots_grade_doc_tc (OtsArticle * Doc);
27
+
28
+ /*Grader driver - will call one of the grading algorithm*/
29
+
30
+
31
+
32
+ void
33
+ ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/
34
+ {
35
+ GList *li;
36
+ GList *first;
37
+ GList *second;
38
+ OtsSentence *first_line=NULL;
39
+
40
+ first = NULL;
41
+ second = NULL;
42
+
43
+ if (Doc==NULL) return;
44
+
45
+ if (Doc->lines!=NULL)
46
+ first_line= ((OtsSentence *) (Doc->lines->data));
47
+ if (NULL!=first_line) first_line->score *= 2; /*first line/title is very important so we increase its score */
48
+
49
+ /*This loop will *1.6 the score of each line that
50
+ starts with \n \n , in other words a new paragraph*/
51
+
52
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
53
+ {
54
+ OtsSentence *aLine = (li->data);
55
+ if (NULL != aLine) /*line is there */
56
+ {
57
+ first = aLine->words; /*first word? */
58
+ if (NULL != first)
59
+ second = first->next; /*second word? */
60
+ if ((NULL != first) && (NULL != second)) /*have content? */
61
+ if (strcmp (first->data, "\n") && strcmp (second->data, "\n")) /*new paragraph? */
62
+ aLine->score *= 1.6;
63
+ }
64
+
65
+ }
66
+
67
+ }
68
+
69
+ /**
70
+ Each grader needs to do:
71
+ 1.give a ->score to each line
72
+ 2.Set the ->title of the document
73
+ **/
74
+
75
+ void
76
+ ots_grade_doc (OtsArticle * Doc)
77
+ {
78
+
79
+ if (Doc==NULL) return;
80
+ ots_grade_doc_tc(Doc); /*Term count*/
81
+
82
+ /* or ots_grade_doc_fc (Doc); Term Frequency */
83
+
84
+ ots_grade_structure (Doc);
85
+ }
@@ -0,0 +1,128 @@
1
+ /*
2
+ * highlighter
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ /*After the grader has graded the article and each
27
+ sentence has a score the highlighter will select
28
+ some of the sentences*/
29
+
30
+ static int
31
+ ots_highlight_max_line (OtsArticle * Doc)
32
+ {
33
+ GList *li;
34
+ int max = 0;
35
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
36
+ {
37
+ if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
38
+ max = MAX (((OtsSentence *) li->data)->score, max);
39
+
40
+ }
41
+
42
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
43
+ {
44
+
45
+ if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */
46
+ {
47
+ ((OtsSentence *) li->data)->selected = 1;
48
+ return ((OtsSentence *) li->data)->wc;
49
+ }
50
+ }
51
+
52
+ return 0;
53
+ }
54
+
55
+
56
+ /* todo: implement this
57
+
58
+ void
59
+ ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount)
60
+
61
+ void
62
+ ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount)
63
+
64
+
65
+
66
+ void
67
+ ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by average of near sentences , will mark blocks
68
+ */
69
+
70
+ void
71
+ ots_highlight_doc (OtsArticle * Doc, int percent)
72
+ {
73
+ int i;
74
+ double ratio;
75
+ int wordCount;
76
+
77
+ if (0 == Doc->lineCount)
78
+ return;
79
+
80
+ if (percent > 100)
81
+ percent = 100;
82
+ else if (percent < 0)
83
+ percent = 0;
84
+
85
+ ratio = ((double) (percent)) / (100.0);
86
+
87
+ wordCount = ots_get_article_word_count (Doc);
88
+
89
+ for (i = 0; i < (ratio * (double) wordCount);)
90
+ {
91
+ i += ots_highlight_max_line (Doc);
92
+ }
93
+ }
94
+
95
+ void
96
+ ots_highlight_doc_lines (OtsArticle * Doc, int lines)
97
+ {
98
+ int i;
99
+ int lineCount;
100
+ int tmp;
101
+
102
+ if (0 == Doc->lineCount) return;
103
+
104
+ lineCount = Doc->lineCount;
105
+ i=0;
106
+ while ((i<lines)&&(i<lineCount))
107
+ {
108
+ i++;
109
+ tmp=ots_highlight_max_line (Doc);
110
+ }
111
+
112
+ }
113
+
114
+ void ots_highlight_doc_words (OtsArticle * Doc, int words)
115
+ {
116
+ int i;
117
+ int docWordCount;
118
+
119
+ if (0 == Doc->lineCount) return;
120
+
121
+ docWordCount = ots_get_article_word_count (Doc);
122
+
123
+ i=0;
124
+ while ((i < docWordCount) && (i <= words))
125
+ {
126
+ i += ots_highlight_max_line (Doc);
127
+ }
128
+ }
@@ -0,0 +1,131 @@
1
+ /*
2
+ * html.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "libots.h"
25
+
26
+ static unsigned char *
27
+ ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size)
28
+ {
29
+ GList *li;
30
+ GString *text;
31
+ unsigned char *utf8_data;
32
+ char *score_str;
33
+ text = g_string_new (NULL);
34
+
35
+ score_str=g_new0(char,32);
36
+ sprintf(score_str,"<!--(%ld)-->",aLine->score);
37
+ g_string_append (text,score_str);
38
+ g_free(score_str);
39
+
40
+ if ((aLine->selected))
41
+ {
42
+ g_string_append (text,
43
+ "<FONT COLOR=\"#16569E\"><span style=\'background:yellow;\'>");
44
+ }
45
+ else
46
+ {
47
+ g_string_append (text, "<FONT COLOR=\"#16569E\"><span>");
48
+ }
49
+
50
+ for (li = (GList *) aLine->words; li != NULL; li = li->next)
51
+ {
52
+ if (0 == strcmp ((char *) li->data, "\n"))
53
+ g_string_append (text, "<br>");
54
+ else
55
+ g_string_append (text, (char *) li->data);
56
+ }
57
+ g_string_append (text,"</span></FONT>\n");
58
+
59
+ if (out_size)
60
+ *out_size = text->len;
61
+
62
+ utf8_data = text->str;
63
+ g_string_free (text, FALSE);
64
+
65
+ return utf8_data;
66
+ }
67
+
68
+
69
#if 0
/*
 * Disabled helper: write the HTML fragment of a single sentence to
 * `stream`.
 */
static void
ots_print_line_HTML (FILE * stream, const OtsSentence * aLine)
{
  size_t html_len;
  unsigned char *html = ots_get_line_HTML (aLine, &html_len);

  fwrite (html, 1, html_len, stream);
  g_free (html);
}
#endif
81
+
82
+
83
+ unsigned char *
84
+ ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len)
85
+ {
86
+ GList *li;
87
+ GString *text;
88
+ unsigned char *utf8_data;
89
+ size_t line_len;
90
+
91
+ text = g_string_new (NULL);
92
+
93
+
94
+ g_string_append (text,
95
+ "<html>\n<head>\n<title>OTS</title>\n<meta charset=\"utf-8\">\n</head>\n<body>\n");
96
+ g_string_append (text, "<!-- Generated by OpenTextSummarizer -->\n");
97
+ g_string_append (text, "<!--");
98
+ g_string_append (text, Doc->title);
99
+ g_string_append (text, "-->\n");
100
+
101
+
102
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
103
+ {
104
+ utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len);
105
+ g_string_append_len (text, utf8_data, line_len);
106
+ g_free (utf8_data);
107
+ }
108
+ g_string_append (text, "</body></html>\n");
109
+
110
+ if (out_len)
111
+ *out_len = text->len;
112
+ utf8_data = text->str;
113
+
114
+ g_string_free (text, FALSE);
115
+ return utf8_data;
116
+
117
+ }
118
+
119
+
120
+
121
+ void
122
+ ots_print_HTML (FILE * stream, const OtsArticle * Doc)
123
+ {
124
+ unsigned char *utf8_txt;
125
+ size_t len;
126
+
127
+ utf8_txt = ots_get_doc_HTML (Doc, &len);
128
+ fwrite (utf8_txt, 1, len, stream);
129
+ g_free (utf8_txt);
130
+
131
+ }