summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,158 @@
1
+ /*
2
+ * libots.h
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #ifndef HAVE_LIBOTS_H
22
+ #define HAVE_LIBOTS_H
23
+
24
+ #include <glib.h>
25
+
26
+ G_BEGIN_DECLS
27
+
28
+ typedef struct
28
+ { /* Term-frequency record: pairs one word with a numeric weight. */
29
+ char* word; /* the term text */
30
+ double tf; /* term-frequency weight. NOTE(review): ots_new_OtsWordTF() names its value argument `idf`, so this field presumably also carries IDF values -- confirm */
31
+ } OtsWordTF;
33
+
34
+
35
+ typedef struct
36
+ {
37
+ /* Stemming and sentence-splitting rules. Every member is a GList of char*. */
38
+ GList *RemovePre; /* (a|b) replace string a with b */
39
+ GList *RemovePost; /* NOTE(review): presumably the same (a|b) format, applied to word endings -- confirm */
40
+ GList *step1_pre;
41
+ GList *step1_post;
42
+
43
+ GList *synonyms;
44
+ GList *manual;
45
+
46
+ GList *ParserBreak; /* word endings that make the parser start a new sentence */
47
+ GList *ParserDontBreak; /* word endings that cancel a ParserBreak match (checked after ParserBreak) */
48
+
49
+
50
+ /* to be implemented */
51
+ GList *ReplaceChars;
52
+
53
+ } OtsStemRule;
54
+
55
+
56
+ typedef struct
57
+ { /* One sentence (line) of an article. */
58
+ GList *words; /* a GList of words (char*) */
59
+ glong score; /* score set by the grader */
60
+ gboolean selected; /* is this sentence selected? */
61
+ gint wc; /* word count */
62
+ void *user_data; /* pointer to the original sentence, or possibly a serial number */
63
+ } OtsSentence;
64
+
65
+
66
+ typedef struct
66
+ { /* A whole document: its sentences plus the state used by the graders. */
67
+ GList *lines; /* a GList of sentences (OtsSentence) */
68
+ gint lineCount; /* number of lines in the text */
69
+ char *title; /* title, auto-generated */
70
+
71
+ OtsStemRule *stem; /* stemming & parsing rules */
72
+
73
+ /* Term Frequency grader */
74
+ GList *tf_terms;
75
+ GList *idf_terms;
76
+
77
+
78
+ /* Term Count grader */
79
+ GList *dict; /* dictionary loaded from XML */
80
+ GList *wordStat; /* a word list of all words in the article and their occurrence counts */
81
+ GList *ImpWords; /* important words - for the term count grader */
82
+
83
+
84
+ } OtsArticle;
86
+
87
+
88
+ OtsArticle *ots_new_article (void);
89
+ void ots_free_article (OtsArticle *art);
90
+
91
+ /*parser*/
92
+ void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */
93
+ void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse unicode stream*/
94
+
95
+ OtsSentence *ots_append_line (OtsArticle * Doc);
96
+ void ots_append_word (OtsSentence * aLine,unsigned const char *aWord);
97
+ void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString);
98
+
99
+
100
+ /*dictionary*/
101
+ gboolean ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name);
102
+
103
+ int ots_get_article_word_count (const OtsArticle * Doc);
104
+
105
+
106
+ /*grader*/
107
+ void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/
108
+ void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/
109
+ void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/
110
+
111
+ void ots_grade_doc (OtsArticle * Doc);
112
+
113
+ void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/
114
+ OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf);
115
+
116
+
117
+ /*HTML output*/
118
+ void ots_print_HTML (FILE * stream, const OtsArticle * Doc);
119
+ unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len);
120
+
121
+ /*TEXT output*/
122
+ void ots_print_doc (FILE * stream, const OtsArticle * Doc);
123
+ unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len);
124
+
125
+
126
+ /*Plugin writing*/
127
+ unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size);
128
+ gboolean ots_is_line_selected(const OtsSentence *aLine);
129
+
130
+ /*Stemming support*/
131
+ OtsStemRule *new_stem_rule(void);
132
+ void free_stem_rule (OtsStemRule *rule);
133
+ unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/
134
+ unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /*Remove leading spaces, commas, colons, etc. */
135
+
136
+ /*Relations between texts*/
137
+
138
+ /*Returns the number of topics that two blocks of text share*/
139
+ int ots_text_relations(
140
+ const unsigned char *text1,const unsigned char *lang_code1,
141
+ const unsigned char *text2,const unsigned char *lang_code2,const int topic_num);
142
+
143
+ /*For a given text, return the list of the topics*/
144
+ char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num);
145
+
146
+
147
+ /*For a given text, return the list of the stemmed topics*/
148
+ GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num);
149
+
150
+
151
+ /*Gives a score on the relations between two lists of topics; similar to the inner product*/
152
+ int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2);
153
+
154
+ G_END_DECLS
155
+
156
+
157
+
158
+ #endif /* HAVE_LIBOTS_H */
@@ -0,0 +1,173 @@
1
+ /*
2
+ * parser.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include <strings.h>
25
+ #include "libots.h"
26
+
27
+ #define BUFFER_SIZE (1024*8)
28
+
29
/*
 * ots_match_post: report whether `aWord` ends with the suffix `post`.
 *
 * Returns 1 when the last strlen(post) bytes of aWord are equal to post
 * (so an empty suffix matches every word) and 0 otherwise.
 * A NULL argument never matches.
 */
int
ots_match_post (const char *aWord, const char *post)
{
  size_t wlen, plen;

  if (aWord == NULL || post == NULL)
    return 0;

  /* size_t keeps the full strlen() result; an int could truncate it */
  wlen = strlen (aWord);
  plen = strlen (post);

  if (plen > wlen)
    return 0;			/* suffix longer than the word: no match */

  /* compare the tail of the word against the suffix in one call */
  return memcmp (aWord + (wlen - plen), post, plen) == 0;
}
46
+
47
+ void
48
+ ots_parse_file (FILE * stream, OtsArticle * Doc )
49
+ {
50
+ unsigned char fread_buffer[BUFFER_SIZE];
51
+ unsigned char *buffer;
52
+ size_t nread, total_read, avail_size;
53
+
54
+ buffer = g_new0 (unsigned char, BUFFER_SIZE);
55
+
56
+ avail_size = BUFFER_SIZE;
57
+ total_read = nread = 0;
58
+ while ((nread =
59
+ fread (fread_buffer, sizeof (unsigned char), sizeof (fread_buffer),
60
+ stream)) > 0)
61
+ {
62
+ if (nread + total_read > avail_size)
63
+ {
64
+ avail_size *= 2;
65
+ buffer = g_renew (unsigned char, buffer, avail_size);
66
+ }
67
+
68
+ strncpy (buffer + total_read, fread_buffer, nread);
69
+ total_read += nread;
70
+ }
71
+
72
+ ots_parse_stream (buffer, total_read, Doc);
73
+ g_free (buffer);
74
+ }
75
+
76
+
77
+
78
+
79
+
80
+ int
81
+ ots_parser_should_break(const char *aWord,const OtsStemRule * rule)
82
+ {
83
+ GList *li;
84
+ char *postfix;
85
+ int toBreak=0;
86
+
87
+ for (li = (GList *) rule->ParserBreak; li != NULL; li = li->next)
88
+ {
89
+ postfix=li->data;
90
+ if (ots_match_post (aWord, postfix) )
91
+ {
92
+ toBreak=1;
93
+ break;
94
+ }
95
+
96
+ }
97
+
98
+
99
+ for (li = (GList *) rule->ParserDontBreak; li != NULL; li = li->next)
100
+ {
101
+ postfix=li->data;
102
+ if (ots_match_post (aWord, postfix) )
103
+ {
104
+ toBreak=0;
105
+ break;
106
+ }
107
+
108
+ }
109
+ return toBreak;
110
+ }
111
+
112
+
113
+
114
+ void
115
+ ots_parse_stream(const unsigned char *utf8, size_t len, OtsArticle * Doc) /*parse the unicode stream */
116
+ {
117
+
118
+ OtsSentence *tmpLine = ots_append_line (Doc);
119
+ OtsStemRule * rule=Doc->stem;
120
+ gunichar uc;
121
+ int index = 0;
122
+ char *s = (char *) utf8;
123
+ GString *word_buffer = g_string_new (NULL);
124
+
125
+
126
+ while ((*s) && (index < len))
127
+ {
128
+ uc = g_utf8_get_char (s);
129
+
130
+ if (!g_unichar_isspace (uc)) /* space is the end of a word */
131
+ {
132
+
133
+ g_string_append_unichar(word_buffer,uc);
134
+
135
+ }
136
+ else
137
+ {
138
+
139
+ if (0<word_buffer->len)
140
+ {
141
+ ots_append_word (tmpLine, word_buffer->str);
142
+
143
+ if (ots_parser_should_break(word_buffer->str,rule)) {
144
+ tmpLine = ots_append_line (Doc); /* Add a new Line */
145
+ }
146
+
147
+ g_string_assign (word_buffer, "");
148
+
149
+ }
150
+
151
+ if (uc=='\n') {ots_append_word (tmpLine,"\n");}
152
+ else
153
+ {ots_append_word (tmpLine," ");}
154
+
155
+ g_string_assign (word_buffer,"");
156
+ }
157
+
158
+ s = g_utf8_next_char (s);
159
+
160
+ index++;
161
+ }
162
+
163
+
164
+ if (0<word_buffer->len) /*final flush*/
165
+ {
166
+ ots_append_word (tmpLine, word_buffer->str);
167
+ g_string_assign (word_buffer, "");
168
+ }
169
+
170
+
171
+
172
+ g_string_free (word_buffer, TRUE);
173
+ }
@@ -0,0 +1,163 @@
1
+ /*
2
+ * relations.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include "grader-tc.h"
25
+
26
+ #include "libots.h"
27
+ /*
28
+ The Inner product of two texts is defined as the number of topics they
29
+ share. This set of functions implements this relation using the ots
30
+ api.
31
+
32
+ Application: a relation between a slashdot article and a comment made
33
+ usage: ots_text_relations(story,"en",comment,"en",n);
34
+ where n is the max number of most important topics to consider; safe to give a high number (ex: 20);
35
+
36
+ returns:
37
+ 0 - off topic
38
+ n - number of topics they share
39
+
40
+ */
41
+
42
+ #define OTS_MAX_TOPIC_WORD_SIZE 256
43
+
44
+ /*Returns the number of topics that two blocks of text share*/
45
+ int ots_text_relations(
46
+ const unsigned char *text1,const unsigned char *lang_code1,
47
+ const unsigned char *text2,const unsigned char *lang_code2,const int topic_num)
48
+ {
49
+ GList* top1;
50
+ GList* top2;
51
+ int score;
52
+
53
+ top1=ots_text_stem_list(text1,lang_code1,topic_num);
54
+ top2=ots_text_stem_list(text2,lang_code2,topic_num);
55
+
56
+ score=ots_topic_list_score(top1,top2);
57
+
58
+ if (top1){g_list_foreach (top1, (GFunc) g_free, NULL);g_list_free (top1);}
59
+ if (top2){g_list_foreach (top2, (GFunc) g_free, NULL);g_list_free (top2);}
60
+
61
+ return score;
62
+ }
63
+
64
+
65
+
66
+
67
+ /*For a given text, return the list of the topics*/
68
+ char* ots_text_topics(
69
+ const unsigned char *text,const unsigned char *lang_code,int topic_num)
70
+ {
71
+ int i;
72
+ GString *word;
73
+ unsigned char *str;
74
+ unsigned char *tmp;
75
+ OtsArticle *Art;
76
+
77
+ if (NULL==text) return NULL;
78
+ word = g_string_new (NULL);
79
+
80
+ Art = ots_new_article ();
81
+
82
+ ots_load_xml_dictionary(Art,lang_code); /*Load the dictionary*/
83
+ if (text!=NULL) ots_parse_stream (text,strlen(text), Art); /* read text , put it in struct Article */
84
+ ots_grade_doc (Art);
85
+
86
+
87
+ for (i=0;i<=topic_num;i++)
88
+ {
89
+ tmp=ots_word_in_list(Art->ImpWords,i);
90
+ if ((tmp!=NULL)&&(strlen(tmp)>0)) {g_string_append(word,tmp);
91
+ g_string_append(word," "); }
92
+ }
93
+
94
+
95
+ str=word->str;
96
+ g_string_free (word, FALSE);
97
+ ots_free_article (Art);
98
+
99
+ return str;
100
+ }
101
+
102
+
103
+
104
+ /*For a given text, return the list of the stemmed topics*/
105
+ GList* ots_text_stem_list(const unsigned char *text, const unsigned char *lang_code, int topic_num)
106
+ {
107
+ int i;
108
+ GList *topics=NULL;
109
+ unsigned char *tmp;
110
+ OtsArticle *Art;
111
+
112
+ if (NULL==text) return NULL;
113
+
114
+ Art = ots_new_article ();
115
+
116
+ ots_load_xml_dictionary(Art,lang_code);
117
+ if (text!=NULL) ots_parse_stream (text,strlen(text), Art);
118
+ ots_grade_doc (Art);
119
+
120
+
121
+ for (i=0;i<=topic_num;i++)
122
+ {
123
+ tmp=ots_stem_in_list(Art->ImpWords,i);
124
+ if ((tmp)&&(strlen(tmp)>0))
125
+ topics=g_list_append(topics,g_strdup(tmp));
126
+ }
127
+
128
+
129
+ ots_free_article (Art);
130
+ return topics;
131
+ }
132
+
133
+ /*Gives a score on the relations between two lists of topics; simmilar to the inner product*/
134
+ int ots_topic_list_score(
135
+ const GList *topic_list1,
136
+ const GList *topic_list2)
137
+ {
138
+ int count=0;
139
+ GList *tmplist1;
140
+ GList *tmplist2;
141
+
142
+ if (!(topic_list1)) return 0;
143
+ if (!(topic_list2)) return 0;
144
+
145
+ tmplist1 = g_list_first(topic_list1);
146
+ while(tmplist1)
147
+ {
148
+ tmplist2 = g_list_first(topic_list2);
149
+ while(tmplist2)
150
+ {
151
+
152
+ if ((tmplist1->data)&&(tmplist2->data)&&(strlen(tmplist2->data)>1))
153
+ if (0==strncmp(tmplist1->data,tmplist2->data,OTS_MAX_TOPIC_WORD_SIZE))
154
+ {count++;}
155
+
156
+ tmplist2 = g_list_next(tmplist2);
157
+ }
158
+ tmplist1 = g_list_next(tmplist1);
159
+ }
160
+
161
+ return count;
162
+ }
163
+