summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,332 @@
1
+ /*
2
+ * stemmer.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+ #include "libots.h"
26
+
27
+ #define MAX_PREFIX_SIZE 256
28
+
29
+ OtsStemRule *
30
+ new_stem_rule ()
31
+ {
32
+ OtsStemRule *rule = g_new0 (OtsStemRule, 1);
33
+ return rule;
34
+ }
35
+
36
+ void
37
+ free_stem_rule (OtsStemRule *rule)
38
+ {
39
+
40
+ if (rule != NULL)
41
+ {
42
+ g_list_foreach (rule->RemovePre, (GFunc) g_free, NULL);
43
+ g_list_free (rule->RemovePre);
44
+ g_list_foreach (rule->RemovePost, (GFunc) g_free, NULL);
45
+ g_list_free (rule->RemovePost);
46
+
47
+ g_list_foreach (rule->step1_pre, (GFunc) g_free, NULL);
48
+ g_list_free (rule->step1_pre);
49
+ g_list_foreach (rule->step1_post, (GFunc) g_free, NULL);
50
+ g_list_free (rule->step1_post);
51
+
52
+ g_list_foreach (rule->synonyms, (GFunc) g_free, NULL);
53
+ g_list_free (rule->synonyms);
54
+ g_list_foreach (rule->manual, (GFunc) g_free, NULL);
55
+ g_list_free (rule->manual);
56
+
57
+ g_list_foreach (rule->ParserBreak, (GFunc) g_free, NULL);
58
+ g_list_free (rule->ParserBreak);
59
+ g_list_foreach (rule->ParserDontBreak, (GFunc) g_free, NULL);
60
+ g_list_free (rule->ParserDontBreak);
61
+
62
+ g_list_foreach (rule->ReplaceChars, (GFunc) g_free, NULL);
63
+ g_list_free (rule->ReplaceChars);
64
+
65
+ g_free (rule);
66
+ }
67
+ return;
68
+ }
69
+
70
/*
 * Split a rule of the form "left|right" (example: "red|blue") into its
 * two halves.
 *
 *   comp    NUL-terminated rule text.
 *   part_a  receives the text before the first '|'.
 *   part_b  receives the text after the first '|'; empty when the rule
 *           contains no '|'.
 *
 * part_a and part_b must each be caller-allocated buffers of at least
 * MAX_PREFIX_SIZE bytes.  Each half is truncated to MAX_PREFIX_SIZE - 1
 * characters so that its terminator always stays inside the buffer.
 * If any argument is NULL nothing is written.
 */
static void
ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b)
{
  const size_t max_len = MAX_PREFIX_SIZE - 1;   /* room left for the NUL */
  const unsigned char *bar;
  size_t clen, a_len, b_len;

  if (comp == NULL || part_a == NULL || part_b == NULL)
    return;

  part_a[0] = 0;
  part_b[0] = 0;

  clen = strlen ((const char *) comp);
  bar = memchr (comp, '|', clen);

  /* Left half: everything up to the separator (or the whole rule). */
  a_len = (bar != NULL) ? (size_t) (bar - comp) : clen;
  if (a_len > max_len)
    a_len = max_len;
  memcpy (part_a, comp, a_len);
  part_a[a_len] = 0;

  if (bar == NULL)
    return;                     /* no separator: right half stays empty */

  /* Right half starts right after the real separator, even when the
     left half had to be truncated. */
  b_len = clen - (size_t) (bar - comp) - 1;
  if (b_len > max_len)
    b_len = max_len;
  memcpy (part_b, bar + 1, b_len);
  part_b[b_len] = 0;
}
104
+
105
+
106
+ static unsigned char *
107
+ ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new)
108
+ {
109
+ int i, plen, wlen, nlen;
110
+ unsigned char *new_str = NULL;
111
+
112
+ if (aWord==NULL) return NULL;
113
+
114
+ plen = strlen (pre);
115
+ wlen = strlen (aWord);
116
+ nlen = strlen (new);
117
+
118
+ for (i = 0; i < plen; i++)
119
+ if (aWord[i] != pre[i])
120
+ return NULL; /*no match */
121
+
122
+ new_str = g_new0 (char, wlen + nlen +5);
123
+ for (i = 0; i <= nlen; i++)
124
+ new_str[i] = new[i];
125
+
126
+ for (i = nlen; i <= nlen + wlen - plen; i++)
127
+ new_str[i] = aWord[i + plen - nlen];
128
+
129
+ new_str[i + 1] = 0;
130
+ return new_str;
131
+ }
132
+
133
+
134
+
135
+ static unsigned char *
136
+ ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new)
137
+ {
138
+ unsigned int i, wlen, plen, nlen;
139
+ unsigned char *new_str = NULL;
140
+
141
+ if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL;
142
+
143
+ wlen = strlen (aWord);
144
+ plen = strlen (post);
145
+ nlen = strlen (new);
146
+
147
+ if (plen>wlen) return NULL;
148
+
149
+
150
+ for (i = 0; i < plen; i++)
151
+ if (aWord[wlen - plen + i]!= post[i])
152
+ return NULL; /* no match */
153
+
154
+ new_str = g_new0 (char, wlen + nlen +5);
155
+
156
+ for (i = 0; i <= wlen - plen; i++) /*place word */
157
+ new_str[i] = aWord[i];
158
+
159
+ for (i = 0; i <= nlen; i++) /*place newfix */
160
+ new_str[wlen - plen + i] = new[i];
161
+
162
+ return new_str; /*word replaced */
163
+ }
164
+
165
+
166
+
167
/*
 * Whole-word replacement: when aWord is exactly equal to `old`, return
 * a g_strdup() copy of `new` (owned by the caller); otherwise, or when
 * aWord is NULL, return NULL.
 */
static unsigned char *
ots_stem_replace_word (unsigned const char *aWord,unsigned const char *old,unsigned const char *new)
{
  if (aWord == NULL)
    return NULL;

  return (strcmp (aWord, old) == 0) ? g_strdup (new) : NULL;
}
183
+
184
+
185
+
186
+
187
+
188
+ unsigned char *
189
+ ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule)
190
+ {
191
+ GList *li;
192
+ unsigned char *rep = NULL;
193
+ unsigned char *normWord = NULL;
194
+
195
+ if (aWord==NULL) return NULL;
196
+
197
+ normWord = g_utf8_strdown (aWord, -1); /*lowercase the word */
198
+
199
+ char *prefix;
200
+ char *newfix;
201
+
202
+ prefix = g_new0 (char, MAX_PREFIX_SIZE);
203
+ newfix = g_new0 (char, MAX_PREFIX_SIZE);
204
+
205
+ for (li = (GList *) rule->step1_pre; li != NULL; li = li->next)
206
+ {
207
+ ots_stem_break (li->data, prefix, newfix);
208
+ rep = ots_stem_remove_pre (normWord, prefix, newfix);
209
+ if (NULL != rep)
210
+ {
211
+ g_free (normWord);
212
+ normWord = rep;
213
+ rep = NULL;
214
+ }
215
+ }
216
+
217
+
218
+ for (li = (GList *) rule->step1_post; li != NULL; li = li->next)
219
+ {
220
+ ots_stem_break (li->data, prefix, newfix);
221
+ rep = ots_stem_remove_post(normWord, prefix, newfix);
222
+ if (NULL != rep)
223
+ {
224
+ g_free (normWord);
225
+ normWord = rep;
226
+ rep = NULL;
227
+ }
228
+ }
229
+
230
+ g_free (prefix);
231
+ g_free (newfix);
232
+
233
+ return normWord;
234
+ }
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+ unsigned char *
244
+ ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule)
245
+ {
246
+ GList *li;
247
+ unsigned char *rep = NULL;
248
+
249
+ unsigned char *prefix;
250
+ unsigned char *newfix;
251
+ unsigned char *normWord=NULL;
252
+
253
+ prefix = g_new0 (char, MAX_PREFIX_SIZE);
254
+ newfix = g_new0 (char, MAX_PREFIX_SIZE);
255
+
256
+ if (aWord==NULL) return NULL;
257
+
258
+ normWord = ots_stem_format (aWord,rule);
259
+
260
+
261
+ for (li = (GList *) rule->manual; li != NULL; li = li->next)
262
+ {
263
+ ots_stem_break (li->data, prefix, newfix);
264
+ rep = ots_stem_replace_word (normWord, prefix, newfix);
265
+ if (NULL != rep)
266
+ {
267
+ g_free (normWord);
268
+ normWord = rep;
269
+ rep = NULL;
270
+ break;
271
+ }
272
+ }
273
+
274
+
275
+
276
+
277
+ for (li = (GList *) rule->RemovePre; li != NULL; li = li->next)
278
+ {
279
+ ots_stem_break (li->data, prefix, newfix);
280
+ rep = ots_stem_remove_pre (normWord, prefix, newfix);
281
+ if (NULL != rep)
282
+ {
283
+ g_free (normWord);
284
+ normWord = rep;
285
+ rep = NULL;
286
+ break;
287
+ }
288
+ }
289
+
290
+
291
+ for (li = (GList *) rule->RemovePost; li != NULL; li = li->next)
292
+ {
293
+ ots_stem_break (li->data, prefix, newfix);
294
+ rep = ots_stem_remove_post (normWord, prefix, newfix);
295
+ if (NULL != rep)
296
+ {
297
+ g_free (normWord);
298
+ normWord = rep;
299
+ rep = NULL;
300
+ break;
301
+ }
302
+
303
+ }
304
+
305
+
306
+ for (li = (GList *) rule->synonyms; li != NULL; li = li->next)
307
+ {
308
+ ots_stem_break (li->data, prefix, newfix);
309
+ rep = ots_stem_replace_word (normWord, prefix, newfix);
310
+ if (NULL != rep)
311
+ {
312
+ g_free (normWord);
313
+ normWord = rep;
314
+ rep = NULL;
315
+ break;
316
+ }
317
+ }
318
+
319
+
320
+ g_free (prefix);
321
+ g_free (newfix);
322
+
323
+
324
+ if (strlen(normWord)<3) /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/
325
+ {
326
+ g_free(normWord);
327
+ normWord = ots_stem_format (aWord,rule); /*lowercase the word */
328
+ }
329
+
330
+
331
+ return normWord;
332
+ }
@@ -0,0 +1,43 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+
5
+ #include <glib.h>
6
+ #include <glib-object.h>
7
+ #include <ruby.h>
8
+
9
+ #include "libots.h"
10
+ #include "summarize.h"
11
+
12
+ const char *OTS_ERROR_BAD_DICT = "Cannot load dictionary file";
13
+
14
+ void Init_summarize() {
15
+ VALUE rb_mOts = rb_define_module("Summarize");
16
+ rb_define_module_function(rb_mOts, "summarize", summarize, 3);
17
+ }
18
+
19
+ static VALUE summarize(const VALUE self, const VALUE rb_str, const VALUE rb_dict_file, const VALUE rb_ratio) {
20
+ int length = RSTRING_LEN(rb_str);
21
+ char *text = StringValuePtr(rb_str);
22
+ char *dictionary_file = StringValuePtr(rb_dict_file);
23
+ int ratio = NUM2INT(rb_ratio);
24
+ unsigned char *result;
25
+ size_t result_len;
26
+ OtsArticle *doc = ots_new_article();
27
+
28
+ if (!ots_load_xml_dictionary(doc, dictionary_file)) {
29
+ ots_free_article(doc);
30
+ rb_raise(rb_eRuntimeError, OTS_ERROR_BAD_DICT);
31
+ return Qnil;
32
+ }
33
+
34
+ ots_parse_stream(text, length, doc);
35
+ ots_grade_doc(doc);
36
+ ots_highlight_doc(doc, ratio);
37
+
38
+ result = ots_get_doc_text(doc, &result_len);
39
+
40
+ ots_free_article(doc);
41
+
42
+ return rb_str_new2(result);
43
+ }
@@ -0,0 +1,12 @@
1
+ #ifndef RSTRING_PTR
2
+ #define RSTRING_PTR(s) (RSTRING(s)->ptr)
3
+ #endif
4
+
5
+ #ifndef RSTRING_LEN
6
+ #define RSTRING_LEN(s) (RSTRING(s)->len)
7
+ #endif
8
+
9
+ #ifndef __summarize_h__
10
+ #define __summarize_h__
11
+ static VALUE summarize(VALUE, VALUE, VALUE, VALUE);
12
+ #endif
@@ -0,0 +1,98 @@
1
+ /*
2
+ * text.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+
27
+ unsigned char *
28
+ ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
29
+ {
30
+ GList *li;
31
+ GString *text;
32
+ unsigned char *utf8_data;
33
+
34
+ if (!(aLine))
35
+ return NULL;
36
+
37
+ text = g_string_new (NULL);
38
+
39
+ if (!only_if_selected || aLine->selected)
40
+ {
41
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
42
+ if (li->data && strlen (li->data)) /*if word exists*/
43
+ g_string_append (text, (char *) li->data);
44
+
45
+ }
46
+
47
+ if (out_size)
48
+ *out_size = text->len;
49
+
50
+ utf8_data = text->str;
51
+ g_string_free (text, FALSE);
52
+
53
+ return utf8_data;
54
+ }
55
+
56
+ static void
57
+ ots_print_line (FILE * stream, const OtsSentence * aLine)
58
+ {
59
+ unsigned char *utf8_txt;
60
+ size_t len;
61
+ utf8_txt = ots_get_line_text (aLine, TRUE, &len);
62
+ fwrite (utf8_txt, 1, len, stream);
63
+ g_free (utf8_txt);
64
+ }
65
+
66
+ unsigned char *
67
+ ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
68
+ {
69
+ GList *li;
70
+ GString *text;
71
+ unsigned char *utf8_data;
72
+ size_t line_len;
73
+
74
+ text = g_string_new (NULL);
75
+
76
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
77
+ {
78
+ utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
79
+ g_string_append_len (text, utf8_data, line_len);
80
+ g_free (utf8_data);
81
+ }
82
+
83
+ if (out_len)
84
+ *out_len = text->len;
85
+ utf8_data = text->str;
86
+
87
+ g_string_free (text, FALSE);
88
+ return utf8_data;
89
+ }
90
+
91
+ void
92
+ ots_print_doc (FILE * stream, const OtsArticle * Doc)
93
+ {
94
+ GList *li;
95
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next) /* for each line in Article Do: */
96
+ ots_print_line (stream, (OtsSentence *) li->data);
97
+ fputc ('\n', stream);
98
+ }