summarize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
@@ -0,0 +1,332 @@
1
+ /*
2
+ * stemmer.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+ #include "libots.h"
26
+
27
+ #define MAX_PREFIX_SIZE 256
28
+
29
+ OtsStemRule *
30
+ new_stem_rule ()
31
+ {
32
+ OtsStemRule *rule = g_new0 (OtsStemRule, 1);
33
+ return rule;
34
+ }
35
+
36
+ void
37
+ free_stem_rule (OtsStemRule *rule)
38
+ {
39
+
40
+ if (rule != NULL)
41
+ {
42
+ g_list_foreach (rule->RemovePre, (GFunc) g_free, NULL);
43
+ g_list_free (rule->RemovePre);
44
+ g_list_foreach (rule->RemovePost, (GFunc) g_free, NULL);
45
+ g_list_free (rule->RemovePost);
46
+
47
+ g_list_foreach (rule->step1_pre, (GFunc) g_free, NULL);
48
+ g_list_free (rule->step1_pre);
49
+ g_list_foreach (rule->step1_post, (GFunc) g_free, NULL);
50
+ g_list_free (rule->step1_post);
51
+
52
+ g_list_foreach (rule->synonyms, (GFunc) g_free, NULL);
53
+ g_list_free (rule->synonyms);
54
+ g_list_foreach (rule->manual, (GFunc) g_free, NULL);
55
+ g_list_free (rule->manual);
56
+
57
+ g_list_foreach (rule->ParserBreak, (GFunc) g_free, NULL);
58
+ g_list_free (rule->ParserBreak);
59
+ g_list_foreach (rule->ParserDontBreak, (GFunc) g_free, NULL);
60
+ g_list_free (rule->ParserDontBreak);
61
+
62
+ g_list_foreach (rule->ReplaceChars, (GFunc) g_free, NULL);
63
+ g_list_free (rule->ReplaceChars);
64
+
65
+ g_free (rule);
66
+ }
67
+ return;
68
+ }
69
+
70
#ifndef MAX_PREFIX_SIZE
#define MAX_PREFIX_SIZE 256	/* fallback only; the definition at the top of the file wins */
#endif

/*
 * ots_stem_break:
 * @comp:   rule text of the form "left|right", e.g. "red|blue"
 * @part_a: caller-allocated buffer of at least MAX_PREFIX_SIZE bytes;
 *          receives the text before the first '|'
 * @part_b: caller-allocated buffer of at least MAX_PREFIX_SIZE bytes;
 *          receives the text after the first '|'
 *
 * Splits a dictionary rule into its two halves.  If there is no '|'
 * the whole rule goes to @part_a and @part_b is set to "".  Each half
 * is truncated to MAX_PREFIX_SIZE - 1 bytes so that the terminating
 * NUL always fits inside the buffer (the terminator used to be written
 * at index MAX_PREFIX_SIZE for over-long rules, one byte past the end).
 *
 * Does nothing if any argument is NULL.
 */
static void
ots_stem_break (unsigned const char *comp, unsigned char *part_a, unsigned char *part_b)
{
  const size_t room = MAX_PREFIX_SIZE - 1;	/* payload bytes per buffer */
  const char *rule = (const char *) comp;
  const char *bar;
  size_t left_len;
  size_t right_len;

  if (comp == NULL || part_a == NULL || part_b == NULL)
    return;

  part_a[0] = 0;
  part_b[0] = 0;

  /* Locate the separator on the full rule, not on the truncated copy,
     so an over-long left side cannot shift where the right side starts. */
  bar = strchr (rule, '|');

  left_len = (bar != NULL) ? (size_t) (bar - rule) : strlen (rule);
  if (left_len > room)
    left_len = room;
  memcpy (part_a, rule, left_len);
  part_a[left_len] = 0;

  if (bar == NULL)
    return;			/* no separator: right half stays empty */

  right_len = strlen (bar + 1);
  if (right_len > room)
    right_len = room;
  memcpy (part_b, bar + 1, right_len);
  part_b[right_len] = 0;
}
104
+
105
+
106
+ static unsigned char *
107
+ ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new)
108
+ {
109
+ int i, plen, wlen, nlen;
110
+ unsigned char *new_str = NULL;
111
+
112
+ if (aWord==NULL) return NULL;
113
+
114
+ plen = strlen (pre);
115
+ wlen = strlen (aWord);
116
+ nlen = strlen (new);
117
+
118
+ for (i = 0; i < plen; i++)
119
+ if (aWord[i] != pre[i])
120
+ return NULL; /*no match */
121
+
122
+ new_str = g_new0 (char, wlen + nlen +5);
123
+ for (i = 0; i <= nlen; i++)
124
+ new_str[i] = new[i];
125
+
126
+ for (i = nlen; i <= nlen + wlen - plen; i++)
127
+ new_str[i] = aWord[i + plen - nlen];
128
+
129
+ new_str[i + 1] = 0;
130
+ return new_str;
131
+ }
132
+
133
+
134
+
135
+ static unsigned char *
136
+ ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new)
137
+ {
138
+ unsigned int i, wlen, plen, nlen;
139
+ unsigned char *new_str = NULL;
140
+
141
+ if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL;
142
+
143
+ wlen = strlen (aWord);
144
+ plen = strlen (post);
145
+ nlen = strlen (new);
146
+
147
+ if (plen>wlen) return NULL;
148
+
149
+
150
+ for (i = 0; i < plen; i++)
151
+ if (aWord[wlen - plen + i]!= post[i])
152
+ return NULL; /* no match */
153
+
154
+ new_str = g_new0 (char, wlen + nlen +5);
155
+
156
+ for (i = 0; i <= wlen - plen; i++) /*place word */
157
+ new_str[i] = aWord[i];
158
+
159
+ for (i = 0; i <= nlen; i++) /*place newfix */
160
+ new_str[wlen - plen + i] = new[i];
161
+
162
+ return new_str; /*word replaced */
163
+ }
164
+
165
+
166
+
167
/*
 * ots_stem_replace_word:
 * @aWord: word to test
 * @old:   word that @aWord must equal exactly
 * @new:   replacement word
 *
 * Returns: a g_strdup copy of @new when @aWord is identical to @old;
 * NULL when @aWord is NULL or the two differ.
 */
static unsigned char *
ots_stem_replace_word (unsigned const char *aWord, unsigned const char *old, unsigned const char *new)
{
  if (aWord == NULL)
    return NULL;

  if (strcmp (aWord, old) != 0)
    return NULL;

  return g_strdup (new);
}
183
+
184
+
185
+
186
+
187
+
188
+ unsigned char *
189
+ ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule)
190
+ {
191
+ GList *li;
192
+ unsigned char *rep = NULL;
193
+ unsigned char *normWord = NULL;
194
+
195
+ if (aWord==NULL) return NULL;
196
+
197
+ normWord = g_utf8_strdown (aWord, -1); /*lowercase the word */
198
+
199
+ char *prefix;
200
+ char *newfix;
201
+
202
+ prefix = g_new0 (char, MAX_PREFIX_SIZE);
203
+ newfix = g_new0 (char, MAX_PREFIX_SIZE);
204
+
205
+ for (li = (GList *) rule->step1_pre; li != NULL; li = li->next)
206
+ {
207
+ ots_stem_break (li->data, prefix, newfix);
208
+ rep = ots_stem_remove_pre (normWord, prefix, newfix);
209
+ if (NULL != rep)
210
+ {
211
+ g_free (normWord);
212
+ normWord = rep;
213
+ rep = NULL;
214
+ }
215
+ }
216
+
217
+
218
+ for (li = (GList *) rule->step1_post; li != NULL; li = li->next)
219
+ {
220
+ ots_stem_break (li->data, prefix, newfix);
221
+ rep = ots_stem_remove_post(normWord, prefix, newfix);
222
+ if (NULL != rep)
223
+ {
224
+ g_free (normWord);
225
+ normWord = rep;
226
+ rep = NULL;
227
+ }
228
+ }
229
+
230
+ g_free (prefix);
231
+ g_free (newfix);
232
+
233
+ return normWord;
234
+ }
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+ unsigned char *
244
+ ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule)
245
+ {
246
+ GList *li;
247
+ unsigned char *rep = NULL;
248
+
249
+ unsigned char *prefix;
250
+ unsigned char *newfix;
251
+ unsigned char *normWord=NULL;
252
+
253
+ prefix = g_new0 (char, MAX_PREFIX_SIZE);
254
+ newfix = g_new0 (char, MAX_PREFIX_SIZE);
255
+
256
+ if (aWord==NULL) return NULL;
257
+
258
+ normWord = ots_stem_format (aWord,rule);
259
+
260
+
261
+ for (li = (GList *) rule->manual; li != NULL; li = li->next)
262
+ {
263
+ ots_stem_break (li->data, prefix, newfix);
264
+ rep = ots_stem_replace_word (normWord, prefix, newfix);
265
+ if (NULL != rep)
266
+ {
267
+ g_free (normWord);
268
+ normWord = rep;
269
+ rep = NULL;
270
+ break;
271
+ }
272
+ }
273
+
274
+
275
+
276
+
277
+ for (li = (GList *) rule->RemovePre; li != NULL; li = li->next)
278
+ {
279
+ ots_stem_break (li->data, prefix, newfix);
280
+ rep = ots_stem_remove_pre (normWord, prefix, newfix);
281
+ if (NULL != rep)
282
+ {
283
+ g_free (normWord);
284
+ normWord = rep;
285
+ rep = NULL;
286
+ break;
287
+ }
288
+ }
289
+
290
+
291
+ for (li = (GList *) rule->RemovePost; li != NULL; li = li->next)
292
+ {
293
+ ots_stem_break (li->data, prefix, newfix);
294
+ rep = ots_stem_remove_post (normWord, prefix, newfix);
295
+ if (NULL != rep)
296
+ {
297
+ g_free (normWord);
298
+ normWord = rep;
299
+ rep = NULL;
300
+ break;
301
+ }
302
+
303
+ }
304
+
305
+
306
+ for (li = (GList *) rule->synonyms; li != NULL; li = li->next)
307
+ {
308
+ ots_stem_break (li->data, prefix, newfix);
309
+ rep = ots_stem_replace_word (normWord, prefix, newfix);
310
+ if (NULL != rep)
311
+ {
312
+ g_free (normWord);
313
+ normWord = rep;
314
+ rep = NULL;
315
+ break;
316
+ }
317
+ }
318
+
319
+
320
+ g_free (prefix);
321
+ g_free (newfix);
322
+
323
+
324
+ if (strlen(normWord)<3) /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/
325
+ {
326
+ g_free(normWord);
327
+ normWord = ots_stem_format (aWord,rule); /*lowercase the word */
328
+ }
329
+
330
+
331
+ return normWord;
332
+ }
@@ -0,0 +1,43 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+
5
+ #include <glib.h>
6
+ #include <glib-object.h>
7
+ #include <ruby.h>
8
+
9
+ #include "libots.h"
10
+ #include "summarize.h"
11
+
12
/* Message of the RuntimeError raised by summarize() when
   ots_load_xml_dictionary() reports failure. */
const char *OTS_ERROR_BAD_DICT =
  "Cannot load dictionary file";
13
+
14
+ void Init_summarize() {
15
+ VALUE rb_mOts = rb_define_module("Summarize");
16
+ rb_define_module_function(rb_mOts, "summarize", summarize, 3);
17
+ }
18
+
19
+ static VALUE summarize(const VALUE self, const VALUE rb_str, const VALUE rb_dict_file, const VALUE rb_ratio) {
20
+ int length = RSTRING_LEN(rb_str);
21
+ char *text = StringValuePtr(rb_str);
22
+ char *dictionary_file = StringValuePtr(rb_dict_file);
23
+ int ratio = NUM2INT(rb_ratio);
24
+ unsigned char *result;
25
+ size_t result_len;
26
+ OtsArticle *doc = ots_new_article();
27
+
28
+ if (!ots_load_xml_dictionary(doc, dictionary_file)) {
29
+ ots_free_article(doc);
30
+ rb_raise(rb_eRuntimeError, OTS_ERROR_BAD_DICT);
31
+ return Qnil;
32
+ }
33
+
34
+ ots_parse_stream(text, length, doc);
35
+ ots_grade_doc(doc);
36
+ ots_highlight_doc(doc, ratio);
37
+
38
+ result = ots_get_doc_text(doc, &result_len);
39
+
40
+ ots_free_article(doc);
41
+
42
+ return rb_str_new2(result);
43
+ }
@@ -0,0 +1,12 @@
1
+ #ifndef RSTRING_PTR
2
+ #define RSTRING_PTR(s) (RSTRING(s)->ptr)
3
+ #endif
4
+
5
+ #ifndef RSTRING_LEN
6
+ #define RSTRING_LEN(s) (RSTRING(s)->len)
7
+ #endif
8
+
9
+ #ifndef __summarize_h__
10
+ #define __summarize_h__
11
+ static VALUE summarize(VALUE, VALUE, VALUE, VALUE);
12
+ #endif
@@ -0,0 +1,98 @@
1
+ /*
2
+ * text.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+
27
+ unsigned char *
28
+ ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
29
+ {
30
+ GList *li;
31
+ GString *text;
32
+ unsigned char *utf8_data;
33
+
34
+ if (!(aLine))
35
+ return NULL;
36
+
37
+ text = g_string_new (NULL);
38
+
39
+ if (!only_if_selected || aLine->selected)
40
+ {
41
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
42
+ if (li->data && strlen (li->data)) /*if word exists*/
43
+ g_string_append (text, (char *) li->data);
44
+
45
+ }
46
+
47
+ if (out_size)
48
+ *out_size = text->len;
49
+
50
+ utf8_data = text->str;
51
+ g_string_free (text, FALSE);
52
+
53
+ return utf8_data;
54
+ }
55
+
56
+ static void
57
+ ots_print_line (FILE * stream, const OtsSentence * aLine)
58
+ {
59
+ unsigned char *utf8_txt;
60
+ size_t len;
61
+ utf8_txt = ots_get_line_text (aLine, TRUE, &len);
62
+ fwrite (utf8_txt, 1, len, stream);
63
+ g_free (utf8_txt);
64
+ }
65
+
66
+ unsigned char *
67
+ ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
68
+ {
69
+ GList *li;
70
+ GString *text;
71
+ unsigned char *utf8_data;
72
+ size_t line_len;
73
+
74
+ text = g_string_new (NULL);
75
+
76
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
77
+ {
78
+ utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
79
+ g_string_append_len (text, utf8_data, line_len);
80
+ g_free (utf8_data);
81
+ }
82
+
83
+ if (out_len)
84
+ *out_len = text->len;
85
+ utf8_data = text->str;
86
+
87
+ g_string_free (text, FALSE);
88
+ return utf8_data;
89
+ }
90
+
91
+ void
92
+ ots_print_doc (FILE * stream, const OtsArticle * Doc)
93
+ {
94
+ GList *li;
95
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next) /* for each line in Article Do: */
96
+ ots_print_line (stream, (OtsSentence *) li->data);
97
+ fputc ('\n', stream);
98
+ }