summarize 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +11 -0
  2. data/README.markdown +42 -0
  3. data/Rakefile +49 -0
  4. data/ext/summarize/article.c +119 -0
  5. data/ext/summarize/dic/bg.xml +101 -0
  6. data/ext/summarize/dic/ca.xml +141 -0
  7. data/ext/summarize/dic/cs.xml +161 -0
  8. data/ext/summarize/dic/cy.xml +118 -0
  9. data/ext/summarize/dic/da.xml +129 -0
  10. data/ext/summarize/dic/de.xml +354 -0
  11. data/ext/summarize/dic/el.xml +80 -0
  12. data/ext/summarize/dic/en.xml +606 -0
  13. data/ext/summarize/dic/eo.xml +171 -0
  14. data/ext/summarize/dic/es.xml +369 -0
  15. data/ext/summarize/dic/et.xml +172 -0
  16. data/ext/summarize/dic/eu.xml +77 -0
  17. data/ext/summarize/dic/fi.xml +105 -0
  18. data/ext/summarize/dic/fr.xml +199 -0
  19. data/ext/summarize/dic/ga.xml +124 -0
  20. data/ext/summarize/dic/gl.xml +290 -0
  21. data/ext/summarize/dic/he.xml +334 -0
  22. data/ext/summarize/dic/hu.xml +280 -0
  23. data/ext/summarize/dic/ia.xml +97 -0
  24. data/ext/summarize/dic/id.xml +75 -0
  25. data/ext/summarize/dic/is.xml +201 -0
  26. data/ext/summarize/dic/it.xml +206 -0
  27. data/ext/summarize/dic/lv.xml +77 -0
  28. data/ext/summarize/dic/mi.xml +76 -0
  29. data/ext/summarize/dic/ms.xml +160 -0
  30. data/ext/summarize/dic/mt.xml +73 -0
  31. data/ext/summarize/dic/nl.xml +245 -0
  32. data/ext/summarize/dic/nn.xml +264 -0
  33. data/ext/summarize/dic/pl.xml +92 -0
  34. data/ext/summarize/dic/pt.xml +365 -0
  35. data/ext/summarize/dic/ro.xml +163 -0
  36. data/ext/summarize/dic/ru.xml +150 -0
  37. data/ext/summarize/dic/sv.xml +255 -0
  38. data/ext/summarize/dic/tl.xml +67 -0
  39. data/ext/summarize/dic/tr.xml +65 -0
  40. data/ext/summarize/dic/uk.xml +98 -0
  41. data/ext/summarize/dic/yi.xml +293 -0
  42. data/ext/summarize/dictionary.c +331 -0
  43. data/ext/summarize/extconf.rb +6 -0
  44. data/ext/summarize/grader-tc.c +185 -0
  45. data/ext/summarize/grader-tc.h +64 -0
  46. data/ext/summarize/grader-tf.c +116 -0
  47. data/ext/summarize/grader.c +85 -0
  48. data/ext/summarize/highlighter.c +128 -0
  49. data/ext/summarize/html.c +131 -0
  50. data/ext/summarize/libots.h +158 -0
  51. data/ext/summarize/parser.c +173 -0
  52. data/ext/summarize/relations.c +163 -0
  53. data/ext/summarize/stemmer.c +332 -0
  54. data/ext/summarize/summarize.c +43 -0
  55. data/ext/summarize/summarize.h +12 -0
  56. data/ext/summarize/text.c +98 -0
  57. data/ext/summarize/wordlist.c +220 -0
  58. data/lib/summarize.rb +91 -0
  59. data/lib/summarize/summarize.bundle +0 -0
  60. data/sample_data/jupiter.txt +15 -0
  61. data/summarize.gemspec +21 -0
  62. metadata +140 -0
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ Makefile
2
+ lib/ots/ots.bundle
3
+ tmp/*
4
+ *.DS_Store
5
+ coverage/*
6
+ *.swp
7
+ *.swo
8
+ pkg/*
9
+ *.gem
10
+
11
+ !.keep
data/README.markdown ADDED
@@ -0,0 +1,42 @@
1
+ # Summarize
2
+
3
+ ## A Ruby C wrapper for Open Text Summarizer
4
+
5
+ ## Install
6
+
7
+ ### Manual install
8
+
9
+ git clone https://github.com/ssoper/summarize.git
10
+ cd summarize
11
+ rake build # install the rake-compiler gem if you don't already have it
12
+ gem build summarize.gemspec
13
+ gem install summarize-1.0.0.gem
14
+
15
+ ## Usage
16
+
17
+ The summarize method is added to File, which you can use to summarize the contents of any plain text file:
18
+
19
+ File.open('path/to/file').summarize
20
+
21
+ Or use the String method
22
+
23
+ "text to summarize".summarize
24
+
25
+ By default it uses an English dictionary for summarizing but forty languages are supported. Pass in the valid ISO 639 language code to use one. A ratio (default is 25%) can also be passed in.
26
+
27
+ # Parse an article using Portuguese stemming rules with a ratio of 50%
28
+ "text to summarize".summarize(:language => 'pt', :ratio => 50)
29
+
30
+ You can also use custom stemming rules
31
+
32
+ "text to summarize".summarize(:dictionary => 'path/to/custom/dictionary')
33
+
34
+ ## Dependencies
35
+
36
+ You must have glib-2.0 and libxml-2.0 installed and properly configured.
37
+
38
+ ## Author
39
+
40
+ Gem written by Sean Soper ([@ssoper](http://twitter.com/ssoper))
41
+
42
+ The Open Text Summarizer library was written by Nadav Rotem and can be found at <http://libots.sourceforge.net/>
data/Rakefile ADDED
@@ -0,0 +1,49 @@
1
require "rubygems"
require "rake"
require "benchmark"

# Built using rake-compiler (https://github.com/luislavena/rake-compiler)
begin
  require "rake/extensiontask"
rescue LoadError
  abort "\nYou must install the rake-compiler gem\n\n gem install rake-compiler\n\n"
end

# Declare the "summarize" native extension; the compiled library is placed
# in lib/summarize.
Rake::ExtensionTask.new("summarize") do |extension|
  extension.lib_dir = "lib/summarize"
end

# A build always runs :clean before :compile.
task :build => [:clean, :compile]

# Build the extension, then define the test case inside the task body so that
# "summarize" is only required once the extension has been built.
task :test => :build do
  $:.unshift(File.join(File.dirname(__FILE__), "lib"))
  require "summarize"
  require "test/unit"

  class TestSummarizer < Test::Unit::TestCase
    # The sample file holds three '||'-separated sections: the article itself,
    # then the summaries test_ratios expects at ratios of 25 and 50.
    def setup
      @jupiter = File.read(File.join(File.dirname(__FILE__), 'sample_data/jupiter.txt'))
    end

    def test_ratios
      content, ratio_25, ratio_50 = @jupiter.split('||').map(&:strip)
      # assert_equal reports both strings on failure, unlike `assert a == b`.
      assert_equal ratio_25, content.summarize(:ratio => 25)
      assert_equal ratio_50, content.summarize(:ratio => 50)
    end

    def test_unsupported_language
      content = @jupiter.split('||').first
      assert_raise RuntimeError do
        content.summarize(:language => 'nonexistent')
      end
    end

    def test_invalid_dictionary_file
      content = @jupiter.split('||').first
      assert_raise RuntimeError do
        content.summarize(:dictionary => '/path/to/nowhere')
      end
    end
  end

end
@@ -0,0 +1,119 @@
1
+ /*
2
+ * article.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ extern void ots_free_TF_wordlist (GList * aList);
29
+
30
+ #define MAX_WORD_LENGTH 35
31
+
32
+ /*Data structure related functions*/
33
+
34
+ OtsSentence *
35
+ ots_new_sentence (void)
36
+ {
37
+ OtsSentence *aLine = g_new0 (OtsSentence, 1);
38
+ aLine->words = NULL;
39
+ aLine->wc = 0;
40
+ aLine->selected = 0;
41
+ aLine->score = 0;
42
+ return aLine;
43
+ }
44
+
45
+ void
46
+ ots_free_sentence (OtsSentence * sen)
47
+ {
48
+ if (sen != NULL)
49
+ {
50
+ g_list_foreach (sen->words, (GFunc) g_free, NULL);
51
+ g_list_free (sen->words);
52
+ g_free (sen);
53
+ }
54
+ sen=NULL;
55
+ }
56
+
57
+ OtsArticle *
58
+ ots_new_article (void)
59
+ {
60
+ OtsArticle *Doc;
61
+ Doc = g_new0 (OtsArticle, 1);
62
+ Doc->lineCount = 0;
63
+ Doc->title = NULL;
64
+ Doc->stem=new_stem_rule ();
65
+ Doc->lines=NULL;
66
+ Doc->dict = NULL;
67
+ Doc->ImpWords = NULL;
68
+ Doc->wordStat = NULL;
69
+
70
+ Doc->tf_terms=NULL;
71
+ return Doc;
72
+ }
73
+
74
+ void
75
+ ots_free_article (OtsArticle * art)
76
+ {
77
+ if (NULL != art)
78
+ {
79
+ free_stem_rule (art->stem);
80
+ ots_free_wordlist (art->dict);
81
+ ots_free_wordlist (art->ImpWords);
82
+ ots_free_wordlist (art->wordStat);
83
+
84
+ ots_free_TF_wordlist(art->tf_terms);
85
+
86
+ g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
87
+ g_list_free (art->lines);
88
+
89
+ if (art->title != NULL) g_free (art->title);
90
+ g_free (art);
91
+ }
92
+ art=NULL;
93
+ }
94
+
95
+ OtsSentence *
96
+ ots_append_line (OtsArticle * Doc)
97
+ {
98
+ OtsSentence *aLine = ots_new_sentence ();
99
+ Doc->lineCount++;
100
+ Doc->lines = g_list_append (Doc->lines, aLine);
101
+ return aLine;
102
+ }
103
+
104
+ void
105
+ ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
106
+ {
107
+ if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
108
+ aLine->wc++;
109
+ aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
110
+ return;
111
+ }
112
+
113
+
114
+ gboolean
115
+ ots_is_line_selected(const OtsSentence *aLine)
116
+ {
117
+ if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
118
+ return (aLine->selected);
119
+ }
@@ -0,0 +1,101 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="bulgarian">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>август</word>
62
+ <word>април</word>
63
+ <word>в</word>
64
+ <word>всеки</word>
65
+ <word>всичко</word>
66
+ <word>вторник</word>
67
+ <word>да</word>
68
+ <word>декември</word>
69
+ <word>за</word>
70
+ <word>и</word>
71
+ <word>или</word>
72
+ <word>има</word>
73
+ <word>което</word>
74
+ <word>към</word>
75
+ <word>май</word>
76
+ <word>март</word>
77
+ <word>на</word>
78
+ <word>не</word>
79
+ <word>неделя</word>
80
+ <word>ноември</word>
81
+ <word>октомври</word>
82
+ <word>от</word>
83
+ <word>петък</word>
84
+ <word>по</word>
85
+ <word>понеделник</word>
86
+ <word>при</word>
87
+ <word>с</word>
88
+ <word>септември</word>
89
+ <word>сряда</word>
90
+ <word>сто</word>
91
+ <word>събота</word>
92
+ <word>трябва</word>
93
+ <word>февруари</word>
94
+ <word>хиляда</word>
95
+ <word>че</word>
96
+ <word>четвъртък</word>
97
+ <word>юли</word>
98
+ <word>юни</word>
99
+ <word>януари</word>
100
+ </grader-tc>
101
+ </dictionary>
@@ -0,0 +1,141 @@
1
+ <?xml version="1.0"?>
2
+ <dictionary lang="catalan">
3
+ <stemmer>
4
+ <step1_pre>
5
+ <rule>"|</rule>
6
+ <rule>(|</rule>
7
+ </step1_pre>
8
+
9
+
10
+ <step1_post>
11
+ <rule>."|</rule>
12
+ <rule>,"|</rule>
13
+ <rule>.|</rule>
14
+ <rule>,|</rule>
15
+ <rule>"|</rule>
16
+ <rule>)|</rule>
17
+ <rule>?|</rule>
18
+ <rule>:|</rule>
19
+ <rule>;|</rule>
20
+ <rule>!|</rule>
21
+ </step1_post>
22
+
23
+
24
+ <manual>
25
+ <rule>wrote|write</rule>
26
+ <rule>came|come</rule>
27
+ <rule>went|go</rule>
28
+ </manual>
29
+
30
+ <post>
31
+ <rule>before1|1after</rule>
32
+ </post>
33
+ <pre>
34
+ <rule>before1|1after</rule>
35
+ </pre>
36
+ </stemmer>
37
+ <parser>
38
+
39
+ <linebreak>
40
+ <rule>."</rule>
41
+ <rule>?"</rule>
42
+ <rule>!"</rule>
43
+ <rule>,"</rule>
44
+ <rule>.</rule>
45
+ <rule>?</rule>
46
+ <rule>;</rule>
47
+ <rule>|</rule>
48
+ <rule>!</rule>
49
+ </linebreak>
50
+
51
+ <linedontbreak>
52
+ <rule>Dr.</rule>
53
+ <rule>Mr.</rule>
54
+ <rule>Mrs.</rule>
55
+ <rule>U.S.</rule>
56
+ <rule>Rep.</rule>
57
+ <rule>Sen.</rule>
58
+ </linedontbreak>
59
+ </parser>
60
+ <grader-tc>
61
+ <word>a</word>
62
+ <word>abans</word>
63
+ <word>al</word>
64
+ <word>amb</word>
65
+ <word>ambdós</word>
66
+ <word>anar</word>
67
+ <word>ara</word>
68
+ <word>baix</word>
69
+ <word>cap</word>
70
+ <word>cert</word>
71
+ <word>com</word>
72
+ <word>cuál</word>
73
+ <word>damunt</word>
74
+ <word>de</word>
75
+ <word>dins</word>
76
+ <word>doble</word>
77
+ <word>dos</word>
78
+ <word>dues</word>
79
+ <word>el</word>
80
+ <word>ell</word>
81
+ <word>ella</word>
82
+ <word>elles</word>
83
+ <word>ells</word>
84
+ <word>els</word>
85
+ <word>en</word>
86
+ <word>ésser</word>
87
+ <word>estar</word>
88
+ <word>excepte</word>
89
+ <word>jo</word>
90
+ <word>la</word>
91
+ <word>les</word>
92
+ <word>lluny</word>
93
+ <word>lo</word>
94
+ <word>los</word>
95
+ <word>mai</word>
96
+ <word>me</word>
97
+ <word>meu</word>
98
+ <word>meus</word>
99
+ <word>meva</word>
100
+ <word>meves</word>
101
+ <word>mí</word>
102
+ <word>na</word>
103
+ <word>nos</word>
104
+ <word>nosaltres</word>
105
+ <word>nostra</word>
106
+ <word>nostre</word>
107
+ <word>nostres</word>
108
+ <word>qual</word>
109
+ <word>quals</word>
110
+ <word>quan</word>
111
+ <word>quelcom</word>
112
+ <word>quin</word>
113
+ <word>quina</word>
114
+ <word>quines</word>
115
+ <word>quins</word>
116
+ <word>se</word>
117
+ <word>ser</word>
118
+ <word>seu</word>
119
+ <word>seus</word>
120
+ <word>seva</word>
121
+ <word>seves</word>
122
+ <word>sí</word>
123
+ <word>tenir</word>
124
+ <word>teu</word>
125
+ <word>teus</word>
126
+ <word>teva</word>
127
+ <word>teves</word>
128
+ <word>tu</word>
129
+ <word>u</word>
130
+ <word>un</word>
131
+ <word>una</word>
132
+ <word>unes</word>
133
+ <word>uns</word>
134
+ <word>vosaltres</word>
135
+ <word>vostè</word>
136
+ <word>vostès</word>
137
+ <word>vostra</word>
138
+ <word>vostre</word>
139
+ <word>vostres</word>
140
+ </grader-tc>
141
+ </dictionary>