summarize 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/README.markdown +42 -0
- data/Rakefile +49 -0
- data/ext/summarize/article.c +119 -0
- data/ext/summarize/dic/bg.xml +101 -0
- data/ext/summarize/dic/ca.xml +141 -0
- data/ext/summarize/dic/cs.xml +161 -0
- data/ext/summarize/dic/cy.xml +118 -0
- data/ext/summarize/dic/da.xml +129 -0
- data/ext/summarize/dic/de.xml +354 -0
- data/ext/summarize/dic/el.xml +80 -0
- data/ext/summarize/dic/en.xml +606 -0
- data/ext/summarize/dic/eo.xml +171 -0
- data/ext/summarize/dic/es.xml +369 -0
- data/ext/summarize/dic/et.xml +172 -0
- data/ext/summarize/dic/eu.xml +77 -0
- data/ext/summarize/dic/fi.xml +105 -0
- data/ext/summarize/dic/fr.xml +199 -0
- data/ext/summarize/dic/ga.xml +124 -0
- data/ext/summarize/dic/gl.xml +290 -0
- data/ext/summarize/dic/he.xml +334 -0
- data/ext/summarize/dic/hu.xml +280 -0
- data/ext/summarize/dic/ia.xml +97 -0
- data/ext/summarize/dic/id.xml +75 -0
- data/ext/summarize/dic/is.xml +201 -0
- data/ext/summarize/dic/it.xml +206 -0
- data/ext/summarize/dic/lv.xml +77 -0
- data/ext/summarize/dic/mi.xml +76 -0
- data/ext/summarize/dic/ms.xml +160 -0
- data/ext/summarize/dic/mt.xml +73 -0
- data/ext/summarize/dic/nl.xml +245 -0
- data/ext/summarize/dic/nn.xml +264 -0
- data/ext/summarize/dic/pl.xml +92 -0
- data/ext/summarize/dic/pt.xml +365 -0
- data/ext/summarize/dic/ro.xml +163 -0
- data/ext/summarize/dic/ru.xml +150 -0
- data/ext/summarize/dic/sv.xml +255 -0
- data/ext/summarize/dic/tl.xml +67 -0
- data/ext/summarize/dic/tr.xml +65 -0
- data/ext/summarize/dic/uk.xml +98 -0
- data/ext/summarize/dic/yi.xml +293 -0
- data/ext/summarize/dictionary.c +331 -0
- data/ext/summarize/extconf.rb +6 -0
- data/ext/summarize/grader-tc.c +185 -0
- data/ext/summarize/grader-tc.h +64 -0
- data/ext/summarize/grader-tf.c +116 -0
- data/ext/summarize/grader.c +85 -0
- data/ext/summarize/highlighter.c +128 -0
- data/ext/summarize/html.c +131 -0
- data/ext/summarize/libots.h +158 -0
- data/ext/summarize/parser.c +173 -0
- data/ext/summarize/relations.c +163 -0
- data/ext/summarize/stemmer.c +332 -0
- data/ext/summarize/summarize.c +43 -0
- data/ext/summarize/summarize.h +12 -0
- data/ext/summarize/text.c +98 -0
- data/ext/summarize/wordlist.c +220 -0
- data/lib/summarize.rb +91 -0
- data/lib/summarize/summarize.bundle +0 -0
- data/sample_data/jupiter.txt +15 -0
- data/summarize.gemspec +21 -0
- metadata +140 -0
data/.gitignore
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# Summarize
|
2
|
+
|
3
|
+
## A Ruby C wrapper for Open Text Summarizer
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
### Manual install
|
8
|
+
|
9
|
+
git clone https://github.com/ssoper/summarize.git
|
10
|
+
cd summarize
|
11
|
+
rake build # install the rake-compiler gem if you don't already have it
|
12
|
+
gem build summarize.gemspec
|
13
|
+
gem install summarize-1.0.0.gem
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
The summarize method is added to File, which you can use to summarize the contents of any plain text file.
|
18
|
+
|
19
|
+
File.open('path/to/file').summarize
|
20
|
+
|
21
|
+
Or use the String method
|
22
|
+
|
23
|
+
"text to summarize".summarize
|
24
|
+
|
25
|
+
By default it uses an English dictionary for summarizing but forty languages are supported. Pass in the valid ISO 639 language code to use one. A ratio (default is 25%) can also be passed in.
|
26
|
+
|
27
|
+
# Parse an article using Portuguese stemming rules with a ratio of 50%
|
28
|
+
"text to summarize".summarize(:language => 'pt', :ratio => 50)
|
29
|
+
|
30
|
+
You can also use custom stemming rules
|
31
|
+
|
32
|
+
"text to summarize".summarize(:dictionary => 'path/to/custom/dictionary')
|
33
|
+
|
34
|
+
## Dependencies
|
35
|
+
|
36
|
+
You must have glib-2.0 and libxml-2.0 installed and properly configured.
|
37
|
+
|
38
|
+
## Author
|
39
|
+
|
40
|
+
Gem written by Sean Soper ([@ssoper](http://twitter.com/ssoper))
|
41
|
+
|
42
|
+
The Open Text Summarizer library was written by Nadav Rotem and can be found at <http://libots.sourceforge.net/>
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "rake"
|
3
|
+
require "benchmark"
|
4
|
+
|
5
|
+
# The native extension is built with rake-compiler
# (https://github.com/luislavena/rake-compiler). Stop with install
# instructions when that gem cannot be loaded.
begin
  require "rake/extensiontask"
rescue LoadError
  abort "\nYou must install the rake-compiler gem\n\n gem install rake-compiler\n\n"
end

# Declare the "summarize" extension and put its build output in lib/summarize.
Rake::ExtensionTask.new("summarize") { |ext| ext.lib_dir = "lib/summarize" }

# A build always starts from a clean tree.
task :build => %w[clean compile]
|
17
|
+
|
18
|
+
# Rebuilds the extension, then defines the unit tests for it. The tests read
# their fixture from sample_data/jupiter.txt.
task :test => :build do
  $:.unshift(File.join(File.dirname(__FILE__), "lib"))
  require "summarize"
  require "test/unit"

  class TestSummarizer < Test::Unit::TestCase
    # Loads the fixture. test_ratios expects it to hold three sections
    # separated by '||': the article, its 25% summary and its 50% summary.
    def setup
      @jupiter = File.read(File.join(File.dirname(__FILE__), 'sample_data/jupiter.txt'))
    end

    # The summaries produced at 25% and 50% must match the stored ones.
    # assert_equal (expected, actual) reports both values on a mismatch.
    def test_ratios
      content, ratio_25, ratio_50 = @jupiter.split('||').map(&:strip)
      assert_equal ratio_25, content.summarize(:ratio => 25)
      assert_equal ratio_50, content.summarize(:ratio => 50)
    end

    # An unknown language code must raise instead of silently falling back.
    def test_unsupported_language
      content = @jupiter.split('||').first
      assert_raise RuntimeError do
        content.summarize(:language => 'nonexistent')
      end
    end

    # A dictionary path that does not exist must raise.
    def test_invalid_dictionary_file
      content = @jupiter.split('||').first
      assert_raise RuntimeError do
        content.summarize(:dictionary => '/path/to/nowhere')
      end
    end
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
* article.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
extern void ots_free_TF_wordlist (GList * aList);
|
29
|
+
|
30
|
+
#define MAX_WORD_LENGTH 35
|
31
|
+
|
32
|
+
/*Data structure related functions*/
|
33
|
+
|
34
|
+
OtsSentence *
|
35
|
+
ots_new_sentence (void)
|
36
|
+
{
|
37
|
+
OtsSentence *aLine = g_new0 (OtsSentence, 1);
|
38
|
+
aLine->words = NULL;
|
39
|
+
aLine->wc = 0;
|
40
|
+
aLine->selected = 0;
|
41
|
+
aLine->score = 0;
|
42
|
+
return aLine;
|
43
|
+
}
|
44
|
+
|
45
|
+
void
|
46
|
+
ots_free_sentence (OtsSentence * sen)
|
47
|
+
{
|
48
|
+
if (sen != NULL)
|
49
|
+
{
|
50
|
+
g_list_foreach (sen->words, (GFunc) g_free, NULL);
|
51
|
+
g_list_free (sen->words);
|
52
|
+
g_free (sen);
|
53
|
+
}
|
54
|
+
sen=NULL;
|
55
|
+
}
|
56
|
+
|
57
|
+
OtsArticle *
|
58
|
+
ots_new_article (void)
|
59
|
+
{
|
60
|
+
OtsArticle *Doc;
|
61
|
+
Doc = g_new0 (OtsArticle, 1);
|
62
|
+
Doc->lineCount = 0;
|
63
|
+
Doc->title = NULL;
|
64
|
+
Doc->stem=new_stem_rule ();
|
65
|
+
Doc->lines=NULL;
|
66
|
+
Doc->dict = NULL;
|
67
|
+
Doc->ImpWords = NULL;
|
68
|
+
Doc->wordStat = NULL;
|
69
|
+
|
70
|
+
Doc->tf_terms=NULL;
|
71
|
+
return Doc;
|
72
|
+
}
|
73
|
+
|
74
|
+
void
|
75
|
+
ots_free_article (OtsArticle * art)
|
76
|
+
{
|
77
|
+
if (NULL != art)
|
78
|
+
{
|
79
|
+
free_stem_rule (art->stem);
|
80
|
+
ots_free_wordlist (art->dict);
|
81
|
+
ots_free_wordlist (art->ImpWords);
|
82
|
+
ots_free_wordlist (art->wordStat);
|
83
|
+
|
84
|
+
ots_free_TF_wordlist(art->tf_terms);
|
85
|
+
|
86
|
+
g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
|
87
|
+
g_list_free (art->lines);
|
88
|
+
|
89
|
+
if (art->title != NULL) g_free (art->title);
|
90
|
+
g_free (art);
|
91
|
+
}
|
92
|
+
art=NULL;
|
93
|
+
}
|
94
|
+
|
95
|
+
OtsSentence *
|
96
|
+
ots_append_line (OtsArticle * Doc)
|
97
|
+
{
|
98
|
+
OtsSentence *aLine = ots_new_sentence ();
|
99
|
+
Doc->lineCount++;
|
100
|
+
Doc->lines = g_list_append (Doc->lines, aLine);
|
101
|
+
return aLine;
|
102
|
+
}
|
103
|
+
|
104
|
+
void
|
105
|
+
ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
|
106
|
+
{
|
107
|
+
if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
|
108
|
+
aLine->wc++;
|
109
|
+
aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
|
110
|
+
return;
|
111
|
+
}
|
112
|
+
|
113
|
+
|
114
|
+
gboolean
|
115
|
+
ots_is_line_selected(const OtsSentence *aLine)
|
116
|
+
{
|
117
|
+
if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
|
118
|
+
return (aLine->selected);
|
119
|
+
}
|
@@ -0,0 +1,101 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="bulgarian">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>август</word>
|
62
|
+
<word>април</word>
|
63
|
+
<word>в</word>
|
64
|
+
<word>всеки</word>
|
65
|
+
<word>всичко</word>
|
66
|
+
<word>вторник</word>
|
67
|
+
<word>да</word>
|
68
|
+
<word>декември</word>
|
69
|
+
<word>за</word>
|
70
|
+
<word>и</word>
|
71
|
+
<word>или</word>
|
72
|
+
<word>има</word>
|
73
|
+
<word>което</word>
|
74
|
+
<word>към</word>
|
75
|
+
<word>май</word>
|
76
|
+
<word>март</word>
|
77
|
+
<word>на</word>
|
78
|
+
<word>не</word>
|
79
|
+
<word>неделя</word>
|
80
|
+
<word>ноември</word>
|
81
|
+
<word>октомври</word>
|
82
|
+
<word>от</word>
|
83
|
+
<word>петък</word>
|
84
|
+
<word>по</word>
|
85
|
+
<word>понеделник</word>
|
86
|
+
<word>при</word>
|
87
|
+
<word>с</word>
|
88
|
+
<word>септември</word>
|
89
|
+
<word>сряда</word>
|
90
|
+
<word>сто</word>
|
91
|
+
<word>събота</word>
|
92
|
+
<word>трябва</word>
|
93
|
+
<word>февруари</word>
|
94
|
+
<word>хиляда</word>
|
95
|
+
<word>че</word>
|
96
|
+
<word>четвъртък</word>
|
97
|
+
<word>юли</word>
|
98
|
+
<word>юни</word>
|
99
|
+
<word>януари</word>
|
100
|
+
</grader-tc>
|
101
|
+
</dictionary>
|
@@ -0,0 +1,141 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="catalan">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>a</word>
|
62
|
+
<word>abans</word>
|
63
|
+
<word>al</word>
|
64
|
+
<word>amb</word>
|
65
|
+
<word>ambdós</word>
|
66
|
+
<word>anar</word>
|
67
|
+
<word>ara</word>
|
68
|
+
<word>baix</word>
|
69
|
+
<word>cap</word>
|
70
|
+
<word>cert</word>
|
71
|
+
<word>com</word>
|
72
|
+
<word>cuál</word>
|
73
|
+
<word>damunt</word>
|
74
|
+
<word>de</word>
|
75
|
+
<word>dins</word>
|
76
|
+
<word>doble</word>
|
77
|
+
<word>dos</word>
|
78
|
+
<word>dues</word>
|
79
|
+
<word>el</word>
|
80
|
+
<word>ell</word>
|
81
|
+
<word>ella</word>
|
82
|
+
<word>elles</word>
|
83
|
+
<word>ells</word>
|
84
|
+
<word>els</word>
|
85
|
+
<word>en</word>
|
86
|
+
<word>ésser</word>
|
87
|
+
<word>estar</word>
|
88
|
+
<word>excepte</word>
|
89
|
+
<word>jo</word>
|
90
|
+
<word>la</word>
|
91
|
+
<word>les</word>
|
92
|
+
<word>lluny</word>
|
93
|
+
<word>lo</word>
|
94
|
+
<word>los</word>
|
95
|
+
<word>mai</word>
|
96
|
+
<word>me</word>
|
97
|
+
<word>meu</word>
|
98
|
+
<word>meus</word>
|
99
|
+
<word>meva</word>
|
100
|
+
<word>meves</word>
|
101
|
+
<word>mí</word>
|
102
|
+
<word>na</word>
|
103
|
+
<word>nos</word>
|
104
|
+
<word>nosaltres</word>
|
105
|
+
<word>nostra</word>
|
106
|
+
<word>nostre</word>
|
107
|
+
<word>nostres</word>
|
108
|
+
<word>qual</word>
|
109
|
+
<word>quals</word>
|
110
|
+
<word>quan</word>
|
111
|
+
<word>quelcom</word>
|
112
|
+
<word>quin</word>
|
113
|
+
<word>quina</word>
|
114
|
+
<word>quines</word>
|
115
|
+
<word>quins</word>
|
116
|
+
<word>se</word>
|
117
|
+
<word>ser</word>
|
118
|
+
<word>seu</word>
|
119
|
+
<word>seus</word>
|
120
|
+
<word>seva</word>
|
121
|
+
<word>seves</word>
|
122
|
+
<word>sí</word>
|
123
|
+
<word>tenir</word>
|
124
|
+
<word>teu</word>
|
125
|
+
<word>teus</word>
|
126
|
+
<word>teva</word>
|
127
|
+
<word>teves</word>
|
128
|
+
<word>tu</word>
|
129
|
+
<word>u</word>
|
130
|
+
<word>un</word>
|
131
|
+
<word>una</word>
|
132
|
+
<word>unes</word>
|
133
|
+
<word>uns</word>
|
134
|
+
<word>vosaltres</word>
|
135
|
+
<word>vostè</word>
|
136
|
+
<word>vostès</word>
|
137
|
+
<word>vostra</word>
|
138
|
+
<word>vostre</word>
|
139
|
+
<word>vostres</word>
|
140
|
+
</grader-tc>
|
141
|
+
</dictionary>
|