summarize 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/README.markdown +42 -0
- data/Rakefile +49 -0
- data/ext/summarize/article.c +119 -0
- data/ext/summarize/dic/bg.xml +101 -0
- data/ext/summarize/dic/ca.xml +141 -0
- data/ext/summarize/dic/cs.xml +161 -0
- data/ext/summarize/dic/cy.xml +118 -0
- data/ext/summarize/dic/da.xml +129 -0
- data/ext/summarize/dic/de.xml +354 -0
- data/ext/summarize/dic/el.xml +80 -0
- data/ext/summarize/dic/en.xml +606 -0
- data/ext/summarize/dic/eo.xml +171 -0
- data/ext/summarize/dic/es.xml +369 -0
- data/ext/summarize/dic/et.xml +172 -0
- data/ext/summarize/dic/eu.xml +77 -0
- data/ext/summarize/dic/fi.xml +105 -0
- data/ext/summarize/dic/fr.xml +199 -0
- data/ext/summarize/dic/ga.xml +124 -0
- data/ext/summarize/dic/gl.xml +290 -0
- data/ext/summarize/dic/he.xml +334 -0
- data/ext/summarize/dic/hu.xml +280 -0
- data/ext/summarize/dic/ia.xml +97 -0
- data/ext/summarize/dic/id.xml +75 -0
- data/ext/summarize/dic/is.xml +201 -0
- data/ext/summarize/dic/it.xml +206 -0
- data/ext/summarize/dic/lv.xml +77 -0
- data/ext/summarize/dic/mi.xml +76 -0
- data/ext/summarize/dic/ms.xml +160 -0
- data/ext/summarize/dic/mt.xml +73 -0
- data/ext/summarize/dic/nl.xml +245 -0
- data/ext/summarize/dic/nn.xml +264 -0
- data/ext/summarize/dic/pl.xml +92 -0
- data/ext/summarize/dic/pt.xml +365 -0
- data/ext/summarize/dic/ro.xml +163 -0
- data/ext/summarize/dic/ru.xml +150 -0
- data/ext/summarize/dic/sv.xml +255 -0
- data/ext/summarize/dic/tl.xml +67 -0
- data/ext/summarize/dic/tr.xml +65 -0
- data/ext/summarize/dic/uk.xml +98 -0
- data/ext/summarize/dic/yi.xml +293 -0
- data/ext/summarize/dictionary.c +331 -0
- data/ext/summarize/extconf.rb +6 -0
- data/ext/summarize/grader-tc.c +185 -0
- data/ext/summarize/grader-tc.h +64 -0
- data/ext/summarize/grader-tf.c +116 -0
- data/ext/summarize/grader.c +85 -0
- data/ext/summarize/highlighter.c +128 -0
- data/ext/summarize/html.c +131 -0
- data/ext/summarize/libots.h +158 -0
- data/ext/summarize/parser.c +173 -0
- data/ext/summarize/relations.c +163 -0
- data/ext/summarize/stemmer.c +332 -0
- data/ext/summarize/summarize.c +43 -0
- data/ext/summarize/summarize.h +12 -0
- data/ext/summarize/text.c +98 -0
- data/ext/summarize/wordlist.c +220 -0
- data/lib/summarize.rb +91 -0
- data/lib/summarize/summarize.bundle +0 -0
- data/sample_data/jupiter.txt +15 -0
- data/summarize.gemspec +21 -0
- metadata +140 -0
data/.gitignore
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# Summarize
|
2
|
+
|
3
|
+
## A Ruby C wrapper for Open Text Summarizer
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
### Manual install
|
8
|
+
|
9
|
+
git clone https://github.com/ssoper/summarize.git
|
10
|
+
cd summarize
|
11
|
+
rake build # install the rake-compiler gem if you don't already have it
|
12
|
+
gem build summarize.gemspec
|
13
|
+
gem install summarize-1.0.0.gem
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
A summarize method is added to File, which you can use to summarize the contents of any plain-text file:
|
18
|
+
|
19
|
+
File.open('path/to/file').summarize
|
20
|
+
|
21
|
+
Or use the String method
|
22
|
+
|
23
|
+
"text to summarize".summarize
|
24
|
+
|
25
|
+
By default it uses an English dictionary for summarizing, but dictionaries for 37 languages are included. Pass in a valid ISO 639 language code to use one of them. A ratio (default is 25%) can also be passed in.
|
26
|
+
|
27
|
+
# Parse an article using Portuguese stemming rules with a ratio of 50%
|
28
|
+
"text to summarize".summarize(:language => 'pt', :ratio => 50)
|
29
|
+
|
30
|
+
You can also use custom stemming rules
|
31
|
+
|
32
|
+
"text to summarize".summarize(:dictionary => 'path/to/custom/dictionary')
|
33
|
+
|
34
|
+
## Dependencies
|
35
|
+
|
36
|
+
You must have glib-2.0 and libxml-2.0 installed and properly configured.
|
37
|
+
|
38
|
+
## Author
|
39
|
+
|
40
|
+
Gem written by Sean Soper ([@ssoper](http://twitter.com/ssoper))
|
41
|
+
|
42
|
+
The Open Text Summarizer library was written by Nadav Rotem and can be found at <http://libots.sourceforge.net/>
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "rake"
|
3
|
+
require "benchmark"
|
4
|
+
|
5
|
+
# Built using rake-compiler (https://github.com/luislavena/rake-compiler)
|
6
|
+
# Rake::ExtensionTask comes from the rake-compiler gem; when it cannot be
# loaded, stop immediately and tell the user how to install it.
begin
  require "rake/extensiontask"
rescue LoadError
  abort "\nYou must install the rake-compiler gem\n\n gem install rake-compiler\n\n"
end

# Declare the tasks for the "summarize" C extension and point lib_dir at
# lib/summarize (presumably where the compiled library ends up -- see the
# rake-compiler documentation).
Rake::ExtensionTask.new("summarize") { |ext| ext.lib_dir = "lib/summarize" }

# `rake build` depends on the clean and compile tasks.
task :build => [:clean, :compile]
|
17
|
+
|
18
|
+
# `rake test` first runs the :build task, then puts lib/ on the load path,
# loads the library and test/unit, and defines the test case below.
# NOTE(review): nothing here runs the tests explicitly -- presumably
# test/unit's at-exit runner picks the class up; confirm.
task :test => :build do
  $:.unshift(File.join(File.dirname(__FILE__), "lib"))
  require "summarize"
  require "test/unit"

  class TestSummarizer < Test::Unit::TestCase
    # Loads the sample fixture. The tests split it on '||' into the article
    # text, the expected 25% summary and the expected 50% summary.
    def setup
      @jupiter = File.read(File.join(File.dirname(__FILE__), 'sample_data/jupiter.txt'))
    end

    # The summaries produced at 25% and 50% must match the fixture exactly.
    # assert_equal (rather than `assert a == b`) makes a failure report the
    # expected and actual text.
    def test_ratios
      content, ratio_25, ratio_50 = fixture_sections.map(&:strip)
      assert_equal ratio_25, content.summarize(:ratio => 25)
      assert_equal ratio_50, content.summarize(:ratio => 50)
    end

    # A language code that is not supported must raise RuntimeError.
    def test_unsupported_language
      content = fixture_sections.first
      assert_raise RuntimeError do
        content.summarize(:language => 'nonexistent')
      end
    end

    # A dictionary path that does not exist must raise RuntimeError.
    def test_invalid_dictionary_file
      content = fixture_sections.first
      assert_raise RuntimeError do
        content.summarize(:dictionary => '/path/to/nowhere')
      end
    end

    private

    # The raw (unstripped) '||'-separated sections of the fixture file.
    def fixture_sections
      @jupiter.split('||')
    end
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
* article.c
|
3
|
+
*
|
4
|
+
* Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
|
5
|
+
*
|
6
|
+
* This program is free software; you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU General Public License as published by
|
8
|
+
* the Free Software Foundation; either version 2 of the License, or
|
9
|
+
* (at your option) any later version.
|
10
|
+
*
|
11
|
+
* This program is distributed in the hope that it will be useful,
|
12
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
* GNU Library General Public License for more details.
|
15
|
+
*
|
16
|
+
* You should have received a copy of the GNU General Public License
|
17
|
+
* along with this program; if not, write to the Free Software
|
18
|
+
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
#include "libots.h"
|
26
|
+
#include "grader-tc.h"
|
27
|
+
|
28
|
+
extern void ots_free_TF_wordlist (GList * aList);
|
29
|
+
|
30
|
+
#define MAX_WORD_LENGTH 35
|
31
|
+
|
32
|
+
/*Data structure related functions*/
|
33
|
+
|
34
|
+
OtsSentence *
|
35
|
+
ots_new_sentence (void)
|
36
|
+
{
|
37
|
+
OtsSentence *aLine = g_new0 (OtsSentence, 1);
|
38
|
+
aLine->words = NULL;
|
39
|
+
aLine->wc = 0;
|
40
|
+
aLine->selected = 0;
|
41
|
+
aLine->score = 0;
|
42
|
+
return aLine;
|
43
|
+
}
|
44
|
+
|
45
|
+
void
|
46
|
+
ots_free_sentence (OtsSentence * sen)
|
47
|
+
{
|
48
|
+
if (sen != NULL)
|
49
|
+
{
|
50
|
+
g_list_foreach (sen->words, (GFunc) g_free, NULL);
|
51
|
+
g_list_free (sen->words);
|
52
|
+
g_free (sen);
|
53
|
+
}
|
54
|
+
sen=NULL;
|
55
|
+
}
|
56
|
+
|
57
|
+
OtsArticle *
|
58
|
+
ots_new_article (void)
|
59
|
+
{
|
60
|
+
OtsArticle *Doc;
|
61
|
+
Doc = g_new0 (OtsArticle, 1);
|
62
|
+
Doc->lineCount = 0;
|
63
|
+
Doc->title = NULL;
|
64
|
+
Doc->stem=new_stem_rule ();
|
65
|
+
Doc->lines=NULL;
|
66
|
+
Doc->dict = NULL;
|
67
|
+
Doc->ImpWords = NULL;
|
68
|
+
Doc->wordStat = NULL;
|
69
|
+
|
70
|
+
Doc->tf_terms=NULL;
|
71
|
+
return Doc;
|
72
|
+
}
|
73
|
+
|
74
|
+
void
|
75
|
+
ots_free_article (OtsArticle * art)
|
76
|
+
{
|
77
|
+
if (NULL != art)
|
78
|
+
{
|
79
|
+
free_stem_rule (art->stem);
|
80
|
+
ots_free_wordlist (art->dict);
|
81
|
+
ots_free_wordlist (art->ImpWords);
|
82
|
+
ots_free_wordlist (art->wordStat);
|
83
|
+
|
84
|
+
ots_free_TF_wordlist(art->tf_terms);
|
85
|
+
|
86
|
+
g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL);
|
87
|
+
g_list_free (art->lines);
|
88
|
+
|
89
|
+
if (art->title != NULL) g_free (art->title);
|
90
|
+
g_free (art);
|
91
|
+
}
|
92
|
+
art=NULL;
|
93
|
+
}
|
94
|
+
|
95
|
+
OtsSentence *
|
96
|
+
ots_append_line (OtsArticle * Doc)
|
97
|
+
{
|
98
|
+
OtsSentence *aLine = ots_new_sentence ();
|
99
|
+
Doc->lineCount++;
|
100
|
+
Doc->lines = g_list_append (Doc->lines, aLine);
|
101
|
+
return aLine;
|
102
|
+
}
|
103
|
+
|
104
|
+
void
|
105
|
+
ots_append_word (OtsSentence * aLine,unsigned const char *aWord)
|
106
|
+
{
|
107
|
+
if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return;
|
108
|
+
aLine->wc++;
|
109
|
+
aLine->words = g_list_append (aLine->words, (gpointer) g_strdup (aWord));
|
110
|
+
return;
|
111
|
+
}
|
112
|
+
|
113
|
+
|
114
|
+
gboolean
|
115
|
+
ots_is_line_selected(const OtsSentence *aLine)
|
116
|
+
{
|
117
|
+
if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;}
|
118
|
+
return (aLine->selected);
|
119
|
+
}
|
@@ -0,0 +1,101 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="bulgarian">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>август</word>
|
62
|
+
<word>април</word>
|
63
|
+
<word>в</word>
|
64
|
+
<word>всеки</word>
|
65
|
+
<word>всичко</word>
|
66
|
+
<word>вторник</word>
|
67
|
+
<word>да</word>
|
68
|
+
<word>декември</word>
|
69
|
+
<word>за</word>
|
70
|
+
<word>и</word>
|
71
|
+
<word>или</word>
|
72
|
+
<word>има</word>
|
73
|
+
<word>което</word>
|
74
|
+
<word>към</word>
|
75
|
+
<word>май</word>
|
76
|
+
<word>март</word>
|
77
|
+
<word>на</word>
|
78
|
+
<word>не</word>
|
79
|
+
<word>неделя</word>
|
80
|
+
<word>ноември</word>
|
81
|
+
<word>октомври</word>
|
82
|
+
<word>от</word>
|
83
|
+
<word>петък</word>
|
84
|
+
<word>по</word>
|
85
|
+
<word>понеделник</word>
|
86
|
+
<word>при</word>
|
87
|
+
<word>с</word>
|
88
|
+
<word>септември</word>
|
89
|
+
<word>сряда</word>
|
90
|
+
<word>сто</word>
|
91
|
+
<word>събота</word>
|
92
|
+
<word>трябва</word>
|
93
|
+
<word>февруари</word>
|
94
|
+
<word>хиляда</word>
|
95
|
+
<word>че</word>
|
96
|
+
<word>четвъртък</word>
|
97
|
+
<word>юли</word>
|
98
|
+
<word>юни</word>
|
99
|
+
<word>януари</word>
|
100
|
+
</grader-tc>
|
101
|
+
</dictionary>
|
@@ -0,0 +1,141 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<dictionary lang="catalan">
|
3
|
+
<stemmer>
|
4
|
+
<step1_pre>
|
5
|
+
<rule>"|</rule>
|
6
|
+
<rule>(|</rule>
|
7
|
+
</step1_pre>
|
8
|
+
|
9
|
+
|
10
|
+
<step1_post>
|
11
|
+
<rule>."|</rule>
|
12
|
+
<rule>,"|</rule>
|
13
|
+
<rule>.|</rule>
|
14
|
+
<rule>,|</rule>
|
15
|
+
<rule>"|</rule>
|
16
|
+
<rule>)|</rule>
|
17
|
+
<rule>?|</rule>
|
18
|
+
<rule>:|</rule>
|
19
|
+
<rule>;|</rule>
|
20
|
+
<rule>!|</rule>
|
21
|
+
</step1_post>
|
22
|
+
|
23
|
+
|
24
|
+
<manual>
|
25
|
+
<rule>wrote|write</rule>
|
26
|
+
<rule>came|come</rule>
|
27
|
+
<rule>went|go</rule>
|
28
|
+
</manual>
|
29
|
+
|
30
|
+
<post>
|
31
|
+
<rule>before1|1after</rule>
|
32
|
+
</post>
|
33
|
+
<pre>
|
34
|
+
<rule>before1|1after</rule>
|
35
|
+
</pre>
|
36
|
+
</stemmer>
|
37
|
+
<parser>
|
38
|
+
|
39
|
+
<linebreak>
|
40
|
+
<rule>."</rule>
|
41
|
+
<rule>?"</rule>
|
42
|
+
<rule>!"</rule>
|
43
|
+
<rule>,"</rule>
|
44
|
+
<rule>.</rule>
|
45
|
+
<rule>?</rule>
|
46
|
+
<rule>;</rule>
|
47
|
+
<rule>|</rule>
|
48
|
+
<rule>!</rule>
|
49
|
+
</linebreak>
|
50
|
+
|
51
|
+
<linedontbreak>
|
52
|
+
<rule>Dr.</rule>
|
53
|
+
<rule>Mr.</rule>
|
54
|
+
<rule>Mrs.</rule>
|
55
|
+
<rule>U.S.</rule>
|
56
|
+
<rule>Rep.</rule>
|
57
|
+
<rule>Sen.</rule>
|
58
|
+
</linedontbreak>
|
59
|
+
</parser>
|
60
|
+
<grader-tc>
|
61
|
+
<word>a</word>
|
62
|
+
<word>abans</word>
|
63
|
+
<word>al</word>
|
64
|
+
<word>amb</word>
|
65
|
+
<word>ambdós</word>
|
66
|
+
<word>anar</word>
|
67
|
+
<word>ara</word>
|
68
|
+
<word>baix</word>
|
69
|
+
<word>cap</word>
|
70
|
+
<word>cert</word>
|
71
|
+
<word>com</word>
|
72
|
+
<word>cuál</word>
|
73
|
+
<word>damunt</word>
|
74
|
+
<word>de</word>
|
75
|
+
<word>dins</word>
|
76
|
+
<word>doble</word>
|
77
|
+
<word>dos</word>
|
78
|
+
<word>dues</word>
|
79
|
+
<word>el</word>
|
80
|
+
<word>ell</word>
|
81
|
+
<word>ella</word>
|
82
|
+
<word>elles</word>
|
83
|
+
<word>ells</word>
|
84
|
+
<word>els</word>
|
85
|
+
<word>en</word>
|
86
|
+
<word>ésser</word>
|
87
|
+
<word>estar</word>
|
88
|
+
<word>excepte</word>
|
89
|
+
<word>jo</word>
|
90
|
+
<word>la</word>
|
91
|
+
<word>les</word>
|
92
|
+
<word>lluny</word>
|
93
|
+
<word>lo</word>
|
94
|
+
<word>los</word>
|
95
|
+
<word>mai</word>
|
96
|
+
<word>me</word>
|
97
|
+
<word>meu</word>
|
98
|
+
<word>meus</word>
|
99
|
+
<word>meva</word>
|
100
|
+
<word>meves</word>
|
101
|
+
<word>mí</word>
|
102
|
+
<word>na</word>
|
103
|
+
<word>nos</word>
|
104
|
+
<word>nosaltres</word>
|
105
|
+
<word>nostra</word>
|
106
|
+
<word>nostre</word>
|
107
|
+
<word>nostres</word>
|
108
|
+
<word>qual</word>
|
109
|
+
<word>quals</word>
|
110
|
+
<word>quan</word>
|
111
|
+
<word>quelcom</word>
|
112
|
+
<word>quin</word>
|
113
|
+
<word>quina</word>
|
114
|
+
<word>quines</word>
|
115
|
+
<word>quins</word>
|
116
|
+
<word>se</word>
|
117
|
+
<word>ser</word>
|
118
|
+
<word>seu</word>
|
119
|
+
<word>seus</word>
|
120
|
+
<word>seva</word>
|
121
|
+
<word>seves</word>
|
122
|
+
<word>sí</word>
|
123
|
+
<word>tenir</word>
|
124
|
+
<word>teu</word>
|
125
|
+
<word>teus</word>
|
126
|
+
<word>teva</word>
|
127
|
+
<word>teves</word>
|
128
|
+
<word>tu</word>
|
129
|
+
<word>u</word>
|
130
|
+
<word>un</word>
|
131
|
+
<word>una</word>
|
132
|
+
<word>unes</word>
|
133
|
+
<word>uns</word>
|
134
|
+
<word>vosaltres</word>
|
135
|
+
<word>vostè</word>
|
136
|
+
<word>vostès</word>
|
137
|
+
<word>vostra</word>
|
138
|
+
<word>vostre</word>
|
139
|
+
<word>vostres</word>
|
140
|
+
</grader-tc>
|
141
|
+
</dictionary>
|