summarize 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +2 -2
- data/ext/summarize/extconf.rb +6 -2
- data/ext/summarize/grader-tf.c +4 -1
- data/ext/summarize/highlighter.c +1 -1
- data/ext/summarize/libots.h +1 -1
- data/ext/summarize/parser.c +2 -3
- data/ext/summarize/relations.c +2 -2
- data/ext/summarize/stemmer.c +3 -3
- data/ext/summarize/summarize.c +3 -4
- data/ext/summarize/text.c +2 -2
- data/lib/summarize.rb +7 -1
- metadata +2 -2
data/README.markdown
CHANGED
@@ -12,7 +12,7 @@
|
|
12
12
|
cd summarize
|
13
13
|
rake build
|
14
14
|
gem build summarize.gemspec
|
15
|
-
gem install summarize-1.0.gem
|
15
|
+
gem install summarize-1.0.1.gem
|
16
16
|
|
17
17
|
## Usage
|
18
18
|
|
@@ -27,7 +27,7 @@ Or use the String method
|
|
27
27
|
By default it uses an English dictionary for summarizing but forty languages are supported. Pass in the valid ISO 639 language code to use one. A ratio (default is 25%) can also be passed in.
|
28
28
|
|
29
29
|
# Parse an article using Portuguese stemming rules with a ratio of 50%
|
30
|
-
"
|
30
|
+
"texto para sumariar".summarize(:language => 'pt', :ratio => 50)
|
31
31
|
|
32
32
|
You can also use custom stemming rules
|
33
33
|
|
data/ext/summarize/extconf.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
require 'mkmf'
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
%w(glib-2.0 libxml-2.0).each do |lib|
|
4
|
+
pkg_config lib
|
5
|
+
end
|
6
|
+
|
7
|
+
# $CFLAGS = ENV["CFLAGS"].to_s + " " + `pkg-config --cflags glib-2.0 libxml-2.0`.chomp
|
8
|
+
# $LDFLAGS = ENV["LDFLAGS"].to_s + " " + `pkg-config --libs glib-2.0 libxml-2.0`.chomp
|
5
9
|
|
6
10
|
create_makefile('summarize/summarize')
|
data/ext/summarize/grader-tf.c
CHANGED
@@ -102,11 +102,14 @@ N = (total-number-of-sentences)
|
|
102
102
|
f = n/N
|
103
103
|
*/
|
104
104
|
|
105
|
+
/*
|
106
|
+
ssoper: unused and causing warning messages
|
107
|
+
|
105
108
|
double
|
106
109
|
ots_calc_idf (const int term_count,const int doc_word_count)
|
107
110
|
{
|
108
111
|
return -log(doc_word_count/term_count);
|
109
|
-
}
|
112
|
+
}*/
|
110
113
|
|
111
114
|
double
|
112
115
|
ots_calc_tf (const int term_count,const int doc_word_count)
|
data/ext/summarize/highlighter.c
CHANGED
@@ -31,7 +31,7 @@ static int
|
|
31
31
|
ots_highlight_max_line (OtsArticle * Doc)
|
32
32
|
{
|
33
33
|
GList *li;
|
34
|
-
int max = 0;
|
34
|
+
long int max = 0;
|
35
35
|
for (li = (GList *) Doc->lines; li != NULL; li = li->next)
|
36
36
|
{
|
37
37
|
if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */
|
data/ext/summarize/libots.h
CHANGED
@@ -149,7 +149,7 @@ GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_co
|
|
149
149
|
|
150
150
|
|
151
151
|
/*Gives a score on the relations between two lists of topics; similar to the inner product*/
|
152
|
-
int ots_topic_list_score(
|
152
|
+
int ots_topic_list_score(GList *topic_list1, GList *topic_list2);
|
153
153
|
|
154
154
|
G_END_DECLS
|
155
155
|
|
data/ext/summarize/parser.c
CHANGED
@@ -29,8 +29,7 @@
|
|
29
29
|
int
|
30
30
|
ots_match_post (const char *aWord,const char *post)
|
31
31
|
{
|
32
|
-
int i, wlen, plen;
|
33
|
-
|
32
|
+
long int i, wlen, plen;
|
34
33
|
|
35
34
|
wlen = strlen (aWord);
|
36
35
|
plen = strlen (post);
|
@@ -118,7 +117,7 @@ ots_parse_stream(const unsigned char *utf8, size_t len, OtsArticle * Doc) /*pars
|
|
118
117
|
OtsSentence *tmpLine = ots_append_line (Doc);
|
119
118
|
OtsStemRule * rule=Doc->stem;
|
120
119
|
gunichar uc;
|
121
|
-
|
120
|
+
size_t index = 0;
|
122
121
|
char *s = (char *) utf8;
|
123
122
|
GString *word_buffer = g_string_new (NULL);
|
124
123
|
|
data/ext/summarize/relations.c
CHANGED
@@ -132,8 +132,8 @@ return topics;
|
|
132
132
|
|
133
133
|
/*Gives a score on the relations between two lists of topics; similar to the inner product*/
|
134
134
|
int ots_topic_list_score(
|
135
|
-
|
136
|
-
|
135
|
+
GList *topic_list1,
|
136
|
+
GList *topic_list2)
|
137
137
|
{
|
138
138
|
int count=0;
|
139
139
|
GList *tmplist1;
|
data/ext/summarize/stemmer.c
CHANGED
@@ -70,7 +70,7 @@ if (rule != NULL)
|
|
70
70
|
static void
|
71
71
|
ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b) /*given already allocated part_a and b */
|
72
72
|
{ /*example "red|blue" */
|
73
|
-
int i, j, clen;
|
73
|
+
long int i, j, clen;
|
74
74
|
i = 0;
|
75
75
|
j = 0;
|
76
76
|
|
@@ -106,7 +106,7 @@ ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *p
|
|
106
106
|
static unsigned char *
|
107
107
|
ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new)
|
108
108
|
{
|
109
|
-
int i, plen, wlen, nlen;
|
109
|
+
long int i, plen, wlen, nlen;
|
110
110
|
unsigned char *new_str = NULL;
|
111
111
|
|
112
112
|
if (aWord==NULL) return NULL;
|
@@ -135,7 +135,7 @@ ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigne
|
|
135
135
|
static unsigned char *
|
136
136
|
ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new)
|
137
137
|
{
|
138
|
-
unsigned int i, wlen, plen, nlen;
|
138
|
+
unsigned long int i, wlen, plen, nlen;
|
139
139
|
unsigned char *new_str = NULL;
|
140
140
|
|
141
141
|
if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL;
|
data/ext/summarize/summarize.c
CHANGED
@@ -9,15 +9,14 @@
|
|
9
9
|
#include "libots.h"
|
10
10
|
#include "summarize.h"
|
11
11
|
|
12
|
-
const char *OTS_ERROR_BAD_DICT = "Cannot load dictionary file";
|
13
12
|
|
14
13
|
void Init_summarize() {
|
15
14
|
VALUE rb_mOts = rb_define_module("Summarize");
|
16
15
|
rb_define_module_function(rb_mOts, "summarize", summarize, 3);
|
17
16
|
}
|
18
17
|
|
19
|
-
static VALUE summarize(const VALUE self, const VALUE rb_str, const VALUE rb_dict_file, const VALUE rb_ratio) {
|
20
|
-
int length = RSTRING_LEN(rb_str);
|
18
|
+
static VALUE summarize(const VALUE self, volatile VALUE rb_str, volatile VALUE rb_dict_file, const VALUE rb_ratio) {
|
19
|
+
long int length = RSTRING_LEN(rb_str);
|
21
20
|
char *text = StringValuePtr(rb_str);
|
22
21
|
char *dictionary_file = StringValuePtr(rb_dict_file);
|
23
22
|
int ratio = NUM2INT(rb_ratio);
|
@@ -27,7 +26,7 @@ static VALUE summarize(const VALUE self, const VALUE rb_str, const VALUE rb_dict
|
|
27
26
|
|
28
27
|
if (!ots_load_xml_dictionary(doc, dictionary_file)) {
|
29
28
|
ots_free_article(doc);
|
30
|
-
rb_raise(rb_eRuntimeError, OTS_ERROR_BAD_DICT);
|
29
|
+
rb_raise(rb_eRuntimeError, "Cannot load dictionary file");
|
31
30
|
return Qnil;
|
32
31
|
}
|
33
32
|
|
data/ext/summarize/text.c
CHANGED
@@ -57,7 +57,7 @@ static void
|
|
57
57
|
ots_print_line (FILE * stream, const OtsSentence * aLine)
|
58
58
|
{
|
59
59
|
unsigned char *utf8_txt;
|
60
|
-
size_t len;
|
60
|
+
size_t len = 0;
|
61
61
|
utf8_txt = ots_get_line_text (aLine, TRUE, &len);
|
62
62
|
fwrite (utf8_txt, 1, len, stream);
|
63
63
|
g_free (utf8_txt);
|
@@ -69,7 +69,7 @@ ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
|
|
69
69
|
GList *li;
|
70
70
|
GString *text;
|
71
71
|
unsigned char *utf8_data;
|
72
|
-
size_t line_len;
|
72
|
+
size_t line_len = 0;
|
73
73
|
|
74
74
|
text = g_string_new (NULL);
|
75
75
|
|
data/lib/summarize.rb
CHANGED
@@ -10,7 +10,7 @@ class Hash #:nodoc:
|
|
10
10
|
end unless {}.respond_to? 'symbolize_keys'
|
11
11
|
|
12
12
|
module Summarize
|
13
|
-
VERSION = "1.0.1"
|
13
|
+
VERSION = "1.0.2"
|
14
14
|
|
15
15
|
LANGUAGES = [
|
16
16
|
'bg', # Bulgarian
|
@@ -84,6 +84,9 @@ class String
|
|
84
84
|
# language::
|
85
85
|
# An ISO 639-1 language code. See Summarize::LANGUAGES for the supported list.
|
86
86
|
#
|
87
|
+
# dictionary::
|
88
|
+
# A path to a custom stemming XML file
|
89
|
+
#
|
87
90
|
# == Returns:
|
88
91
|
# A string summary
|
89
92
|
#
|
@@ -105,6 +108,9 @@ class File
|
|
105
108
|
# language::
|
106
109
|
# An ISO 639-1 language code. See Summarize::LANGUAGES for the supported list.
|
107
110
|
#
|
111
|
+
# dictionary::
|
112
|
+
# A path to a custom stemming XML file
|
113
|
+
#
|
108
114
|
# == Returns:
|
109
115
|
# A string summary
|
110
116
|
#
|