ots 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +25 -0
- data/VERSION +1 -0
- data/ext/extconf.rb +14 -0
- data/ext/ots.c +197 -0
- data/lib/ots.rb +1 -0
- data/test/ots_test.rb +62 -0
- metadata +82 -0
data/README
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
ots is an interface to libots - The open text summarizer
|
2
|
+
|
3
|
+
INSTALL:
|
4
|
+
|
5
|
+
sudo gem install ots --source http://gems.github.com
|
6
|
+
|
7
|
+
REQUIREMENT:
|
8
|
+
|
9
|
+
* Ruby >= 1.8.7 ( >= 1.9.1 recommended)
|
10
|
+
* rubygems >= 1.3.5
|
11
|
+
* ruby development libraries (debian: ruby1.8-dev, ruby1.9.1-dev)
|
12
|
+
* libxml2 development libraries (debian: libxml2-dev)
|
13
|
+
* libots development libraries (debian: libots-dev)
|
14
|
+
* glib2.0 development libraries (debian: libglib2.0-dev)
|
15
|
+
|
16
|
+
USAGE:
|
17
|
+
|
18
|
+
>> require "rubygems"
|
19
|
+
>> require "ots"
|
20
|
+
>> summarizer = OTS.new
|
21
|
+
>> summarizer.parse("I think I need some ice cream to cool me off. It is too hot down under")
|
22
|
+
>> summarizer.title
|
23
|
+
=> "hot,cool,cream,ice,think"
|
24
|
+
>> summarizer.summarize(:lines => 1)
|
25
|
+
=> [ { :sentence => "I think I need some ice cream to cool me off", :score => 57 } ]
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4.2
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'mkmf'

# Locate glib through pkg-config when it is available: the include directory
# for glibconfig.h differs between distributions (e.g. multiarch layouts), so
# hard-coded paths are only kept as a fallback. Flags are appended rather than
# assigned so that anything already configured for this Ruby (or passed in by
# the user) is preserved.
unless pkg_config('glib-2.0')
  $CFLAGS  = "#{$CFLAGS} -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include"
  $LDFLAGS = "#{$LDFLAGS} -lglib-2.0"
end
$CFLAGS = "#{$CFLAGS} -Wall"

# Allow --with-libots-dir / --with-libots-include / --with-libots-lib overrides.
dir_config("libots", ["/usr/local", "/opt/local", "/usr"])

headers = [ 'stdio.h', 'stdlib.h', 'string.h', 'libots-1/ots/libots.h' ]

# Refuse to generate a Makefile unless both the libots header and the library
# (probed via ots_new_article) are usable.
unless have_header('libots-1/ots/libots.h') && have_library('ots-1', 'ots_new_article', headers)
  abort "Cannot find libots headers or libraries"
end

create_makefile 'ots'
|
data/ext/ots.c
ADDED
@@ -0,0 +1,197 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
/* ruby 1.9 only */
|
4
|
+
#ifdef RUBY_VM
|
5
|
+
#include <ruby/encoding.h>
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
|
+
#include <string.h>
|
11
|
+
|
12
|
+
#include <libots-1/ots/libots.h>
|
13
|
+
|
14
|
+
#define ID_CONST_GET rb_intern("const_get")
|
15
|
+
#define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
|
16
|
+
|
17
|
+
static VALUE rb_cOTS;
|
18
|
+
static VALUE eLoadError;
|
19
|
+
static VALUE eRuntimeError;
|
20
|
+
static VALUE eArgumentError;
|
21
|
+
|
22
|
+
typedef struct {
|
23
|
+
gchar *word; /* the word */
|
24
|
+
gchar *stem; /*stem of the word*/
|
25
|
+
gint occ; /* how many times have we seen this word in the text? */
|
26
|
+
} OtsWordEntery;
|
27
|
+
|
28
|
+
|
29
|
+
/* helpers */
|
30
|
+
|
31
|
+
/*
 * Return the OtsArticle held by the receiver's @article instance variable.
 *
 * self             - the OTS instance.
 * error_on_missing - when @article is nil: TRUE raises RuntimeError,
 *                    FALSE makes the function return NULL.
 *
 * Returns the data pointer of the object stored in @article.
 */
OtsArticle* get_article(VALUE self, gboolean error_on_missing) {
    VALUE wrapped = rb_iv_get(self, "@article");

    if (wrapped != Qnil)
        return (OtsArticle *)DATA_PTR(wrapped);

    if (!error_on_missing)
        return NULL;

    rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
    return NULL; /* not reached: rb_raise does not return */
}
|
41
|
+
|
42
|
+
/*
 * Release the libots article owned by the receiver, if there is one.
 *
 * Safe to call when @article is nil (nothing has been parsed or loaded yet)
 * and safe to call more than once: after freeing, both the wrapper's data
 * pointer and @article itself are cleared, so later calls to get_article see
 * "no article" instead of a pointer to freed memory.
 */
void rb_ots_free_article(VALUE self) {
    VALUE wrapped = rb_iv_get(self, "@article");
    OtsArticle *article;

    if (wrapped == Qnil)
        return;

    article = (OtsArticle *)DATA_PTR(wrapped);
    if (article != NULL) {
        ots_free_article(article);
        DATA_PTR(wrapped) = NULL;
    }

    /* Drop the reference so the stale wrapper can never be dereferenced. */
    rb_iv_set(self, "@article", Qnil);
}
|
46
|
+
|
47
|
+
/*
 * Build a Ruby String from a NUL-terminated C string.
 *
 * utf8 - the bytes to copy; must not be NULL (strlen is applied to it).
 *
 * On Ruby 1.9+ (RUBY_VM defined) the new string is tagged with the UTF-8
 * encoding and its cached code range is cleared.
 */
VALUE rb_string(char *utf8) {
    size_t nbytes = strlen(utf8);
    VALUE result = rb_str_new(utf8, nbytes);

    /* ruby 1.9 only - force bytestream to utf8 */
#ifdef RUBY_VM
    {
        VALUE encoding_name = rb_str_new2("UTF-8");
        rb_enc_associate(result, rb_to_encoding(encoding_name));
        ENC_CODERANGE_CLEAR(result);
    }
#endif

    return result;
}
|
58
|
+
|
59
|
+
/* ruby libots methods/wrappers */
|
60
|
+
|
61
|
+
/*
 * (Re)initialise the receiver with a brand new, empty libots article.
 *
 * If an article already exists it is freed, and the current @dict value is
 * carried over to the fresh state; otherwise @dict is reset to nil.
 * The new article is wrapped in a data object with no mark/free callbacks
 * and stored in @article.
 *
 * Returns self.
 */
VALUE rb_ots_init(VALUE self) {
    VALUE kept_dict = Qnil;
    OtsArticle *previous = get_article(self, FALSE);
    OtsArticle *fresh;
    VALUE wrapper;

    if (previous != NULL) {
        kept_dict = rb_iv_get(self, "@dict");
        ots_free_article(previous);
    }

    fresh = ots_new_article();
    wrapper = Data_Wrap_Struct(rb_cObject, 0, 0, fresh);

    rb_iv_set(self, "@article", wrapper);
    rb_iv_set(self, "@dict", kept_dict);
    return self;
}
|
73
|
+
|
74
|
+
/*
 * OTS#load_dictionary(dict)
 *
 * Load a libots XML dictionary into the receiver's article, creating the
 * article first when none exists yet.
 *
 * dict - dictionary name as a String (or anything responding to to_str);
 *        nil selects the default "en".
 *
 * Returns true and remembers the choice in @dict.
 * Raises TypeError when dict is neither nil nor string-like, ArgumentError
 * when it contains a NUL byte (both via StringValueCStr), and LoadError when
 * libots cannot load the dictionary.
 */
VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) {
    const char *dict_cstr = "en";
    OtsArticle *article;

    /* Validate the argument instead of reading raw bytes from any VALUE. */
    if (dict != Qnil)
        dict_cstr = StringValueCStr(dict);

    article = get_article(self, FALSE);
    if (article == NULL) {
        rb_ots_init(self);
        article = get_article(self, TRUE);
    }

    if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
        rb_ots_free_article(self);
        /* Make sure no reference to the freed article survives the raise. */
        rb_iv_set(self, "@article", Qnil);
        rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
    }

    rb_iv_set(self, "@dict", dict);
    return Qtrue;
}
|
92
|
+
|
93
|
+
/*
 * OTS#parse(string)
 *
 * Replace the receiver's article with a fresh one, reload the dictionary
 * recorded in @dict, then parse and grade the given text.
 *
 * string - the text to summarise; must be a String (or respond to to_str).
 *
 * Returns true.
 * Raises TypeError for a non-string argument; this check runs before the
 * existing article is discarded, so a bad call leaves the receiver untouched.
 * Errors from rb_ots_load_dictionary propagate.
 */
VALUE rb_ots_parse_string(VALUE self, VALUE string) {
    OtsArticle *article;

    /* Guarantee RSTRING_PTR/RSTRING_LEN below operate on a real String. */
    StringValue(string);

    rb_ots_init(self);
    rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
    article = get_article(self, TRUE);

    ots_parse_stream((const unsigned char *)RSTRING_PTR(string),
                     (size_t)RSTRING_LEN(string),
                     article);
    ots_grade_doc(article);
    return Qtrue;
}
|
104
|
+
|
105
|
+
/*
 * Ask libots to highlight `lines` sentences of the receiver's article.
 * Raises RuntimeError (through get_article) when no article exists.
 * Returns true.
 *
 * NOTE(review): Init_ots also registers this function as a one-argument Ruby
 * method, and Ruby hands method arguments over as VALUE rather than int, so a
 * direct call from Ruby would not deliver a plain C integer here - confirm,
 * and consider a VALUE-taking wrapper. rb_summarize passes a real int.
 */
VALUE rb_ots_highlight_lines(VALUE self, int lines) {
    OtsArticle *doc = get_article(self, TRUE);

    ots_highlight_doc_lines(doc, lines);
    return Qtrue;
}
|
110
|
+
|
111
|
+
/*
 * Ask libots to highlight the receiver's article using `percent` as the
 * amount passed to ots_highlight_doc.
 * Raises RuntimeError (through get_article) when no article exists.
 * Returns true.
 *
 * NOTE(review): Init_ots also registers this function as a one-argument Ruby
 * method, and Ruby hands method arguments over as VALUE rather than int, so a
 * direct call from Ruby would not deliver a plain C integer here - confirm,
 * and consider a VALUE-taking wrapper. rb_summarize passes a real int.
 */
VALUE rb_ots_highlight_percent(VALUE self, int percent) {
    OtsArticle *doc = get_article(self, TRUE);

    ots_highlight_doc(doc, percent);
    return Qtrue;
}
|
116
|
+
|
117
|
+
/*
 * OTS#title
 *
 * Returns the article's title field as a Ruby String, or nil when libots has
 * not set one. Raises RuntimeError (through get_article) when no article
 * exists.
 */
VALUE rb_ots_article_title(VALUE self) {
    OtsArticle *doc = get_article(self, TRUE);

    return (doc->title == NULL) ? Qnil : rb_string(doc->title);
}
|
124
|
+
|
125
|
+
/*
 * OTS#keywords
 *
 * Walk the article's ImpWords list and collect every non-empty word.
 *
 * Returns an Array of Strings, in list order. Entries with no data, a NULL
 * word pointer or an empty word are skipped (the NULL-word guard prevents
 * reading through a null pointer).
 * Raises RuntimeError (through get_article) when no article exists.
 */
VALUE rb_ots_article_keywords(VALUE self) {
    OtsArticle *article = get_article(self, TRUE);
    VALUE iwords = rb_ary_new();
    GList *node;

    for (node = article->ImpWords; node != NULL; node = node->next) {
        OtsWordEntery *entry = (OtsWordEntery *)node->data;

        if (entry == NULL || entry->word == NULL || entry->word[0] == '\0')
            continue;

        rb_ary_push(iwords, rb_string(entry->word));
    }

    return iwords;
}
|
138
|
+
|
139
|
+
/*
 * OTS#highlighted_content
 *
 * Collect every sentence libots has marked as selected.
 *
 * Returns an Array of Hashes, one per selected sentence in document order:
 *   { :sentence => String, :score => Integer }
 * Raises RuntimeError (through get_article) when no article exists.
 */
VALUE rb_ots_get_highlighted_lines(VALUE self) {
    OtsArticle *article = get_article(self, TRUE);
    VALUE hlt_lines = rb_ary_new();
    /* Hash keys are loop-invariant; build them once. */
    VALUE key_sentence = ID2SYM(rb_intern("sentence"));
    VALUE key_score = ID2SYM(rb_intern("score"));
    GList *curr_line;

    for (curr_line = article->lines; curr_line != NULL; curr_line = g_list_next(curr_line)) {
        OtsSentence *sentence = (OtsSentence *)curr_line->data;
        size_t len = 0;
        unsigned char *content;
        VALUE hlt_line;

        if (sentence == NULL || !sentence->selected)
            continue;

        content = ots_get_line_text(sentence, TRUE, &len);
        if (content == NULL)
            continue;

        hlt_line = rb_hash_new();
        rb_hash_aset(hlt_line, key_sentence, rb_string((char *)content));
        /*
         * rb_string copied the bytes. The text buffer returned by
         * ots_get_line_text belongs to the caller (libots detaches it from
         * its GString), so release it here instead of leaking one buffer per
         * selected sentence.
         */
        g_free(content);

        rb_hash_aset(hlt_line, key_score, LONG2FIX(sentence->score));
        rb_ary_push(hlt_lines, hlt_line);
    }

    return hlt_lines;
}
|
160
|
+
|
161
|
+
/*
 * OTS#summarize(options)
 *
 * Highlight the parsed article and return the selected sentences.
 *
 * options - a Hash containing exactly one of:
 *             :lines   => Integer  number of sentences to select
 *             :percent => Integer  amount passed to ots_highlight_doc
 *
 * Returns the Array produced by rb_ots_get_highlighted_lines.
 * Raises TypeError when options is not a Hash or the chosen value is not an
 * integer, ArgumentError when both or neither key is given, and RuntimeError
 * when nothing has been parsed yet.
 *
 * Invalid arguments no longer free the parsed article: a bad call must not
 * destroy state that a corrected retry still needs.
 */
VALUE rb_summarize(VALUE self, VALUE options) {
    VALUE lines, percent;

    /* rb_hash_aref on a non-Hash VALUE is undefined; reject it up front. */
    Check_Type(options, T_HASH);

    lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
    percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));

    if (lines != Qnil && percent != Qnil)
        rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");

    if (lines == Qnil && percent == Qnil)
        rb_raise(eArgumentError, "Need either :lines or :percent to summarize");

    /* NUM2INT type- and range-checks the value; FIX2INT would not. */
    if (lines != Qnil)
        rb_ots_highlight_lines(self, NUM2INT(lines));
    else
        rb_ots_highlight_percent(self, NUM2INT(percent));

    return rb_ots_get_highlighted_lines(self);
}
|
181
|
+
|
182
|
+
/* init */
|
183
|
+
|
184
|
+
void Init_ots(void) {
|
185
|
+
eLoadError = CONST_GET(rb_mKernel, "LoadError");
|
186
|
+
eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
|
187
|
+
eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
|
188
|
+
rb_cOTS = rb_define_class("OTS", rb_cObject);
|
189
|
+
rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
|
190
|
+
rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
|
191
|
+
rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
|
192
|
+
rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
|
193
|
+
rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
|
194
|
+
rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
|
195
|
+
rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
|
196
|
+
rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
|
197
|
+
}
|
data/lib/ots.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Load the compiled extension from the ext/ directory next to lib/.
ext_path = File.join(File.dirname(__FILE__), '..', 'ext', 'ots')
require ext_path
|
data/test/ots_test.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises the OTS extension end to end: title, keyword and summary
# extraction from a short sample text, plus UTF-8 round-tripping.
class OTSTest < Test::Unit::TestCase

  SAMPLE = <<-TEXT
    The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
    It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
    Pacific subspecies.
  TEXT

  context 'Title' do
    should 'extract title from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      assert_equal 'species,turtle,subspecies,pacific,atlantic', ots.title
    end
  end

  context 'Keywords' do
    should 'extract keywords from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      # %w (no interpolation) is all that is needed for a literal word list.
      assert_equal %w(
        species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
        belonging sea endangered critically hawksbill
      ), ots.keywords
    end
  end

  context 'Summary' do
    should 'extract summary lines from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      # Collapse the heredoc's line breaks and indentation before comparing.
      lines = ots.summarize(:lines => 2).map do |value|
        { :sentence => value[:sentence].gsub(/\n\s*/, ' ').strip, :score => value[:score] }
      end

      assert_equal [
        {
          :sentence => "The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.",
          :score    => 48
        },
        {
          :sentence => "The species has a worldwide distribution, with Atlantic and Pacific subspecies.",
          :score    => 20
        }
      ], lines
    end

    should 'utf8 encode strings properly' do
      ots = OTS.new
      text = "The hawksbill turtle\xE2\x80\x93is critically endangered."
      text.force_encoding('UTF-8') if RUBY_VERSION >= "1.9"

      ots.parse(text)
      summary = ots.summarize(:lines => 1).first[:sentence]
      assert_equal text, summary
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ots
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 4
|
8
|
+
- 2
|
9
|
+
version: 0.4.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Bharanee Rathna
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-02-16 00:00:00 +11:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: shoulda
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 2
|
30
|
+
- 10
|
31
|
+
version: "2.10"
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: Ruby interface to libots libraries for unix.
|
35
|
+
email: deepfryed@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions:
|
39
|
+
- ext/extconf.rb
|
40
|
+
extra_rdoc_files:
|
41
|
+
- README
|
42
|
+
files:
|
43
|
+
- README
|
44
|
+
- VERSION
|
45
|
+
- ext/ots.c
|
46
|
+
- lib/ots.rb
|
47
|
+
- test/ots_test.rb
|
48
|
+
- ext/extconf.rb
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: http://github.com/deepfryed/ots
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options:
|
55
|
+
- --charset=UTF-8
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
version: "0"
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
segments:
|
72
|
+
- 0
|
73
|
+
version: "0"
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: Open Text Summarizer interface for Ruby.
|
81
|
+
test_files:
|
82
|
+
- test/ots_test.rb
|