ots 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +25 -0
- data/VERSION +1 -0
- data/ext/extconf.rb +14 -0
- data/ext/ots.c +197 -0
- data/lib/ots.rb +1 -0
- data/test/ots_test.rb +62 -0
- metadata +82 -0
data/README
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
ots is an interface to libots - The open text summarizer
|
2
|
+
|
3
|
+
INSTALL:
|
4
|
+
|
5
|
+
sudo gem install ots --source http://gems.github.com
|
6
|
+
|
7
|
+
REQUIREMENT:
|
8
|
+
|
9
|
+
* Ruby >= 1.8.7 ( >= 1.9.1 recommended)
|
10
|
+
* rubygems >= 1.3.5
|
11
|
+
* ruby development libraries (debian: ruby1.8-dev, ruby1.9.1-dev)
|
12
|
+
* libxml2 development libraries (debian: libxml2-dev)
|
13
|
+
* libots development libraries (debian: libots-dev)
|
14
|
+
* glib2.0 development libraries (debian: libglib2.0-dev)
|
15
|
+
|
16
|
+
USAGE:
|
17
|
+
|
18
|
+
>> require "rubygems"
|
19
|
+
>> require "ots"
|
20
|
+
>> summarizer = OTS.new
|
21
|
+
>> summarizer.parse("I think I need some ice cream to cool me off. It is too hot down under")
|
22
|
+
>> summarizer.title
|
23
|
+
=> "hot,cool,cream,ice,think"
|
24
|
+
>> summarizer.summarize(:lines => 1)
|
25
|
+
=> [ { :sentence => "I think I need some ice cream to cool me off", :score => 57 } ]
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4.2
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Build configuration for the ots native extension.
#
# Locates glib-2.0 and libots, then writes the Makefile. Exits with status 1
# when the libots header or library cannot be found.
require 'mkmf'

# Append to the flags mkmf already derived from the Ruby build instead of
# overwriting them. Prefer pkg-config for glib, because the include directory
# differs between distributions and architectures; fall back to the
# traditional locations when pkg-config has no glib-2.0 entry.
unless pkg_config('glib-2.0')
  $CFLAGS  << ' -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include'
  $LDFLAGS << ' -lglib-2.0'
end
$CFLAGS << ' -Wall'

dir_config("libots", ["/usr/local", "/opt/local", "/usr"])

headers = [ 'stdio.h', 'stdlib.h', 'string.h', 'libots-1/ots/libots.h' ]
if have_header('libots-1/ots/libots.h') && have_library('ots-1', 'ots_new_article', headers)
  create_makefile 'ots'
else
  puts "Cannot find libots headers or libraries"
  exit 1
end
|
data/ext/ots.c
ADDED
@@ -0,0 +1,197 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
/* ruby 1.9 only */
|
4
|
+
#ifdef RUBY_VM
|
5
|
+
#include <ruby/encoding.h>
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#include <stdio.h>
|
9
|
+
#include <stdlib.h>
|
10
|
+
#include <string.h>
|
11
|
+
|
12
|
+
#include <libots-1/ots/libots.h>
|
13
|
+
|
14
|
+
#define ID_CONST_GET rb_intern("const_get")
|
15
|
+
#define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
|
16
|
+
|
17
|
+
static VALUE rb_cOTS;
|
18
|
+
static VALUE eLoadError;
|
19
|
+
static VALUE eRuntimeError;
|
20
|
+
static VALUE eArgumentError;
|
21
|
+
|
22
|
+
typedef struct {
|
23
|
+
gchar *word; /* the word */
|
24
|
+
gchar *stem; /*stem of the word*/
|
25
|
+
gint occ; /* how many times have we seen this word in the text? */
|
26
|
+
} OtsWordEntery;
|
27
|
+
|
28
|
+
|
29
|
+
/* helpers */
|
30
|
+
|
31
|
+
/*
 * Fetch the libots article held in the receiver's @article instance variable.
 *
 * self             - the OTS instance.
 * error_on_missing - when TRUE, raise eRuntimeError if @article is nil;
 *                    when FALSE, return NULL in that case.
 *
 * Returns the data pointer of the object stored in @article.
 */
OtsArticle* get_article(VALUE self, gboolean error_on_missing) {
    VALUE wrapped = rb_iv_get(self, "@article");

    if (wrapped != Qnil)
        return (OtsArticle *)DATA_PTR(wrapped);

    if (!error_on_missing)
        return NULL;

    rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
    return NULL; /* not reached: rb_raise does not return */
}
|
41
|
+
|
42
|
+
/*
 * Release the libots article owned by the receiver, if any.
 *
 * Does nothing when @article is nil. Otherwise the wrapper's data pointer is
 * cleared and @article is reset to nil before the article is freed, so that
 * no later lookup can return (or free again) the released pointer.
 */
void rb_ots_free_article(VALUE self) {
    VALUE wrapped = rb_iv_get(self, "@article");
    if (wrapped == Qnil)
        return;

    OtsArticle *article = (OtsArticle *)DATA_PTR(wrapped);

    /* Detach first: nothing may observe the pointer once it is freed. */
    DATA_PTR(wrapped) = NULL;
    rb_iv_set(self, "@article", Qnil);

    if (article != NULL)
        ots_free_article(article);
}
|
46
|
+
|
47
|
+
VALUE rb_string(char *utf8) {
|
48
|
+
VALUE str = rb_str_new(utf8, strlen(utf8));
|
49
|
+
|
50
|
+
/* ruby 1.9 only - force bytestream to utf8 */
|
51
|
+
#ifdef RUBY_VM
|
52
|
+
rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
|
53
|
+
ENC_CODERANGE_CLEAR(str);
|
54
|
+
#endif
|
55
|
+
|
56
|
+
return str;
|
57
|
+
}
|
58
|
+
|
59
|
+
/* ruby libots methods/wrappers */
|
60
|
+
|
61
|
+
/*
 * Give the receiver a fresh, empty libots article.
 *
 * If an article is already attached it is freed, and the current @dict value
 * is carried over; otherwise @dict is set to nil. The new article is wrapped
 * in a plain data object (no mark or free callbacks are registered) and
 * stored in @article.
 *
 * Returns self.
 */
VALUE rb_ots_init(VALUE self) {
    VALUE saved_dict = Qnil;
    OtsArticle *previous = get_article(self, FALSE);

    if (previous != NULL) {
        saved_dict = rb_iv_get(self, "@dict");
        ots_free_article(previous);
    }

    OtsArticle *fresh = ots_new_article();
    VALUE wrapper = Data_Wrap_Struct(rb_cObject, 0, 0, fresh);

    rb_iv_set(self, "@article", wrapper);
    rb_iv_set(self, "@dict", saved_dict);
    return self;
}
|
73
|
+
|
74
|
+
/*
 * OTS#load_dictionary(dict)
 *
 * Load a libots XML dictionary into the receiver's article, creating the
 * article first when none exists yet.
 *
 * dict - dictionary name as a String, or nil to use "en".
 *
 * Returns true and records dict in @dict on success.
 * Raises TypeError/ArgumentError (via StringValueCStr) when dict is neither
 * nil nor a String usable as a C string, and eLoadError when libots reports
 * that the dictionary could not be loaded.
 */
VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) {
    char *dict_cstr = "en";

    /* Validate before touching the bytes: RSTRING_PTR on a non-String
     * reads arbitrary memory. */
    if (dict != Qnil)
        dict_cstr = StringValueCStr(dict);

    OtsArticle *article = get_article(self, FALSE);
    if (article == NULL) {
        rb_ots_init(self);
        article = get_article(self, TRUE);
    }

    if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
        rb_ots_free_article(self);
        /* Make sure no reference to the released article survives. */
        rb_iv_set(self, "@article", Qnil);
        rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
    }

    rb_iv_set(self, "@dict", dict);
    return Qtrue;
}
|
92
|
+
|
93
|
+
/*
 * OTS#parse(string)
 *
 * Replace the receiver's article with a fresh one, reload the dictionary
 * recorded in @dict, then parse and grade the given text.
 *
 * string - the text to parse; must be a String (or convertible via to_str).
 *
 * Returns true. Raises TypeError when string cannot be converted to a String.
 */
VALUE rb_ots_parse_string(VALUE self, VALUE string) {
    /* Validate before reading the buffer: RSTRING_PTR/RSTRING_LEN on a
     * non-String read arbitrary memory. */
    StringValue(string);

    rb_ots_init(self);
    rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
    OtsArticle *article = get_article(self, TRUE);

    const unsigned char *string_cstr = (const unsigned char *)RSTRING_PTR(string);
    size_t string_len = RSTRING_LEN(string);

    ots_parse_stream(string_cstr, string_len, article);
    ots_grade_doc(article);
    return Qtrue;
}
|
104
|
+
|
105
|
+
/*
 * Ask libots to highlight sentences in the receiver's article by line count.
 *
 * lines - the count handed to ots_highlight_doc_lines; a plain C int, not a
 *         Ruby VALUE.
 *
 * Returns true. Raises eRuntimeError (via get_article) when no article is
 * attached.
 */
VALUE rb_ots_highlight_lines(VALUE self, int lines) {
    ots_highlight_doc_lines(get_article(self, TRUE), lines);
    return Qtrue;
}
|
110
|
+
|
111
|
+
/*
 * Ask libots to highlight sentences in the receiver's article by percentage.
 *
 * percent - the value handed to ots_highlight_doc; a plain C int, not a
 *           Ruby VALUE.
 *
 * Returns true. Raises eRuntimeError (via get_article) when no article is
 * attached.
 */
VALUE rb_ots_highlight_percent(VALUE self, int percent) {
    ots_highlight_doc(get_article(self, TRUE), percent);
    return Qtrue;
}
|
116
|
+
|
117
|
+
/*
 * OTS#title
 *
 * Returns the article's title as a String, or nil when libots has not set
 * one. Raises eRuntimeError (via get_article) when no article is attached.
 */
VALUE rb_ots_article_title(VALUE self) {
    OtsArticle *doc = get_article(self, TRUE);

    return (doc->title == NULL) ? Qnil : rb_string(doc->title);
}
|
124
|
+
|
125
|
+
/*
 * OTS#keywords
 *
 * Walk the article's ImpWords list and collect every non-empty word.
 *
 * Returns an Array of Strings in list order. Entries whose data pointer or
 * word pointer is NULL, or whose word is empty, are skipped. Raises
 * eRuntimeError (via get_article) when no article is attached.
 */
VALUE rb_ots_article_keywords(VALUE self) {
    OtsArticle *article = get_article(self, TRUE);
    VALUE iwords = rb_ary_new();
    GList *node;

    for (node = article->ImpWords; node != NULL; node = node->next) {
        OtsWordEntery *entry = (OtsWordEntery *)node->data;

        /* Check the word pointer too: calling strlen on NULL is undefined. */
        if (entry == NULL || entry->word == NULL || entry->word[0] == '\0')
            continue;

        rb_ary_push(iwords, rb_string(entry->word));
    }

    return iwords;
}
|
138
|
+
|
139
|
+
/*
 * OTS#highlighted_content
 *
 * Collect every sentence of the article that is currently selected.
 *
 * Returns an Array of Hashes, one per selected sentence in article order,
 * each of the form { :sentence => String, :score => Integer }. Raises
 * eRuntimeError (via get_article) when no article is attached.
 */
VALUE rb_ots_get_highlighted_lines(VALUE self) {
    OtsArticle *article = get_article(self, TRUE);
    VALUE hlt_lines = rb_ary_new();
    VALUE key_sentence = ID2SYM(rb_intern("sentence"));
    VALUE key_score = ID2SYM(rb_intern("score"));
    GList *curr_line;

    for (curr_line = article->lines; curr_line != NULL; curr_line = g_list_next(curr_line)) {
        OtsSentence *sentence = (OtsSentence *)curr_line->data;
        if (sentence == NULL || !sentence->selected)
            continue;

        size_t len = 0;
        unsigned char *content = ots_get_line_text(sentence, TRUE, &len);
        if (content == NULL)
            continue;

        /* Copy into a Ruby String, then release the libots buffer.
         * NOTE(review): ots_get_line_text is understood to hand back a
         * glib-allocated buffer owned by the caller -- confirm against the
         * libots sources in use. */
        VALUE text = rb_string((char *)content);
        g_free(content);

        VALUE hlt_line = rb_hash_new();
        rb_hash_aset(hlt_line, key_sentence, text);
        /* LONG2NUM range-checks; LONG2FIX would corrupt out-of-range scores. */
        rb_hash_aset(hlt_line, key_score, LONG2NUM(sentence->score));
        rb_ary_push(hlt_lines, hlt_line);
    }

    return hlt_lines;
}
|
160
|
+
|
161
|
+
/*
 * OTS#summarize(options)
 *
 * Highlight the parsed article by line count or by percentage and return
 * the highlighted sentences.
 *
 * options - a Hash containing exactly one of :lines or :percent (Integer).
 *
 * Returns the Array produced by rb_ots_get_highlighted_lines.
 * Raises TypeError when options is not a Hash, eArgumentError when both or
 * neither of :lines / :percent are given, and eRuntimeError (via
 * get_article) when nothing has been parsed yet.
 *
 * A bad argument leaves the parsed article untouched, so the caller can
 * retry with corrected options.
 */
VALUE rb_summarize(VALUE self, VALUE options) {
    Check_Type(options, T_HASH);

    VALUE lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
    VALUE percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));

    if (lines != Qnil && percent != Qnil)
        rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");

    if (lines == Qnil && percent == Qnil)
        rb_raise(eArgumentError, "Need either :lines or :percent to summarize");

    /* NUM2INT type- and range-checks; FIX2INT assumes a Fixnum blindly. */
    if (lines != Qnil)
        rb_ots_highlight_lines(self, NUM2INT(lines));
    else
        rb_ots_highlight_percent(self, NUM2INT(percent));

    return rb_ots_get_highlighted_lines(self);
}
|
181
|
+
|
182
|
+
/* init */
|
183
|
+
|
184
|
+
void Init_ots(void) {
|
185
|
+
eLoadError = CONST_GET(rb_mKernel, "LoadError");
|
186
|
+
eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
|
187
|
+
eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
|
188
|
+
rb_cOTS = rb_define_class("OTS", rb_cObject);
|
189
|
+
rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
|
190
|
+
rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
|
191
|
+
rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
|
192
|
+
rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
|
193
|
+
rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
|
194
|
+
rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
|
195
|
+
rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
|
196
|
+
rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
|
197
|
+
}
|
data/lib/ots.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Load the compiled extension from the sibling ext/ directory.
ext_dir = File.join(File.dirname(__FILE__), '..', 'ext')
require File.join(ext_dir, 'ots')
|
data/test/ots_test.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises the OTS extension end to end: title, keywords and summary
# extraction on a short sample text, plus UTF-8 handling of returned strings.
class OTSTest < Test::Unit::TestCase

  SAMPLE = <<-TEXT
    The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
    It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
    Pacific subspecies.
  TEXT

  context 'Title' do
    should 'extract title from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      assert_equal 'species,turtle,subspecies,pacific,atlantic', ots.title
    end
  end

  context 'Keywords' do
    should 'extract keywords from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      assert_equal %W(
        species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
        belonging sea endangered critically hawksbill
      ), ots.keywords
    end
  end

  context 'Summary' do
    # Previously this reused the Keywords description, which made failures
    # in the two contexts hard to tell apart.
    should 'extract summary from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      # Collapse the line breaks and indentation of the heredoc so that the
      # sentences compare equal to single-line strings.
      lines = ots.summarize(:lines => 2).map do |value|
        { :sentence => value[:sentence].gsub(/\n\s*/, ' ').strip, :score => value[:score] }
      end

      assert_equal [
        {
          :sentence => "The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.",
          :score => 48
        },
        {
          :sentence => "The species has a worldwide distribution, with Atlantic and Pacific subspecies.",
          :score => 20
        }
      ], lines

    end

    should 'utf8 encode strings properly' do
      ots = OTS.new
      text = "The hawksbill turtle\xE2\x80\x93is critically endangered."
      text.force_encoding('UTF-8') if RUBY_VERSION >= "1.9"

      ots.parse(text)
      summary = ots.summarize(:lines => 1).first[:sentence]
      assert_equal text, summary
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ots
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 4
|
8
|
+
- 2
|
9
|
+
version: 0.4.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Bharanee Rathna
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-02-16 00:00:00 +11:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: shoulda
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 2
|
30
|
+
- 10
|
31
|
+
version: "2.10"
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: Ruby interface to libots libraries for unix.
|
35
|
+
email: deepfryed@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions:
|
39
|
+
- ext/extconf.rb
|
40
|
+
extra_rdoc_files:
|
41
|
+
- README
|
42
|
+
files:
|
43
|
+
- README
|
44
|
+
- VERSION
|
45
|
+
- ext/ots.c
|
46
|
+
- lib/ots.rb
|
47
|
+
- test/ots_test.rb
|
48
|
+
- ext/extconf.rb
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: http://github.com/deepfryed/ots
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options:
|
55
|
+
- --charset=UTF-8
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
version: "0"
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
segments:
|
72
|
+
- 0
|
73
|
+
version: "0"
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: Open Text Summarizer interface for Ruby.
|
81
|
+
test_files:
|
82
|
+
- test/ots_test.rb
|