ots 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (7) hide show
  1. data/README +25 -0
  2. data/VERSION +1 -0
  3. data/ext/extconf.rb +14 -0
  4. data/ext/ots.c +197 -0
  5. data/lib/ots.rb +1 -0
  6. data/test/ots_test.rb +62 -0
  7. metadata +82 -0
data/README ADDED
@@ -0,0 +1,25 @@
1
+ ots is an interface to libots - The open text summarizer
2
+
3
+ INSTALL:
4
+
5
+ sudo gem install ots --source http://gems.github.com
6
+
7
+ REQUIREMENT:
8
+
9
+ * Ruby >= 1.8.7 ( >= 1.9.1 recommended)
10
+ * rubygems >= 1.3.5
11
+ * ruby development libraries (debian: ruby1.8-dev, ruby1.9.1-dev)
12
+ * libxml2 development libraries (debian: libxml2-dev)
13
+ * libots development libraries (debian: libots-dev)
14
+ * glib2.0 development libraries (debian: libglib2.0-dev)
15
+
16
+ USAGE:
17
+
18
+ >> require "rubygems"
19
+ >> require "ots"
20
+ >> summarizer = OTS.new
21
+ >> summarizer.parse("I think I need some ice cream to cool me off. It is too hot down under")
22
+ >> summarizer.title
23
+ => "hot,cool,cream,ice,think"
24
+ >> summarizer.summarize(:lines => 1)
25
+ => [ { :sentence => "I think I need some ice cream to cool me off", :score => 57 } ]
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.2
data/ext/extconf.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS = "-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -Wall"
4
+ $LDFLAGS = "-lglib-2.0"
5
+ # prefixes handed to mkmf as the default places to look for libots
6
+ dir_config("libots", ["/usr/local", "/opt/local", "/usr"])
7
+
8
+ ots_header = 'libots-1/ots/libots.h'
9
+ probe_headers = [ 'stdio.h', 'stdlib.h', 'string.h', ots_header ]
10
+ unless have_header(ots_header) && have_library('ots-1', 'ots_new_article', probe_headers)
11
+ puts "Cannot find libots headers or libraries"
12
+ exit 1
13
+ end
14
+ create_makefile 'ots'
data/ext/ots.c ADDED
@@ -0,0 +1,197 @@
1
+ #include <ruby.h>
2
+
3
+ /* ruby 1.9 only */
4
+ #ifdef RUBY_VM
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ #include <libots-1/ots/libots.h>
13
+
14
+ #define ID_CONST_GET rb_intern("const_get") /* ID of Ruby's const_get method */
15
+ #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant))) /* scope.const_get(constant), constant given as a C string */
16
+
17
+ static VALUE rb_cOTS; /* the OTS class; assigned in Init_ots */
18
+ static VALUE eLoadError; /* raised when a dictionary cannot be loaded */
19
+ static VALUE eRuntimeError; /* raised when no article has been set up yet */
20
+ static VALUE eArgumentError; /* raised for invalid summarize options */
21
+
22
+ typedef struct { /* entry type the keywords method casts article->ImpWords items to; presumably mirrors libots' own layout - TODO confirm against the libots headers */
23
+ gchar *word; /* the word */
24
+ gchar *stem; /* stem of the word */
25
+ gint occ; /* how many times have we seen this word in the text? */
26
+ } OtsWordEntery;
27
+
28
+
29
+ /* helpers */
30
+
31
+ OtsArticle* get_article(VALUE self, gboolean error_on_missing) { /* unwrap the OtsArticle held in @article */
32
+ VALUE wrapped = rb_iv_get(self, "@article");
33
+ if (wrapped != Qnil) {
34
+ /* usual case: hand back the pointer stored inside the wrapper object */
35
+ return (OtsArticle *)DATA_PTR(wrapped);
36
+ }
37
+ if (!error_on_missing)
38
+ return NULL; /* caller asked for a quiet miss */
39
+ rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
40
+ }
41
+
42
+ void rb_ots_free_article(VALUE self) { /* free the article wrapped in @article, if any, and reset @article to nil */
43
+ OtsArticle *article = get_article(self, FALSE); /* NULL when @article is nil; DATA_PTR(nil) would crash */
44
+ if (article != NULL) { ots_free_article(article); rb_iv_set(self, "@article", Qnil); } /* nil-out: no dangling pointer for later calls to free again */
45
+ }
46
+
47
+ VALUE rb_string(char *utf8) { /* copy a NUL-terminated C string into a new Ruby String */
48
+ size_t byte_len = strlen(utf8);
49
+ VALUE result = rb_str_new(utf8, byte_len);
50
+ /* ruby 1.9 only - tag the bytes as UTF-8 and clear the cached code range */
51
+ #ifdef RUBY_VM
52
+ rb_enc_associate(result, rb_to_encoding(rb_str_new2("UTF-8")));
53
+ ENC_CODERANGE_CLEAR(result);
54
+ #endif
55
+
56
+ return result;
57
+ }
58
+
59
+ /* ruby libots methods/wrappers */
60
+
61
+ VALUE rb_ots_init(VALUE self) { /* (re)create the wrapped article; @dict survives only if an article already existed */
62
+ VALUE kept_dict = Qnil;
63
+ OtsArticle *previous = get_article(self, FALSE);
64
+ if (previous != NULL) {
65
+ kept_dict = rb_iv_get(self, "@dict");
66
+ ots_free_article(previous);
67
+ }
68
+ OtsArticle *fresh = ots_new_article();
69
+ rb_iv_set(self, "@article", Data_Wrap_Struct(rb_cObject, 0, 0, fresh)); /* wrapped with no mark/free callbacks */
70
+ rb_iv_set(self, "@dict", kept_dict);
71
+ return self;
72
+ }
73
+
74
+ VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) { /* load dictionary `dict` ("en" when nil) into the article; returns true, raises LoadError on failure */
75
+ char *dict_cstr = "en";
76
+ if (dict != Qnil) dict_cstr = StringValueCStr(dict); /* rejects non-Strings and embedded NULs instead of reading arbitrary memory */
77
+
78
+ OtsArticle *article = get_article(self, FALSE);
79
+ if (article == NULL) {
80
+ rb_ots_init(self);
81
+ article = get_article(self, TRUE);
82
+ }
83
+
84
+ if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
85
+ rb_ots_free_article(self);
86
+ rb_iv_set(self, "@article", Qnil); /* forget the freed article so a later parse cannot free it twice */
87
+ rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
88
+ }
89
+ rb_iv_set(self, "@dict", dict); /* remembered so parse can reload the same dictionary */
90
+ return Qtrue;
91
+ }
92
+
93
+ VALUE rb_ots_parse_string(VALUE self, VALUE string) { /* parse and grade `string` in a fresh article; returns true */
94
+ StringValue(string); /* TypeError for non-Strings instead of misreading them as string data */
95
+ rb_ots_init(self);
96
+ rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
97
+ OtsArticle *article = get_article(self, TRUE);
98
+ const unsigned char *string_cstr = (const unsigned char *)RSTRING_PTR(string); /* read only after the Ruby calls above */
99
+ size_t string_len = RSTRING_LEN(string);
100
+ ots_parse_stream(string_cstr, string_len, article);
101
+ ots_grade_doc(article);
102
+ return Qtrue;
103
+ }
104
+
105
+ VALUE rb_ots_highlight_lines(VALUE self, int lines) { /* pass `lines` to ots_highlight_doc_lines for the current article; returns true */
106
+ OtsArticle *article = get_article(self, TRUE); /* raises RuntimeError when @article is nil */
107
+ ots_highlight_doc_lines(article, lines);
108
+ return Qtrue;
109
+ } /* NOTE(review): Init_ots also registers this as Ruby method "highlight_lines"; called from Ruby the argument arrives as a VALUE, not a C int - confirm and convert */
110
+
111
+ VALUE rb_ots_highlight_percent(VALUE self, int percent) { /* pass `percent` to ots_highlight_doc for the current article; returns true */
112
+ OtsArticle *article = get_article(self, TRUE); /* raises RuntimeError when @article is nil */
113
+ ots_highlight_doc(article, percent);
114
+ return Qtrue;
115
+ } /* NOTE(review): Init_ots also registers this as Ruby method "highlight_percent"; called from Ruby the argument arrives as a VALUE, not a C int - confirm and convert */
116
+
117
+ VALUE rb_ots_article_title(VALUE self) { /* article title as a Ruby String, or nil when the article has none */
118
+ OtsArticle *article = get_article(self, TRUE);
119
+ if (article->title == NULL)
120
+ return Qnil;
121
+
122
+ return rb_string(article->title);
123
+ }
124
+
125
+ VALUE rb_ots_article_keywords(VALUE self) { /* Array of the non-empty words in article->ImpWords, in list order */
126
+ OtsArticle *article = get_article(self, TRUE);
127
+ VALUE iwords = rb_ary_new();
128
+ GList *node;
129
+ for (node = article->ImpWords; node != NULL; node = node->next) {
130
+ OtsWordEntery *entry = (OtsWordEntery *)node->data;
131
+ /* skip missing entries and NULL/empty words; strlen(NULL) would be undefined behavior */
132
+ if (entry == NULL || entry->word == NULL || entry->word[0] == '\0')
133
+ continue;
134
+ rb_ary_push(iwords, rb_string(entry->word));
135
+ }
136
+ return iwords;
137
+ }
138
+
139
+ VALUE rb_ots_get_highlighted_lines(VALUE self) { /* Array of { :sentence => String, :score => Integer } for every selected sentence */
140
+ OtsArticle *article = get_article(self, TRUE);
141
+ OtsSentence *sentence;
142
+ GList *curr_line = article->lines;
143
+ VALUE hlt_lines = rb_ary_new();
144
+ while (curr_line != NULL) {
145
+ sentence = (OtsSentence *)curr_line->data;
146
+ if (sentence != NULL && sentence->selected) {
147
+ size_t len = 0;
148
+ unsigned char* content = ots_get_line_text(sentence, TRUE, &len);
149
+ VALUE text = rb_string(content != NULL ? (char *)content : ""); /* NULL content becomes an empty string */
150
+ g_free(content); /* rb_string copied the bytes; release the buffer returned by ots_get_line_text (g_free(NULL) is a no-op) */
151
+ VALUE hlt_line = rb_hash_new();
152
+ rb_hash_aset(hlt_line, ID2SYM(rb_intern("sentence")), text);
153
+ rb_hash_aset(hlt_line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
154
+ rb_ary_push(hlt_lines, hlt_line);
155
+ }
156
+ curr_line = g_list_next(curr_line);
157
+ }
158
+ return hlt_lines;
159
+ }
160
+
161
+ VALUE rb_summarize(VALUE self, VALUE options) { /* summarize(:lines => n) or summarize(:percent => p); returns the highlighted lines */
162
+ Check_Type(options, T_HASH); /* TypeError for non-Hash input; rb_hash_aref must only be handed a Hash */
163
+ VALUE lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
164
+ VALUE percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
165
+
166
+ /* Exactly one of :lines / :percent is required. Argument errors leave the parsed article alone: */
167
+ /* freeing it here crashed when nothing was parsed and left @article pointing at freed memory. */
168
+ if (lines != Qnil && percent != Qnil)
169
+ rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");
170
+ if (lines == Qnil && percent == Qnil)
171
+ rb_raise(eArgumentError, "Need either :lines or :percent to summarize");
172
+
173
+ /* NUM2INT raises for non-numeric or out-of-range values; FIX2INT silently misreads them. */
174
+ if (lines != Qnil)
175
+ rb_ots_highlight_lines(self, NUM2INT(lines));
176
+ else
177
+ rb_ots_highlight_percent(self, NUM2INT(percent));
178
+
179
+ return rb_ots_get_highlighted_lines(self);
180
+ }
181
+
182
+ /* init */
183
+
184
+ void Init_ots(void) { /* extension entry point: define class OTS, look up exception classes, register methods */
185
+ rb_cOTS = rb_define_class("OTS", rb_cObject);
186
+ eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
187
+ eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
188
+ eLoadError = CONST_GET(rb_mKernel, "LoadError");
189
+ rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
190
+ rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
191
+ rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
192
+ rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
193
+ rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
194
+ rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
195
+ rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
196
+ rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
197
+ }
data/lib/ots.rb ADDED
@@ -0,0 +1 @@
1
+ require File.join(File.dirname(__FILE__), %w(.. ext ots)) # requires ../ext/ots relative to this file (the extension built by ext/extconf.rb)
data/test/ots_test.rb ADDED
@@ -0,0 +1,62 @@
1
+ require 'helper'
2
+
3
+ class OTSTest < Test::Unit::TestCase
4
+
5
+ SAMPLE = <<-TEXT
6
+ The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
7
+ It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
8
+ Pacific subspecies.
9
+ TEXT
10
+
11
+ context 'Title' do
12
+ should 'extract title from given document' do
13
+ ots = OTS.new
14
+ ots.parse SAMPLE
15
+ assert_equal 'species,turtle,subspecies,pacific,atlantic', ots.title # title is one comma-separated String
16
+ end
17
+ end
18
+
19
+ context 'Keywords' do
20
+ should 'extract keywords from given document' do # keywords is an Array of Strings
21
+ ots = OTS.new
22
+ ots.parse SAMPLE
23
+ assert_equal %W(
24
+ species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
25
+ belonging sea endangered critically hawksbill
26
+ ), ots.keywords
27
+ end
28
+ end
29
+
30
+ context 'Summary' do
31
+ should 'extract keywords from given document' do
32
+ ots = OTS.new
33
+ ots.parse SAMPLE
34
+ lines = ots.summarize(:lines => 2).map do |value|
35
+ { :sentence => value[:sentence].gsub(/\n\s*/, ' ').strip, :score => value[:score] }
36
+ end
37
+
38
+ assert_equal [
39
+ {
40
+ :sentence => "The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.",
41
+ :score => 48
42
+ },
43
+ {
44
+ :sentence => "The species has a worldwide distribution, with Atlantic and Pacific subspecies.",
45
+ :score => 20
46
+ }
47
+ ], lines
48
+
49
+ end
50
+
51
+ should 'utf8 encode strings properly' do
52
+ ots = OTS.new
53
+ text = "The hawksbill turtle\xE2\x80\x93is critically endangered." # \xE2\x80\x93 is the UTF-8 encoding of an en dash
54
+ text.force_encoding('UTF-8') if RUBY_VERSION >= "1.9" # String#force_encoding is only called on 1.9+
55
+
56
+ ots.parse(text)
57
+ summary = ots.summarize(:lines => 1).first[:sentence]
58
+ assert_equal text, summary # the sentence must round-trip unchanged
59
+ end
60
+ end
61
+
62
+ end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ots
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 2
9
+ version: 0.4.2
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-02-16 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 2
30
+ - 10
31
+ version: "2.10"
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: Ruby interface to libots libraries for unix.
35
+ email: deepfryed@gmail.com
36
+ executables: []
37
+
38
+ extensions:
39
+ - ext/extconf.rb
40
+ extra_rdoc_files:
41
+ - README
42
+ files:
43
+ - README
44
+ - VERSION
45
+ - ext/ots.c
46
+ - lib/ots.rb
47
+ - test/ots_test.rb
48
+ - ext/extconf.rb
49
+ has_rdoc: true
50
+ homepage: http://github.com/deepfryed/ots
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options:
55
+ - --charset=UTF-8
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ segments:
72
+ - 0
73
+ version: "0"
74
+ requirements: []
75
+
76
+ rubyforge_project:
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Open Text Summarizer interface for Ruby.
81
+ test_files:
82
+ - test/ots_test.rb