ots 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7) hide show
  1. data/README +25 -0
  2. data/VERSION +1 -0
  3. data/ext/extconf.rb +14 -0
  4. data/ext/ots.c +197 -0
  5. data/lib/ots.rb +1 -0
  6. data/test/ots_test.rb +62 -0
  7. metadata +82 -0
data/README ADDED
@@ -0,0 +1,25 @@
1
+ ots is an interface to libots - The open text summarizer
2
+
3
+ INSTALL:
4
+
5
+ sudo gem install ots --source http://gems.github.com
6
+
7
+ REQUIREMENT:
8
+
9
+ * Ruby >= 1.8.7 ( >= 1.9.1 recommended)
10
+ * rubygems >= 1.3.5
11
+ * ruby development libraries (debian: ruby1.8-dev, ruby1.9.1-dev)
12
+ * libxml2 development libraries (debian: libxml2-dev)
13
+ * libots development libraries (debian: libots-dev)
14
+ * glib2.0 development libraries (debian: libglib2.0-dev)
15
+
16
+ USAGE:
17
+
18
+ >> require "rubygems"
19
+ >> require "ots"
20
+ >> summarizer = ots.new
21
+ >> summarizer.parse("I think I need some ice cream to cool me off. It is too hot down under")
22
+ >> summarizer.keywords
23
+ => [ "hot","cool","cream","ice","think" ]
24
+ >> summarizer.summarize(:lines => 1)
25
+ => [ { :sentence => "I think I need some ice cream to cool me off", :score => 57 } ]
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.2
data/ext/extconf.rb ADDED
@@ -0,0 +1,14 @@
1
require 'mkmf'

# Ask pkg-config for glib's compiler and linker flags; fall back to the
# historical hard-coded locations when pkg-config does not know glib-2.0.
# Flags are appended rather than assigned so that the flags mkmf derived
# from the Ruby build are kept.
unless pkg_config('glib-2.0')
  $CFLAGS  << ' -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include'
  $LDFLAGS << ' -lglib-2.0'
end
$CFLAGS << ' -Wall'

# Allows --with-libots-dir / --with-libots-include / --with-libots-lib overrides.
dir_config("libots", ["/usr/local", "/opt/local", "/usr"])

headers = [ 'stdio.h', 'stdlib.h', 'string.h', 'libots-1/ots/libots.h' ]

# Only generate a Makefile when both the libots header and library are usable.
unless have_header('libots-1/ots/libots.h') && have_library('ots-1', 'ots_new_article', headers)
  abort "Cannot find libots headers or libraries"
end

create_makefile 'ots'
data/ext/ots.c ADDED
@@ -0,0 +1,197 @@
1
+ #include <ruby.h>
2
+
3
+ /* ruby 1.9 only */
4
+ #ifdef RUBY_VM
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ #include <libots-1/ots/libots.h>
13
+
14
+ #define ID_CONST_GET rb_intern("const_get")
15
+ #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
16
+
17
+ static VALUE rb_cOTS;
18
+ static VALUE eLoadError;
19
+ static VALUE eRuntimeError;
20
+ static VALUE eArgumentError;
21
+
22
+ typedef struct { /* layout used in this file to read the entries of article->ImpWords */
23
+ gchar *word; /* the word as text; exposed to Ruby by the keywords method */
24
+ gchar *stem; /* stem of the word */
25
+ gint occ; /* number of times this word has been seen in the text */
26
+ } OtsWordEntery;
27
+
28
+
29
+ /* helpers */
30
+
31
+ OtsArticle* get_article(VALUE self, gboolean error_on_missing) {
32
+ VALUE rb_article_object = rb_iv_get(self, "@article");
33
+ if (rb_article_object == Qnil) {
34
+ if (error_on_missing)
35
+ rb_raise(eRuntimeError, "libots document not initialized properly. Did you forget to parse content ?");
36
+ else
37
+ return NULL;
38
+ }
39
+ return (OtsArticle *)DATA_PTR(rb_article_object);
40
+ }
41
+
42
+ void rb_ots_free_article(VALUE self) {
43
+ OtsArticle *article = DATA_PTR(rb_iv_get(self, "@article"));
44
+ ots_free_article(article);
45
+ }
46
+
47
+ VALUE rb_string(char *utf8) {
48
+ VALUE str = rb_str_new(utf8, strlen(utf8));
49
+
50
+ /* ruby 1.9 only - force bytestream to utf8 */
51
+ #ifdef RUBY_VM
52
+ rb_enc_associate(str, rb_to_encoding(rb_str_new2("UTF-8")));
53
+ ENC_CODERANGE_CLEAR(str);
54
+ #endif
55
+
56
+ return str;
57
+ }
58
+
59
+ /* ruby libots methods/wrappers */
60
+
61
+ VALUE rb_ots_init(VALUE self) {
62
+ OtsArticle *article = get_article(self, FALSE);
63
+ VALUE dict = Qnil;
64
+ if (article != NULL) {
65
+ dict = rb_iv_get(self, "@dict");
66
+ ots_free_article(article);
67
+ }
68
+ article = ots_new_article();
69
+ rb_iv_set(self, "@article", Data_Wrap_Struct(rb_cObject, 0, 0, article));
70
+ rb_iv_set(self, "@dict", dict);
71
+ return self;
72
+ }
73
+
74
+ VALUE rb_ots_load_dictionary(VALUE self, VALUE dict) {
75
+ char *dict_cstr = "en";
76
+ if (dict != Qnil) dict_cstr = RSTRING_PTR(dict);
77
+
78
+ OtsArticle *article = get_article(self, FALSE);
79
+ if (article == NULL) {
80
+ rb_ots_init(self);
81
+ article = get_article(self, TRUE);
82
+ }
83
+
84
+ if (!ots_load_xml_dictionary(article, (unsigned const char *)dict_cstr)) {
85
+ rb_ots_free_article(self);
86
+ rb_raise(eLoadError, "Could not find dictionary file: %s", dict_cstr);
87
+ }
88
+
89
+ rb_iv_set(self, "@dict", dict);
90
+ return Qtrue;
91
+ }
92
+
93
+ VALUE rb_ots_parse_string(VALUE self, VALUE string) {
94
+ const unsigned char *string_cstr = (const unsigned char *)RSTRING_PTR(string);
95
+ size_t string_len = RSTRING_LEN(string);
96
+
97
+ rb_ots_init(self);
98
+ rb_ots_load_dictionary(self, rb_iv_get(self, "@dict"));
99
+ OtsArticle *article = get_article(self, TRUE);
100
+ ots_parse_stream(string_cstr, string_len, article);
101
+ ots_grade_doc(article);
102
+ return Qtrue;
103
+ }
104
+
105
+ VALUE rb_ots_highlight_lines(VALUE self, int lines) {
106
+ OtsArticle *article = get_article(self, TRUE);
107
+ ots_highlight_doc_lines(article, lines);
108
+ return Qtrue;
109
+ }
110
+
111
+ VALUE rb_ots_highlight_percent(VALUE self, int percent) {
112
+ OtsArticle *article = get_article(self, TRUE);
113
+ ots_highlight_doc(article, percent);
114
+ return Qtrue;
115
+ }
116
+
117
+ VALUE rb_ots_article_title(VALUE self) {
118
+ OtsArticle *article = get_article(self, TRUE);
119
+ if (article->title != NULL)
120
+ return rb_string(article->title);
121
+ else
122
+ return Qnil;
123
+ }
124
+
125
+ VALUE rb_ots_article_keywords(VALUE self) {
126
+ OtsArticle *article = get_article(self, TRUE);
127
+ GList* words = article->ImpWords;
128
+ VALUE iwords = rb_ary_new();
129
+ while (words != NULL) {
130
+ OtsWordEntery *data = (OtsWordEntery *)words->data;
131
+ if (data != NULL && strlen(data->word) > 0)
132
+ rb_ary_push(iwords, rb_string(data->word));
133
+ words = words->next;
134
+ }
135
+
136
+ return iwords;
137
+ }
138
+
139
+ VALUE rb_ots_get_highlighted_lines(VALUE self) {
140
+ OtsArticle *article = get_article(self, TRUE);
141
+ OtsSentence *sentence;
142
+ GList *curr_line = article->lines;
143
+ VALUE hlt_lines = rb_ary_new();
144
+
145
+ while (curr_line != NULL) {
146
+ sentence = (OtsSentence *)curr_line->data;
147
+ if (sentence->selected) {
148
+ size_t len;
149
+ unsigned char* content = ots_get_line_text(sentence, TRUE, &len);
150
+ VALUE hlt_line = rb_hash_new();
151
+ rb_hash_aset(hlt_line, ID2SYM(rb_intern("sentence")), rb_string((char *)content));
152
+ rb_hash_aset(hlt_line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score));
153
+ rb_ary_push(hlt_lines, hlt_line);
154
+ }
155
+ curr_line = g_list_next(curr_line);
156
+ }
157
+
158
+ return hlt_lines;
159
+ }
160
+
161
+ VALUE rb_summarize(VALUE self, VALUE options) {
162
+
163
+ VALUE lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
164
+ VALUE percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
165
+
166
+ if (lines != Qnil && percent != Qnil) {
167
+ rb_ots_free_article(self);
168
+ rb_raise(eArgumentError, "Cannot summarize on :lines & :percent, only one is allowed");
169
+ }
170
+ else if (lines == Qnil && percent == Qnil) {
171
+ rb_ots_free_article(self);
172
+ rb_raise(eArgumentError, "Need either :lines or :percent to summarize");
173
+ }
174
+
175
+ if (lines != Qnil)
176
+ rb_ots_highlight_lines(self, FIX2INT(lines));
177
+ else if (percent != Qnil)
178
+ rb_ots_highlight_percent(self, FIX2INT(percent));
179
+ return rb_ots_get_highlighted_lines(self);
180
+ }
181
+
182
+ /* init */
183
+
184
+ void Init_ots(void) {
185
+ eLoadError = CONST_GET(rb_mKernel, "LoadError");
186
+ eRuntimeError = CONST_GET(rb_mKernel, "RuntimeError");
187
+ eArgumentError = CONST_GET(rb_mKernel, "ArgumentError");
188
+ rb_cOTS = rb_define_class("OTS", rb_cObject);
189
+ rb_define_method(rb_cOTS, "load_dictionary", rb_ots_load_dictionary, 1);
190
+ rb_define_method(rb_cOTS, "parse", rb_ots_parse_string, 1);
191
+ rb_define_method(rb_cOTS, "highlight_lines", rb_ots_highlight_lines, 1);
192
+ rb_define_method(rb_cOTS, "highlight_percent", rb_ots_highlight_percent, 1);
193
+ rb_define_method(rb_cOTS, "highlighted_content", rb_ots_get_highlighted_lines, 0);
194
+ rb_define_method(rb_cOTS, "summarize", rb_summarize, 1);
195
+ rb_define_method(rb_cOTS, "title", rb_ots_article_title, 0);
196
+ rb_define_method(rb_cOTS, "keywords", rb_ots_article_keywords, 0);
197
+ }
data/lib/ots.rb ADDED
@@ -0,0 +1 @@
1
+ require File.join(File.dirname(__FILE__), %w(.. ext ots))
data/test/ots_test.rb ADDED
@@ -0,0 +1,62 @@
1
+ require 'helper'
2
+
3
# Exercises the OTS extension: title, keyword and summary extraction.
class OTSTest < Test::Unit::TestCase

  SAMPLE = <<-TEXT
    The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
    It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
    Pacific subspecies.
  TEXT

  context 'Title' do
    should 'extract title from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      assert_equal 'species,turtle,subspecies,pacific,atlantic', ots.title
    end
  end

  context 'Keywords' do
    should 'extract keywords from given document' do
      ots = OTS.new
      ots.parse SAMPLE
      assert_equal %w(
        species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
        belonging sea endangered critically hawksbill
      ), ots.keywords
    end
  end

  context 'Summary' do
    # Previously described as "extract keywords", a copy of the test above.
    should 'extract summary lines from given document' do
      ots = OTS.new
      ots.parse SAMPLE

      # Collapse line breaks and indentation so the comparison does not depend on layout.
      lines = ots.summarize(:lines => 2).map do |value|
        { :sentence => value[:sentence].gsub(/\n\s*/, ' ').strip, :score => value[:score] }
      end

      assert_equal [
        {
          :sentence => "The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.",
          :score    => 48
        },
        {
          :sentence => "The species has a worldwide distribution, with Atlantic and Pacific subspecies.",
          :score    => 20
        }
      ], lines
    end

    should 'utf8 encode strings properly' do
      ots  = OTS.new
      text = "The hawksbill turtle\xE2\x80\x93is critically endangered."
      text.force_encoding('UTF-8') if RUBY_VERSION >= "1.9"

      ots.parse(text)
      summary = ots.summarize(:lines => 1).first[:sentence]
      assert_equal text, summary
    end
  end

end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ots
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 2
9
+ version: 0.4.2
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-02-16 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 2
30
+ - 10
31
+ version: "2.10"
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: Ruby interface to libots libraries for unix.
35
+ email: deepfryed@gmail.com
36
+ executables: []
37
+
38
+ extensions:
39
+ - ext/extconf.rb
40
+ extra_rdoc_files:
41
+ - README
42
+ files:
43
+ - README
44
+ - VERSION
45
+ - ext/ots.c
46
+ - lib/ots.rb
47
+ - test/ots_test.rb
48
+ - ext/extconf.rb
49
+ has_rdoc: true
50
+ homepage: http://github.com/deepfryed/ots
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options:
55
+ - --charset=UTF-8
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ segments:
72
+ - 0
73
+ version: "0"
74
+ requirements: []
75
+
76
+ rubyforge_project:
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Open Text Summarizer interface for Ruby.
81
+ test_files:
82
+ - test/ots_test.rb