ots 0.4.4 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +14 -13
- data/ext/dictionary.c +5 -4
- data/ext/ots.c +30 -16
- data/ext/version.h +1 -1
- data/test/test_article.rb +29 -6
- data/test/test_ots.rb +3 -3
- metadata +4 -4
data/README.md
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
# OTS
|
2
2
|
|
3
|
-
ots is an interface to libots - The open text summarizer
|
3
|
+
ots is an interface to libots - The [open text summarizer](http://libots.sourceforge.net/).
|
4
4
|
|
5
5
|
## Dependencies
|
6
6
|
|
7
7
|
* ruby 1.9.1 or later
|
8
8
|
* libxml2
|
9
|
-
* glib2.0
|
9
|
+
* glib2.0
|
10
10
|
* homebrew (on MacOSX)
|
11
11
|
|
12
12
|
## Installation
|
13
13
|
|
14
14
|
### Debian flavors of Linux
|
15
|
-
|
15
|
+
|
16
16
|
```
|
17
17
|
|
18
18
|
# ruby & ruby development libraries (not needed if you use rvm)
|
@@ -34,6 +34,9 @@ ots is an interface to libots - The open text summarizer
|
|
34
34
|
# update homebrew to latest & greatest version
|
35
35
|
GIT_SSL_NO_VERIFY=1 brew update
|
36
36
|
|
37
|
+
# optional: macosx normally has libxml2 installed if not try
|
38
|
+
brew install libxml2
|
39
|
+
|
37
40
|
# install glib
|
38
41
|
brew install glib
|
39
42
|
|
@@ -47,13 +50,13 @@ ots is an interface to libots - The open text summarizer
|
|
47
50
|
```
|
48
51
|
OTS
|
49
52
|
.parse #=> OTS::Article
|
50
|
-
.
|
53
|
+
.languages #=> Array
|
51
54
|
|
52
55
|
OTS::Article
|
53
56
|
.new
|
54
|
-
#
|
57
|
+
#topics #=> Array
|
55
58
|
#keywords #=> Array
|
56
|
-
#
|
59
|
+
#summarize #=> Array
|
57
60
|
|
58
61
|
```
|
59
62
|
|
@@ -62,17 +65,15 @@ ots is an interface to libots - The open text summarizer
|
|
62
65
|
```ruby
|
63
66
|
require 'ots'
|
64
67
|
article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
|
68
|
+
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", language: "fr")
|
69
|
+
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", dictionary: "custom.xml")
|
65
70
|
|
71
|
+
article.topics
|
66
72
|
article.keywords
|
67
|
-
article.summarize(lines: 1)
|
68
|
-
article.summarize(percent: 50)
|
69
|
-
|
70
|
-
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
|
71
|
-
article.keywords
|
72
|
-
article.summarize(lines: 1)
|
73
73
|
article.summarize(percent: 50)
|
74
|
+
article.summarize(sentences: 1)
|
74
75
|
|
75
|
-
OTS.
|
76
|
+
OTS.languages #=> list of supported languages dictionaries baked-in to libots
|
76
77
|
```
|
77
78
|
|
78
79
|
## License
|
data/ext/dictionary.c
CHANGED
@@ -59,12 +59,13 @@ ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
|
|
59
59
|
char *local_dict_name;
|
60
60
|
|
61
61
|
dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
|
62
|
-
local_dict_name = g_strdup_printf ("%s.xml", name);
|
62
|
+
local_dict_name = g_strdup_printf ("%s", name);
|
63
63
|
|
64
|
+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
65
|
+
doc = xmlReadFile (local_dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
|
66
|
+
if (doc == NULL && g_file_test(dict_name,G_FILE_TEST_EXISTS))
|
67
|
+
doc = xmlReadFile (dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
|
64
68
|
|
65
|
-
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
66
|
-
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
|
67
|
-
if (doc == NULL) doc = xmlParseFile (dict_name);
|
68
69
|
if (doc == NULL) return (FALSE);
|
69
70
|
|
70
71
|
head = xmlDocGetRootElement (doc);
|
data/ext/ots.c
CHANGED
@@ -30,18 +30,28 @@ void article_load_dictionary(OtsArticle *article, char *name) {
|
|
30
30
|
}
|
31
31
|
|
32
32
|
VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
|
33
|
-
VALUE text, dictionary;
|
33
|
+
VALUE text, options, language, dictionary = Qnil;
|
34
34
|
OtsArticle *article = article_handle(self);
|
35
35
|
|
36
|
-
rb_scan_args(argc, argv, "11", &text, &dictionary);
|
36
|
+
rb_scan_args(argc, argv, "11", &text, &options);
|
37
|
+
|
38
|
+
language = rb_str_new2("en");
|
37
39
|
|
38
40
|
if (TYPE(text) != T_STRING)
|
39
41
|
rb_raise(rb_eArgError, "invalid +text+");
|
40
42
|
|
41
|
-
if (NIL_P(dictionary))
|
42
|
-
|
43
|
-
|
43
|
+
if (!NIL_P(options)) {
|
44
|
+
if (TYPE(options) != T_HASH)
|
45
|
+
rb_raise(rb_eArgError, "invalid +options+ hash");
|
46
|
+
|
47
|
+
dictionary = rb_hash_aref(options, ID2SYM(rb_intern("dictionary")));
|
48
|
+
language = rb_hash_aref(options, ID2SYM(rb_intern("language")));
|
49
|
+
}
|
50
|
+
|
51
|
+
if (!NIL_P(dictionary))
|
44
52
|
article_load_dictionary(article, CSTRING(dictionary));
|
53
|
+
else
|
54
|
+
article_load_dictionary(article, CSTRING(language));
|
45
55
|
|
46
56
|
ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
|
47
57
|
ots_grade_doc(article);
|
@@ -87,11 +97,11 @@ VALUE article_summarize(VALUE self, VALUE options) {
|
|
87
97
|
if (TYPE(options) != T_HASH)
|
88
98
|
rb_raise(rb_eArgError, "expect an options hash");
|
89
99
|
|
90
|
-
lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
|
100
|
+
lines = rb_hash_aref(options, ID2SYM(rb_intern("sentences")));
|
91
101
|
percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
|
92
102
|
|
93
103
|
if (NIL_P(lines) && NIL_P(percent))
|
94
|
-
rb_raise(rb_eArgError, "expect +lines+ or +percent+");
|
104
|
+
rb_raise(rb_eArgError, "expect +sentences+ or +percent+");
|
95
105
|
|
96
106
|
if (lines != Qnil)
|
97
107
|
ots_highlight_doc_lines(article, NUM2INT(lines));
|
@@ -101,9 +111,13 @@ VALUE article_summarize(VALUE self, VALUE options) {
|
|
101
111
|
return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
|
102
112
|
}
|
103
113
|
|
104
|
-
VALUE
|
114
|
+
VALUE article_topics(VALUE self) {
|
105
115
|
OtsArticle *article = article_handle(self);
|
106
|
-
|
116
|
+
|
117
|
+
return
|
118
|
+
article->title ?
|
119
|
+
rb_str_split(rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")), ",") :
|
120
|
+
Qnil;
|
107
121
|
}
|
108
122
|
|
109
123
|
typedef struct {
|
@@ -136,16 +150,16 @@ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
|
|
136
150
|
return article;
|
137
151
|
}
|
138
152
|
|
139
|
-
VALUE ots_dictionaries(VALUE self) {
|
153
|
+
VALUE ots_languages(VALUE self) {
|
140
154
|
DIR *dir;
|
141
155
|
struct dirent *entry;
|
142
|
-
VALUE
|
156
|
+
VALUE languages = rb_ary_new();
|
143
157
|
|
144
158
|
if ((dir = opendir(DICTIONARY_DIR))) {
|
145
159
|
while ((entry = readdir(dir))) {
|
146
160
|
// entry->d_type is not portable.
|
147
161
|
if (strstr(entry->d_name, ".xml"))
|
148
|
-
rb_ary_push(
|
162
|
+
rb_ary_push(languages, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
|
149
163
|
}
|
150
164
|
}
|
151
165
|
else {
|
@@ -153,7 +167,7 @@ VALUE ots_dictionaries(VALUE self) {
|
|
153
167
|
}
|
154
168
|
|
155
169
|
closedir(dir);
|
156
|
-
return
|
170
|
+
return languages;
|
157
171
|
}
|
158
172
|
|
159
173
|
/* init */
|
@@ -164,11 +178,11 @@ void Init_ots(void) {
|
|
164
178
|
|
165
179
|
rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
|
166
180
|
rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
|
167
|
-
rb_define_method(cArticle, "
|
181
|
+
rb_define_method(cArticle, "topics", RUBY_METHOD_FUNC(article_topics), 0);
|
168
182
|
rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
|
169
183
|
|
170
|
-
rb_define_module_function(mOTS, "parse",
|
171
|
-
rb_define_module_function(mOTS, "
|
184
|
+
rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
|
185
|
+
rb_define_module_function(mOTS, "languages", RUBY_METHOD_FUNC(ots_languages), 0);
|
172
186
|
|
173
187
|
rb_define_alloc_func(cArticle, article_allocate);
|
174
188
|
|
data/ext/version.h
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
#pragma once
|
2
|
-
#define RUBY_OTS_VERSION "0.4.4"
|
2
|
+
#define RUBY_OTS_VERSION "0.5.0"
|
data/test/test_article.rb
CHANGED
@@ -12,8 +12,8 @@ describe 'OTS::Article' do
|
|
12
12
|
@article = OTS::Article.new(@sample)
|
13
13
|
end
|
14
14
|
|
15
|
-
it 'should extract
|
16
|
-
assert_equal
|
15
|
+
it 'should extract topic keywords from given document' do
|
16
|
+
assert_equal %w(species turtle subspecies pacific atlantic), @article.topics
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should extract keywords from given document' do
|
@@ -27,7 +27,7 @@ describe 'OTS::Article' do
|
|
27
27
|
|
28
28
|
|
29
29
|
it 'should extract keywords from given document' do
|
30
|
-
lines = @article.summarize(
|
30
|
+
lines = @article.summarize(sentences: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
|
31
31
|
expect = [
|
32
32
|
["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
|
33
33
|
["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
|
@@ -39,14 +39,37 @@ describe 'OTS::Article' do
|
|
39
39
|
it 'should utf8 encode strings properly' do
|
40
40
|
text = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
|
41
41
|
article = OTS.parse(text)
|
42
|
-
summary = article.summarize(
|
42
|
+
summary = article.summarize(sentences: 1).first[:sentence]
|
43
43
|
assert_equal text, summary
|
44
44
|
end
|
45
45
|
|
46
46
|
describe 'dictionaries' do
|
47
|
+
before do
|
48
|
+
@text = "j'ai besoin de la crème glacée. il fait trop chaud en australie."
|
49
|
+
end
|
50
|
+
|
47
51
|
it 'should load the french dictionary' do
|
48
|
-
article = OTS.parse(
|
49
|
-
assert_equal "j'ai besoin de la crème glacée.", article.summarize(
|
52
|
+
article = OTS.parse(@text, language: "fr")
|
53
|
+
assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should load the french dictionary given path' do
|
57
|
+
article = OTS.parse(@text, dictionary: File.join(File.dirname(__FILE__), '..', 'dictionaries', 'fr.xml'))
|
58
|
+
assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should raise LoadError on invalid language or dictionaries' do
|
62
|
+
assert_raises(LoadError) do
|
63
|
+
OTS.parse('hello world', language: "xxx")
|
64
|
+
end
|
65
|
+
|
66
|
+
assert_raises(LoadError) do
|
67
|
+
OTS.parse('hello world', dictionary: "xxx")
|
68
|
+
end
|
69
|
+
|
70
|
+
assert_raises(LoadError) do
|
71
|
+
OTS.parse('hello world', dictionary: __FILE__)
|
72
|
+
end
|
50
73
|
end
|
51
74
|
end
|
52
75
|
end
|
data/test/test_ots.rb
CHANGED
@@ -12,12 +12,12 @@ describe 'OTS' do
|
|
12
12
|
end
|
13
13
|
|
14
14
|
it 'should return a list of dictonaries' do
|
15
|
-
|
15
|
+
languages = OTS.languages
|
16
16
|
|
17
17
|
%w(en fr it es de ru).each do |name|
|
18
|
-
assert
|
18
|
+
assert languages.include?(name), "has #{name} language dictionary"
|
19
19
|
end
|
20
20
|
|
21
|
-
assert_empty
|
21
|
+
assert_empty languages.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
|
22
22
|
end
|
23
23
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 4
|
8
|
-
- 4
|
9
|
-
version: 0.4.4
|
7
|
+
- 5
|
8
|
+
- 0
|
9
|
+
version: 0.5.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2012-01-
|
17
|
+
date: 2012-01-10 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|