ots 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +14 -13
- data/ext/dictionary.c +5 -4
- data/ext/ots.c +30 -16
- data/ext/version.h +1 -1
- data/test/test_article.rb +29 -6
- data/test/test_ots.rb +3 -3
- metadata +4 -4
data/README.md
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
# OTS
|
2
2
|
|
3
|
-
ots is an interface to libots - The open text summarizer
|
3
|
+
ots is an interface to libots - The [open text summarizer](http://libots.sourceforge.net/).
|
4
4
|
|
5
5
|
## Dependencies
|
6
6
|
|
7
7
|
* ruby 1.9.1 or later
|
8
8
|
* libxml2
|
9
|
-
* glib2.0
|
9
|
+
* glib2.0
|
10
10
|
* homebrew (on MacOSX)
|
11
11
|
|
12
12
|
## Installation
|
13
13
|
|
14
14
|
### Debian flavors of Linux
|
15
|
-
|
15
|
+
|
16
16
|
```
|
17
17
|
|
18
18
|
# ruby & ruby development libraries (not needed if you use rvm)
|
@@ -34,6 +34,9 @@ ots is an interface to libots - The open text summarizer
|
|
34
34
|
# update homebrew to latest & greatest version
|
35
35
|
GIT_SSL_NO_VERIFY=1 brew update
|
36
36
|
|
37
|
+
# optional: macosx normally has libxml2 installed if not try
|
38
|
+
brew install libxml2
|
39
|
+
|
37
40
|
# install glib
|
38
41
|
brew install glib
|
39
42
|
|
@@ -47,13 +50,13 @@ ots is an interface to libots - The open text summarizer
|
|
47
50
|
```
|
48
51
|
OTS
|
49
52
|
.parse #=> OTS::Article
|
50
|
-
.
|
53
|
+
.languages #=> Array
|
51
54
|
|
52
55
|
OTS::Article
|
53
56
|
.new
|
54
|
-
#
|
57
|
+
#topics #=> Array
|
55
58
|
#keywords #=> Array
|
56
|
-
#
|
59
|
+
#summarize #=> Array
|
57
60
|
|
58
61
|
```
|
59
62
|
|
@@ -62,17 +65,15 @@ ots is an interface to libots - The open text summarizer
|
|
62
65
|
```ruby
|
63
66
|
require 'ots'
|
64
67
|
article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
|
68
|
+
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", language: "fr")
|
69
|
+
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", dictionary: "custom.xml")
|
65
70
|
|
71
|
+
article.topics
|
66
72
|
article.keywords
|
67
|
-
article.summarize(lines: 1)
|
68
|
-
article.summarize(percent: 50)
|
69
|
-
|
70
|
-
article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
|
71
|
-
article.keywords
|
72
|
-
article.summarize(lines: 1)
|
73
73
|
article.summarize(percent: 50)
|
74
|
+
article.summarize(sentences: 1)
|
74
75
|
|
75
|
-
OTS.
|
76
|
+
OTS.languages #=> list of supported languages dictionaries baked-in to libots
|
76
77
|
```
|
77
78
|
|
78
79
|
## License
|
data/ext/dictionary.c
CHANGED
@@ -59,12 +59,13 @@ ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
|
|
59
59
|
char *local_dict_name;
|
60
60
|
|
61
61
|
dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
|
62
|
-
local_dict_name = g_strdup_printf ("%s
|
62
|
+
local_dict_name = g_strdup_printf ("%s", name);
|
63
63
|
|
64
|
+
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
65
|
+
doc = xmlReadFile (local_dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
|
66
|
+
if (doc == NULL && g_file_test(dict_name,G_FILE_TEST_EXISTS))
|
67
|
+
doc = xmlReadFile (dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
|
64
68
|
|
65
|
-
if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
|
66
|
-
doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
|
67
|
-
if (doc == NULL) doc = xmlParseFile (dict_name);
|
68
69
|
if (doc == NULL) return (FALSE);
|
69
70
|
|
70
71
|
head = xmlDocGetRootElement (doc);
|
data/ext/ots.c
CHANGED
@@ -30,18 +30,28 @@ void article_load_dictionary(OtsArticle *article, char *name) {
|
|
30
30
|
}
|
31
31
|
|
32
32
|
VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
|
33
|
-
VALUE text, dictionary;
|
33
|
+
VALUE text, options, language, dictionary = Qnil;
|
34
34
|
OtsArticle *article = article_handle(self);
|
35
35
|
|
36
|
-
rb_scan_args(argc, argv, "11", &text, &
|
36
|
+
rb_scan_args(argc, argv, "11", &text, &options);
|
37
|
+
|
38
|
+
language = rb_str_new2("en");
|
37
39
|
|
38
40
|
if (TYPE(text) != T_STRING)
|
39
41
|
rb_raise(rb_eArgError, "invalid +text+");
|
40
42
|
|
41
|
-
if (NIL_P(
|
42
|
-
|
43
|
-
|
43
|
+
if (!NIL_P(options)) {
|
44
|
+
if (TYPE(options) != T_HASH)
|
45
|
+
rb_raise(rb_eArgError, "invalid +options+ hash");
|
46
|
+
|
47
|
+
dictionary = rb_hash_aref(options, ID2SYM(rb_intern("dictionary")));
|
48
|
+
language = rb_hash_aref(options, ID2SYM(rb_intern("language")));
|
49
|
+
}
|
50
|
+
|
51
|
+
if (!NIL_P(dictionary))
|
44
52
|
article_load_dictionary(article, CSTRING(dictionary));
|
53
|
+
else
|
54
|
+
article_load_dictionary(article, CSTRING(language));
|
45
55
|
|
46
56
|
ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
|
47
57
|
ots_grade_doc(article);
|
@@ -87,11 +97,11 @@ VALUE article_summarize(VALUE self, VALUE options) {
|
|
87
97
|
if (TYPE(options) != T_HASH)
|
88
98
|
rb_raise(rb_eArgError, "expect an options hash");
|
89
99
|
|
90
|
-
lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
|
100
|
+
lines = rb_hash_aref(options, ID2SYM(rb_intern("sentences")));
|
91
101
|
percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
|
92
102
|
|
93
103
|
if (NIL_P(lines) && NIL_P(percent))
|
94
|
-
rb_raise(rb_eArgError, "expect +
|
104
|
+
rb_raise(rb_eArgError, "expect +sentences+ or +percent+");
|
95
105
|
|
96
106
|
if (lines != Qnil)
|
97
107
|
ots_highlight_doc_lines(article, NUM2INT(lines));
|
@@ -101,9 +111,13 @@ VALUE article_summarize(VALUE self, VALUE options) {
|
|
101
111
|
return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
|
102
112
|
}
|
103
113
|
|
104
|
-
VALUE
|
114
|
+
VALUE article_topics(VALUE self) {
|
105
115
|
OtsArticle *article = article_handle(self);
|
106
|
-
|
116
|
+
|
117
|
+
return
|
118
|
+
article->title ?
|
119
|
+
rb_str_split(rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")), ",") :
|
120
|
+
Qnil;
|
107
121
|
}
|
108
122
|
|
109
123
|
typedef struct {
|
@@ -136,16 +150,16 @@ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
|
|
136
150
|
return article;
|
137
151
|
}
|
138
152
|
|
139
|
-
VALUE ots_dictionaries(VALUE self) {
|
153
|
+
VALUE ots_languages(VALUE self) {
|
140
154
|
DIR *dir;
|
141
155
|
struct dirent *entry;
|
142
|
-
VALUE
|
156
|
+
VALUE languages = rb_ary_new();
|
143
157
|
|
144
158
|
if ((dir = opendir(DICTIONARY_DIR))) {
|
145
159
|
while ((entry = readdir(dir))) {
|
146
160
|
// entry->d_type is not portable.
|
147
161
|
if (strstr(entry->d_name, ".xml"))
|
148
|
-
rb_ary_push(
|
162
|
+
rb_ary_push(languages, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
|
149
163
|
}
|
150
164
|
}
|
151
165
|
else {
|
@@ -153,7 +167,7 @@ VALUE ots_dictionaries(VALUE self) {
|
|
153
167
|
}
|
154
168
|
|
155
169
|
closedir(dir);
|
156
|
-
return
|
170
|
+
return languages;
|
157
171
|
}
|
158
172
|
|
159
173
|
/* init */
|
@@ -164,11 +178,11 @@ void Init_ots(void) {
|
|
164
178
|
|
165
179
|
rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
|
166
180
|
rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
|
167
|
-
rb_define_method(cArticle, "
|
181
|
+
rb_define_method(cArticle, "topics", RUBY_METHOD_FUNC(article_topics), 0);
|
168
182
|
rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
|
169
183
|
|
170
|
-
rb_define_module_function(mOTS, "parse",
|
171
|
-
rb_define_module_function(mOTS, "
|
184
|
+
rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
|
185
|
+
rb_define_module_function(mOTS, "languages", RUBY_METHOD_FUNC(ots_languages), 0);
|
172
186
|
|
173
187
|
rb_define_alloc_func(cArticle, article_allocate);
|
174
188
|
|
data/ext/version.h
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
#pragma once
|
2
|
-
#define RUBY_OTS_VERSION "0.4.4"
|
2
|
+
#define RUBY_OTS_VERSION "0.5.0"
|
data/test/test_article.rb
CHANGED
@@ -12,8 +12,8 @@ describe 'OTS::Article' do
|
|
12
12
|
@article = OTS::Article.new(@sample)
|
13
13
|
end
|
14
14
|
|
15
|
-
it 'should extract
|
16
|
-
assert_equal
|
15
|
+
it 'should extract topic keywords from given document' do
|
16
|
+
assert_equal %w(species turtle subspecies pacific atlantic), @article.topics
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should extract keywords from given document' do
|
@@ -27,7 +27,7 @@ describe 'OTS::Article' do
|
|
27
27
|
|
28
28
|
|
29
29
|
it 'should extract keywords from given document' do
|
30
|
-
lines = @article.summarize(
|
30
|
+
lines = @article.summarize(sentences: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
|
31
31
|
expect = [
|
32
32
|
["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
|
33
33
|
["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
|
@@ -39,14 +39,37 @@ describe 'OTS::Article' do
|
|
39
39
|
it 'should utf8 encode strings properly' do
|
40
40
|
text = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
|
41
41
|
article = OTS.parse(text)
|
42
|
-
summary = article.summarize(
|
42
|
+
summary = article.summarize(sentences: 1).first[:sentence]
|
43
43
|
assert_equal text, summary
|
44
44
|
end
|
45
45
|
|
46
46
|
describe 'dictionaries' do
|
47
|
+
before do
|
48
|
+
@text = "j'ai besoin de la crème glacée. il fait trop chaud en australie."
|
49
|
+
end
|
50
|
+
|
47
51
|
it 'should load the french dictionary' do
|
48
|
-
article = OTS.parse(
|
49
|
-
assert_equal "j'ai besoin de la crème glacée.", article.summarize(
|
52
|
+
article = OTS.parse(@text, language: "fr")
|
53
|
+
assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should load the french dictionary given path' do
|
57
|
+
article = OTS.parse(@text, dictionary: File.join(File.dirname(__FILE__), '..', 'dictionaries', 'fr.xml'))
|
58
|
+
assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should raise LoadError on invalid language or dictionaries' do
|
62
|
+
assert_raises(LoadError) do
|
63
|
+
OTS.parse('hello world', language: "xxx")
|
64
|
+
end
|
65
|
+
|
66
|
+
assert_raises(LoadError) do
|
67
|
+
OTS.parse('hello world', dictionary: "xxx")
|
68
|
+
end
|
69
|
+
|
70
|
+
assert_raises(LoadError) do
|
71
|
+
OTS.parse('hello world', dictionary: __FILE__)
|
72
|
+
end
|
50
73
|
end
|
51
74
|
end
|
52
75
|
end
|
data/test/test_ots.rb
CHANGED
@@ -12,12 +12,12 @@ describe 'OTS' do
|
|
12
12
|
end
|
13
13
|
|
14
14
|
it 'should return a list of dictonaries' do
|
15
|
-
|
15
|
+
languages = OTS.languages
|
16
16
|
|
17
17
|
%w(en fr it es de ru).each do |name|
|
18
|
-
assert
|
18
|
+
assert languages.include?(name), "has #{name} language dictionary"
|
19
19
|
end
|
20
20
|
|
21
|
-
assert_empty
|
21
|
+
assert_empty languages.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
|
22
22
|
end
|
23
23
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 4
|
8
|
-
- 4
|
9
|
-
version: 0.4.4
|
7
|
+
- 5
|
8
|
+
- 0
|
9
|
+
version: 0.5.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2012-01-
|
17
|
+
date: 2012-01-10 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|