ots 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,18 +1,18 @@
1
1
  # OTS
2
2
 
3
- ots is an interface to libots - The open text summarizer
3
+ ots is an interface to libots - The [open text summarizer](http://libots.sourceforge.net/).
4
4
 
5
5
  ## Dependencies
6
6
 
7
7
  * ruby 1.9.1 or later
8
8
  * libxml2
9
- * glib2.0
9
+ * glib2.0
10
10
  * homebrew (on MacOSX)
11
11
 
12
12
  ## Installation
13
13
 
14
14
  ### Debian flavors of Linux
15
-
15
+
16
16
  ```
17
17
 
18
18
  # ruby & ruby development libraries (not needed if you use rvm)
@@ -34,6 +34,9 @@ ots is an interface to libots - The open text summarizer
34
34
  # update homebrew to latest & greatest version
35
35
  GIT_SSL_NO_VERIFY=1 brew update
36
36
 
37
+ # optional: MacOSX normally has libxml2 installed; if not, try
38
+ brew install libxml2
39
+
37
40
  # install glib
38
41
  brew install glib
39
42
 
@@ -47,13 +50,13 @@ ots is an interface to libots - The open text summarizer
47
50
  ```
48
51
  OTS
49
52
  .parse #=> OTS::Article
50
- .dictionaries #=> Array
53
+ .languages #=> Array
51
54
 
52
55
  OTS::Article
53
56
  .new
54
- #summarize #=> Array
57
+ #topics #=> Array
55
58
  #keywords #=> Array
56
- #title #=> String
59
+ #summarize #=> Array
57
60
 
58
61
  ```
59
62
 
@@ -62,17 +65,15 @@ ots is an interface to libots - The open text summarizer
62
65
  ```ruby
63
66
  require 'ots'
64
67
  article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
68
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", language: "fr")
69
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", dictionary: "custom.xml")
65
70
 
71
+ article.topics
66
72
  article.keywords
67
- article.summarize(lines: 1)
68
- article.summarize(percent: 50)
69
-
70
- article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
71
- article.keywords
72
- article.summarize(lines: 1)
73
73
  article.summarize(percent: 50)
74
+ article.summarize(sentences: 1)
74
75
 
75
- OTS.dictionaries #=> list of supported dictionaries
76
+ OTS.languages #=> list of supported language dictionaries baked into libots
76
77
  ```
77
78
 
78
79
  ## License
@@ -59,12 +59,13 @@ ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
59
59
  char *local_dict_name;
60
60
 
61
61
  dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
62
- local_dict_name = g_strdup_printf ("%s.xml", name);
62
+ local_dict_name = g_strdup_printf ("%s", name);
63
63
 
64
+ if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
65
+ doc = xmlReadFile (local_dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
66
+ if (doc == NULL && g_file_test(dict_name,G_FILE_TEST_EXISTS))
67
+ doc = xmlReadFile (dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
64
68
 
65
- if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
66
- doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
67
- if (doc == NULL) doc = xmlParseFile (dict_name);
68
69
  if (doc == NULL) return (FALSE);
69
70
 
70
71
  head = xmlDocGetRootElement (doc);
data/ext/ots.c CHANGED
@@ -30,18 +30,28 @@ void article_load_dictionary(OtsArticle *article, char *name) {
30
30
  }
31
31
 
32
32
  VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
33
- VALUE text, dictionary;
33
+ VALUE text, options, language, dictionary = Qnil;
34
34
  OtsArticle *article = article_handle(self);
35
35
 
36
- rb_scan_args(argc, argv, "11", &text, &dictionary);
36
+ rb_scan_args(argc, argv, "11", &text, &options);
37
+
38
+ language = rb_str_new2("en");
37
39
 
38
40
  if (TYPE(text) != T_STRING)
39
41
  rb_raise(rb_eArgError, "invalid +text+");
40
42
 
41
- if (NIL_P(dictionary))
42
- article_load_dictionary(article, "en");
43
- else
43
+ if (!NIL_P(options)) {
44
+ if (TYPE(options) != T_HASH)
45
+ rb_raise(rb_eArgError, "invalid +options+ hash");
46
+
47
+ dictionary = rb_hash_aref(options, ID2SYM(rb_intern("dictionary")));
48
+ language = rb_hash_aref(options, ID2SYM(rb_intern("language")));
49
+ }
50
+
51
+ if (!NIL_P(dictionary))
44
52
  article_load_dictionary(article, CSTRING(dictionary));
53
+ else
54
+ article_load_dictionary(article, CSTRING(language));
45
55
 
46
56
  ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
47
57
  ots_grade_doc(article);
@@ -87,11 +97,11 @@ VALUE article_summarize(VALUE self, VALUE options) {
87
97
  if (TYPE(options) != T_HASH)
88
98
  rb_raise(rb_eArgError, "expect an options hash");
89
99
 
90
- lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
100
+ lines = rb_hash_aref(options, ID2SYM(rb_intern("sentences")));
91
101
  percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
92
102
 
93
103
  if (NIL_P(lines) && NIL_P(percent))
94
- rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
104
+ rb_raise(rb_eArgError, "expect +sentences+ or +percent+");
95
105
 
96
106
  if (lines != Qnil)
97
107
  ots_highlight_doc_lines(article, NUM2INT(lines));
@@ -101,9 +111,13 @@ VALUE article_summarize(VALUE self, VALUE options) {
101
111
  return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
102
112
  }
103
113
 
104
- VALUE article_title(VALUE self) {
114
+ VALUE article_topics(VALUE self) {
105
115
  OtsArticle *article = article_handle(self);
106
- return (article->title ? rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")) : Qnil);
116
+
117
+ return
118
+ article->title ?
119
+ rb_str_split(rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")), ",") :
120
+ Qnil;
107
121
  }
108
122
 
109
123
  typedef struct {
@@ -136,16 +150,16 @@ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
136
150
  return article;
137
151
  }
138
152
 
139
- VALUE ots_dictionaries(VALUE self) {
153
+ VALUE ots_languages(VALUE self) {
140
154
  DIR *dir;
141
155
  struct dirent *entry;
142
- VALUE dictionaries = rb_ary_new();
156
+ VALUE languages = rb_ary_new();
143
157
 
144
158
  if ((dir = opendir(DICTIONARY_DIR))) {
145
159
  while ((entry = readdir(dir))) {
146
160
  // entry->d_type is not portable.
147
161
  if (strstr(entry->d_name, ".xml"))
148
- rb_ary_push(dictionaries, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
162
+ rb_ary_push(languages, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
149
163
  }
150
164
  }
151
165
  else {
@@ -153,7 +167,7 @@ VALUE ots_dictionaries(VALUE self) {
153
167
  }
154
168
 
155
169
  closedir(dir);
156
- return dictionaries;
170
+ return languages;
157
171
  }
158
172
 
159
173
  /* init */
@@ -164,11 +178,11 @@ void Init_ots(void) {
164
178
 
165
179
  rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
166
180
  rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
167
- rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
181
+ rb_define_method(cArticle, "topics", RUBY_METHOD_FUNC(article_topics), 0);
168
182
  rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
169
183
 
170
- rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
171
- rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
184
+ rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
185
+ rb_define_module_function(mOTS, "languages", RUBY_METHOD_FUNC(ots_languages), 0);
172
186
 
173
187
  rb_define_alloc_func(cArticle, article_allocate);
174
188
 
@@ -1,2 +1,2 @@
1
1
  #pragma once
2
- #define RUBY_OTS_VERSION "0.4.4"
2
+ #define RUBY_OTS_VERSION "0.5.0"
@@ -12,8 +12,8 @@ describe 'OTS::Article' do
12
12
  @article = OTS::Article.new(@sample)
13
13
  end
14
14
 
15
- it 'should extract title keywords from given document' do
16
- assert_equal 'species,turtle,subspecies,pacific,atlantic', @article.title
15
+ it 'should extract topic keywords from given document' do
16
+ assert_equal %w(species turtle subspecies pacific atlantic), @article.topics
17
17
  end
18
18
 
19
19
  it 'should extract keywords from given document' do
@@ -27,7 +27,7 @@ describe 'OTS::Article' do
27
27
 
28
28
 
29
29
  it 'should extract keywords from given document' do
30
- lines = @article.summarize(lines: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
30
+ lines = @article.summarize(sentences: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
31
31
  expect = [
32
32
  ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
33
33
  ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
@@ -39,14 +39,37 @@ describe 'OTS::Article' do
39
39
  it 'should utf8 encode strings properly' do
40
40
  text = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
41
41
  article = OTS.parse(text)
42
- summary = article.summarize(lines: 1).first[:sentence]
42
+ summary = article.summarize(sentences: 1).first[:sentence]
43
43
  assert_equal text, summary
44
44
  end
45
45
 
46
46
  describe 'dictionaries' do
47
+ before do
48
+ @text = "j'ai besoin de la crème glacée. il fait trop chaud en australie."
49
+ end
50
+
47
51
  it 'should load the french dictionary' do
48
- article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
49
- assert_equal "j'ai besoin de la crème glacée.", article.summarize(lines: 1).first[:sentence]
52
+ article = OTS.parse(@text, language: "fr")
53
+ assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
54
+ end
55
+
56
+ it 'should load the french dictionary given path' do
57
+ article = OTS.parse(@text, dictionary: File.join(File.dirname(__FILE__), '..', 'dictionaries', 'fr.xml'))
58
+ assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
59
+ end
60
+
61
+ it 'should raise LoadError on invalid language or dictionaries' do
62
+ assert_raises(LoadError) do
63
+ OTS.parse('hello world', language: "xxx")
64
+ end
65
+
66
+ assert_raises(LoadError) do
67
+ OTS.parse('hello world', dictionary: "xxx")
68
+ end
69
+
70
+ assert_raises(LoadError) do
71
+ OTS.parse('hello world', dictionary: __FILE__)
72
+ end
50
73
  end
51
74
  end
52
75
  end
@@ -12,12 +12,12 @@ describe 'OTS' do
12
12
  end
13
13
 
14
14
  it 'should return a list of dictonaries' do
15
- dictionaries = OTS.dictionaries
15
+ languages = OTS.languages
16
16
 
17
17
  %w(en fr it es de ru).each do |name|
18
- assert dictionaries.include?(name), "has #{name} dictionary"
18
+ assert languages.include?(name), "has #{name} language dictionary"
19
19
  end
20
20
 
21
- assert_empty dictionaries.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
21
+ assert_empty languages.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
22
22
  end
23
23
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 4
8
- - 4
9
- version: 0.4.4
7
+ - 5
8
+ - 0
9
+ version: 0.5.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2012-01-09 00:00:00 +11:00
17
+ date: 2012-01-10 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency