ots 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,18 +1,18 @@
1
1
  # OTS
2
2
 
3
- ots is an interface to libots - The open text summarizer
3
+ ots is an interface to libots - The [open text summarizer](http://libots.sourceforge.net/).
4
4
 
5
5
  ## Dependencies
6
6
 
7
7
  * ruby 1.9.1 or later
8
8
  * libxml2
9
- * glib2.0
9
+ * glib2.0
10
10
  * homebrew (on MacOSX)
11
11
 
12
12
  ## Installation
13
13
 
14
14
  ### Debian flavors of Linux
15
-
15
+
16
16
  ```
17
17
 
18
18
  # ruby & ruby development libraries (not needed if you use rvm)
@@ -34,6 +34,9 @@ ots is an interface to libots - The open text summarizer
34
34
  # update homebrew to latest & greatest version
35
35
  GIT_SSL_NO_VERIFY=1 brew update
36
36
 
37
+ # optional: macosx normally has libxml2 installed; if not, try
38
+ brew install libxml2
39
+
37
40
  # install glib
38
41
  brew install glib
39
42
 
@@ -47,13 +50,13 @@ ots is an interface to libots - The open text summarizer
47
50
  ```
48
51
  OTS
49
52
  .parse #=> OTS::Article
50
- .dictionaries #=> Array
53
+ .languages #=> Array
51
54
 
52
55
  OTS::Article
53
56
  .new
54
- #summarize #=> Array
57
+ #topics #=> Array
55
58
  #keywords #=> Array
56
- #title #=> String
59
+ #summarize #=> Array
57
60
 
58
61
  ```
59
62
 
@@ -62,17 +65,15 @@ ots is an interface to libots - The open text summarizer
62
65
  ```ruby
63
66
  require 'ots'
64
67
  article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under")
68
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", language: "fr")
69
+ article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", dictionary: "custom.xml")
65
70
 
71
+ article.topics
66
72
  article.keywords
67
- article.summarize(lines: 1)
68
- article.summarize(percent: 50)
69
-
70
- article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
71
- article.keywords
72
- article.summarize(lines: 1)
73
73
  article.summarize(percent: 50)
74
+ article.summarize(sentences: 1)
74
75
 
75
- OTS.dictionaries #=> list of supported dictionaries
76
+ OTS.languages #=> list of supported language dictionaries baked into libots
76
77
  ```
77
78
 
78
79
  ## License
@@ -59,12 +59,13 @@ ots_load_xml_dictionary (OtsArticle * Doc, const char *name)
59
59
  char *local_dict_name;
60
60
 
61
61
  dict_name = g_strdup_printf ("%s%s.xml", DICTIONARY_DIR, name);
62
- local_dict_name = g_strdup_printf ("%s.xml", name);
62
+ local_dict_name = g_strdup_printf ("%s", name);
63
63
 
64
+ if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
65
+ doc = xmlReadFile (local_dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
66
+ if (doc == NULL && g_file_test(dict_name,G_FILE_TEST_EXISTS))
67
+ doc = xmlReadFile (dict_name, 0, XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
64
68
 
65
- if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
66
- doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
67
- if (doc == NULL) doc = xmlParseFile (dict_name);
68
69
  if (doc == NULL) return (FALSE);
69
70
 
70
71
  head = xmlDocGetRootElement (doc);
data/ext/ots.c CHANGED
@@ -30,18 +30,28 @@ void article_load_dictionary(OtsArticle *article, char *name) {
30
30
  }
31
31
 
32
32
  VALUE article_initialize(int argc, VALUE *argv, VALUE self) {
33
- VALUE text, dictionary;
33
+ VALUE text, options, language, dictionary = Qnil;
34
34
  OtsArticle *article = article_handle(self);
35
35
 
36
- rb_scan_args(argc, argv, "11", &text, &dictionary);
36
+ rb_scan_args(argc, argv, "11", &text, &options);
37
+
38
+ language = rb_str_new2("en");
37
39
 
38
40
  if (TYPE(text) != T_STRING)
39
41
  rb_raise(rb_eArgError, "invalid +text+");
40
42
 
41
- if (NIL_P(dictionary))
42
- article_load_dictionary(article, "en");
43
- else
43
+ if (!NIL_P(options)) {
44
+ if (TYPE(options) != T_HASH)
45
+ rb_raise(rb_eArgError, "invalid +options+ hash");
46
+
47
+ dictionary = rb_hash_aref(options, ID2SYM(rb_intern("dictionary")));
48
+ language = rb_hash_aref(options, ID2SYM(rb_intern("language")));
49
+ }
50
+
51
+ if (!NIL_P(dictionary))
44
52
  article_load_dictionary(article, CSTRING(dictionary));
53
+ else
54
+ article_load_dictionary(article, CSTRING(language));
45
55
 
46
56
  ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article);
47
57
  ots_grade_doc(article);
@@ -87,11 +97,11 @@ VALUE article_summarize(VALUE self, VALUE options) {
87
97
  if (TYPE(options) != T_HASH)
88
98
  rb_raise(rb_eArgError, "expect an options hash");
89
99
 
90
- lines = rb_hash_aref(options, ID2SYM(rb_intern("lines")));
100
+ lines = rb_hash_aref(options, ID2SYM(rb_intern("sentences")));
91
101
  percent = rb_hash_aref(options, ID2SYM(rb_intern("percent")));
92
102
 
93
103
  if (NIL_P(lines) && NIL_P(percent))
94
- rb_raise(rb_eArgError, "expect +lines+ or +percent+ to be provided");
104
+ rb_raise(rb_eArgError, "expect +sentences+ or +percent+");
95
105
 
96
106
  if (lines != Qnil)
97
107
  ots_highlight_doc_lines(article, NUM2INT(lines));
@@ -101,9 +111,13 @@ VALUE article_summarize(VALUE self, VALUE options) {
101
111
  return article_summary(article, (rb_encoding *)rb_iv_get(self, "@encoding"));
102
112
  }
103
113
 
104
- VALUE article_title(VALUE self) {
114
+ VALUE article_topics(VALUE self) {
105
115
  OtsArticle *article = article_handle(self);
106
- return (article->title ? rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")) : Qnil);
116
+
117
+ return
118
+ article->title ?
119
+ rb_str_split(rb_enc_str_new2(article->title, (rb_encoding*)rb_iv_get(self, "@encoding")), ",") :
120
+ Qnil;
107
121
  }
108
122
 
109
123
  typedef struct {
@@ -136,16 +150,16 @@ VALUE ots_parse(int argc, VALUE *argv, VALUE self) {
136
150
  return article;
137
151
  }
138
152
 
139
- VALUE ots_dictionaries(VALUE self) {
153
+ VALUE ots_languages(VALUE self) {
140
154
  DIR *dir;
141
155
  struct dirent *entry;
142
- VALUE dictionaries = rb_ary_new();
156
+ VALUE languages = rb_ary_new();
143
157
 
144
158
  if ((dir = opendir(DICTIONARY_DIR))) {
145
159
  while ((entry = readdir(dir))) {
146
160
  // entry->d_type is not portable.
147
161
  if (strstr(entry->d_name, ".xml"))
148
- rb_ary_push(dictionaries, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
162
+ rb_ary_push(languages, rb_str_new(entry->d_name, strlen(entry->d_name) - 4));
149
163
  }
150
164
  }
151
165
  else {
@@ -153,7 +167,7 @@ VALUE ots_dictionaries(VALUE self) {
153
167
  }
154
168
 
155
169
  closedir(dir);
156
- return dictionaries;
170
+ return languages;
157
171
  }
158
172
 
159
173
  /* init */
@@ -164,11 +178,11 @@ void Init_ots(void) {
164
178
 
165
179
  rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1);
166
180
  rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1);
167
- rb_define_method(cArticle, "title", RUBY_METHOD_FUNC(article_title), 0);
181
+ rb_define_method(cArticle, "topics", RUBY_METHOD_FUNC(article_topics), 0);
168
182
  rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0);
169
183
 
170
- rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
171
- rb_define_module_function(mOTS, "dictionaries", RUBY_METHOD_FUNC(ots_dictionaries), 0);
184
+ rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1);
185
+ rb_define_module_function(mOTS, "languages", RUBY_METHOD_FUNC(ots_languages), 0);
172
186
 
173
187
  rb_define_alloc_func(cArticle, article_allocate);
174
188
 
@@ -1,2 +1,2 @@
1
1
  #pragma once
2
- #define RUBY_OTS_VERSION "0.4.4"
2
+ #define RUBY_OTS_VERSION "0.5.0"
@@ -12,8 +12,8 @@ describe 'OTS::Article' do
12
12
  @article = OTS::Article.new(@sample)
13
13
  end
14
14
 
15
- it 'should extract title keywords from given document' do
16
- assert_equal 'species,turtle,subspecies,pacific,atlantic', @article.title
15
+ it 'should extract topic keywords from given document' do
16
+ assert_equal %w(species turtle subspecies pacific atlantic), @article.topics
17
17
  end
18
18
 
19
19
  it 'should extract keywords from given document' do
@@ -27,7 +27,7 @@ describe 'OTS::Article' do
27
27
 
28
28
 
29
29
  it 'should extract keywords from given document' do
30
- lines = @article.summarize(lines: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
30
+ lines = @article.summarize(sentences: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
31
31
  expect = [
32
32
  ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
33
33
  ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
@@ -39,14 +39,37 @@ describe 'OTS::Article' do
39
39
  it 'should utf8 encode strings properly' do
40
40
  text = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
41
41
  article = OTS.parse(text)
42
- summary = article.summarize(lines: 1).first[:sentence]
42
+ summary = article.summarize(sentences: 1).first[:sentence]
43
43
  assert_equal text, summary
44
44
  end
45
45
 
46
46
  describe 'dictionaries' do
47
+ before do
48
+ @text = "j'ai besoin de la crème glacée. il fait trop chaud en australie."
49
+ end
50
+
47
51
  it 'should load the french dictionary' do
48
- article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
49
- assert_equal "j'ai besoin de la crème glacée.", article.summarize(lines: 1).first[:sentence]
52
+ article = OTS.parse(@text, language: "fr")
53
+ assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
54
+ end
55
+
56
+ it 'should load the french dictionary given path' do
57
+ article = OTS.parse(@text, dictionary: File.join(File.dirname(__FILE__), '..', 'dictionaries', 'fr.xml'))
58
+ assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence]
59
+ end
60
+
61
+ it 'should raise LoadError on invalid language or dictionaries' do
62
+ assert_raises(LoadError) do
63
+ OTS.parse('hello world', language: "xxx")
64
+ end
65
+
66
+ assert_raises(LoadError) do
67
+ OTS.parse('hello world', dictionary: "xxx")
68
+ end
69
+
70
+ assert_raises(LoadError) do
71
+ OTS.parse('hello world', dictionary: __FILE__)
72
+ end
50
73
  end
51
74
  end
52
75
  end
@@ -12,12 +12,12 @@ describe 'OTS' do
12
12
  end
13
13
 
14
14
  it 'should return a list of dictonaries' do
15
- dictionaries = OTS.dictionaries
15
+ languages = OTS.languages
16
16
 
17
17
  %w(en fr it es de ru).each do |name|
18
- assert dictionaries.include?(name), "has #{name} dictionary"
18
+ assert languages.include?(name), "has #{name} language dictionary"
19
19
  end
20
20
 
21
- assert_empty dictionaries.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
21
+ assert_empty languages.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
22
22
  end
23
23
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 4
8
- - 4
9
- version: 0.4.4
7
+ - 5
8
+ - 0
9
+ version: 0.5.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2012-01-09 00:00:00 +11:00
17
+ date: 2012-01-10 00:00:00 +11:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency