ots 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,98 @@
1
+ /*
2
+ * text.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+
27
+ unsigned char *
28
+ ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
29
+ {
30
+ GList *li;
31
+ GString *text;
32
+ unsigned char *utf8_data;
33
+
34
+ if (!(aLine))
35
+ return NULL;
36
+
37
+ text = g_string_new (NULL);
38
+
39
+ if (!only_if_selected || aLine->selected)
40
+ {
41
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
42
+ if (li->data && strlen (li->data)) /*if word exists*/
43
+ g_string_append (text, (char *) li->data);
44
+
45
+ }
46
+
47
+ if (out_size)
48
+ *out_size = text->len;
49
+
50
+ utf8_data = text->str;
51
+ g_string_free (text, FALSE);
52
+
53
+ return utf8_data;
54
+ }
55
+
56
+ static void
57
+ ots_print_line (FILE * stream, const OtsSentence * aLine)
58
+ {
59
+ unsigned char *utf8_txt;
60
+ size_t len;
61
+ utf8_txt = ots_get_line_text (aLine, TRUE, &len);
62
+ fwrite (utf8_txt, 1, len, stream);
63
+ g_free (utf8_txt);
64
+ }
65
+
66
+ unsigned char *
67
+ ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
68
+ {
69
+ GList *li;
70
+ GString *text;
71
+ unsigned char *utf8_data;
72
+ size_t line_len;
73
+
74
+ text = g_string_new (NULL);
75
+
76
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
77
+ {
78
+ utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
79
+ g_string_append_len (text, utf8_data, line_len);
80
+ g_free (utf8_data);
81
+ }
82
+
83
+ if (out_len)
84
+ *out_len = text->len;
85
+ utf8_data = text->str;
86
+
87
+ g_string_free (text, FALSE);
88
+ return utf8_data;
89
+ }
90
+
91
+ void
92
+ ots_print_doc (FILE * stream, const OtsArticle * Doc)
93
+ {
94
+ GList *li;
95
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next) /* for each line in Article Do: */
96
+ ots_print_line (stream, (OtsSentence *) li->data);
97
+ fputc ('\n', stream);
98
+ }
@@ -0,0 +1,2 @@
1
+ #pragma once
2
+ #define RUBY_OTS_VERSION "0.4.4"
@@ -0,0 +1,220 @@
1
+ /*
2
+ * wordlist.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ /*word lists manipulations , mainly for grader-tc */
29
+
30
+ OtsWordEntery *
31
+ ots_new_wordEntery_strip(unsigned const char *wordString,const OtsStemRule *rule) /*for real text use*/
32
+ {
33
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
34
+ aWord->occ = 1;
35
+ aWord->word = ots_stem_format(wordString,rule);
36
+ aWord->stem = ots_stem_strip(wordString,rule);
37
+ return aWord;
38
+ }
39
+
40
+ OtsWordEntery *
41
+ ots_new_wordEntery (unsigned const char *wordString) /*for dictionary use only, no formating here*/
42
+ {
43
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
44
+ aWord->occ = 1;
45
+ aWord->word = g_strdup (wordString);
46
+ aWord->stem = g_strdup (wordString);
47
+ return aWord;
48
+ }
49
+
50
+
51
+ void
52
+ ots_free_wordEntery (OtsWordEntery * WC)
53
+ {
54
+ if (WC != NULL)
55
+ {
56
+ if (NULL!=WC->word) g_free (WC->word);
57
+ if (NULL!=WC->stem) g_free (WC->stem);
58
+ g_free (WC);
59
+ }
60
+ }
61
+
62
+ void
63
+ ots_free_wordlist (GList * aList)
64
+ {
65
+ if (aList != NULL)
66
+ {
67
+ g_list_foreach(aList,(GFunc)ots_free_wordEntery , NULL);
68
+ g_list_free(aList);
69
+ }
70
+ }
71
+
72
+ OtsWordEntery *
73
+ ots_copy_wordEntery (OtsWordEntery * obj)
74
+ {
75
+ OtsWordEntery *aWord;
76
+ if (obj == NULL) { return NULL;}
77
+ aWord = g_new (OtsWordEntery, 1);
78
+ aWord->occ = obj->occ;
79
+ aWord->word = g_strdup (obj->word);
80
+ if (NULL!=obj->stem)
81
+ {aWord->stem = g_strdup (obj->stem);} else {aWord->stem=NULL;}
82
+ return aWord;
83
+ }
84
+
85
+ static int
86
+ ots_sort_handler (OtsWordEntery * node1, OtsWordEntery * node2)
87
+ {
88
+ if (node1->occ > node2->occ)
89
+ return -1;
90
+ if (node1->occ < node2->occ)
91
+ return 1;
92
+ return 0;
93
+ }
94
+
95
+ GList *
96
+ ots_sort_list (GList* aList)
97
+ {
98
+ GList *newList;
99
+ newList = g_list_sort (aList, (GCompareFunc) ots_sort_handler); /* sort article */
100
+ return newList;
101
+ }
102
+
103
+ GList *
104
+ ots_union_list (const GList *aLst, const GList * bLst)
105
+ {
106
+ GList *li;
107
+ GList *di;
108
+ int insert;
109
+ GList *newLst=NULL;
110
+
111
+ for (li = (GList *) aLst; li != NULL; li = li->next)
112
+ {
113
+ insert = 1;
114
+ for (di = (GList *) bLst; di != NULL; di = di->next)
115
+ {
116
+ if(( li->data) && (di->data) && (((OtsWordEntery *) li->data)->word) && (((OtsWordEntery *) di->data)->word)) /*all defined?*/
117
+ if (0 == g_strncasecmp ((((OtsWordEntery *) li->data)->word), /*fix me: unicode issue?*/
118
+ (((OtsWordEntery *) di->data)->word), 10))
119
+ insert = 0; /* if word in B */
120
+
121
+ }
122
+ if (insert == 1)
123
+ if ((li->data))
124
+ newLst = g_list_append (newLst,ots_copy_wordEntery ((OtsWordEntery *) li->data));
125
+ }
126
+
127
+ return newLst;
128
+ }
129
+
130
+
131
+ char *
132
+ ots_word_in_list (const GList *aList,const int index) /* return the String value of the n'th word */
133
+ {
134
+ OtsWordEntery *obj = NULL;
135
+
136
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
137
+ if (item != NULL) obj = item->data;
138
+ if (obj == NULL)
139
+ {
140
+ return NULL;
141
+ }
142
+ else
143
+ return obj->word;
144
+ }
145
+
146
+ char *
147
+ ots_stem_in_list (const GList *aList,const int index) /* return the String value of stem of the n'th word */
148
+ {
149
+ OtsWordEntery *obj = NULL;
150
+
151
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
152
+ if (item != NULL) obj = item->data;
153
+ if (obj == NULL)
154
+ {
155
+ return NULL;
156
+ }
157
+ else
158
+ return obj->stem;
159
+ }
160
+
161
+ /*Adds a word to the word count of the article*/
162
+ void
163
+ ots_add_wordstat (OtsArticle * Doc,
164
+ unsigned const char *wordString)
165
+ {
166
+ GList *li;
167
+ OtsWordEntery *stat;
168
+ OtsStemRule * rule=Doc->stem;
169
+ char *tmp = NULL;
170
+
171
+ if (NULL==wordString) return;
172
+ if (NULL==Doc) return;
173
+
174
+ if (0==strlen(wordString)) return;
175
+ if (0==strcmp(wordString," ")) return;
176
+ if (0==strcmp(wordString,"\n")) return;
177
+ if (0==strcmp(wordString,"\t")) return;
178
+
179
+ if (wordString)
180
+ tmp = ots_stem_strip (wordString, rule);
181
+
182
+ for (li = (GList *) Doc->wordStat; li != NULL; li = li->next) /* search the word in current wordlist */
183
+ {
184
+ if (li->data)
185
+ if (0 == strcmp (tmp, ((OtsWordEntery *) li->data)->stem))
186
+ {
187
+ ((OtsWordEntery *) li->data)->occ++; /* occurred in another place in the text now; */
188
+ g_free (tmp);
189
+
190
+ /*printf for debug*/
191
+ /*
192
+ if (0!=strcmp(((OtsWordEntery *) li->data)->word,wordString) )
193
+ printf("[%s]==[%s]\n",((OtsWordEntery *) li->data)->word,wordString);
194
+ */
195
+
196
+ return;
197
+ }
198
+ }
199
+
200
+ stat = ots_new_wordEntery_strip (wordString, rule); /* if not in list , Add stem it to the list */
201
+ if ((stat))
202
+ Doc->wordStat = g_list_prepend (Doc->wordStat, stat);
203
+ g_free (tmp);
204
+ return;
205
+ }
206
+
207
+
208
+
209
+
210
+ void
211
+ ots_print_wordlist (FILE * stream, const GList * aList)
212
+ {
213
+ GList *li;
214
+ for (li = (GList *) aList; li != NULL; li = li->next)
215
+ fprintf (stream, "Word[%d][%s]\n", ((OtsWordEntery *) li->data)->occ,
216
+ ((OtsWordEntery *) li->data)->word);
217
+ }
218
+
219
+
220
+
@@ -0,0 +1,3 @@
1
+ require 'ots'
2
+ require 'minitest/spec'
3
+ require 'minitest/autorun'
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
# Specs for OTS::Article: title and keyword extraction, summarization,
# UTF-8 handling and loading of a non-default dictionary.
describe 'OTS::Article' do
  before do
    @sample = <<-TEXT
      The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
      It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
      Pacific subspecies.
    TEXT

    @article = OTS::Article.new(@sample)
  end

  it 'should extract title keywords from given document' do
    assert_equal 'species,turtle,subspecies,pacific,atlantic', @article.title
  end

  it 'should extract keywords from given document' do
    expected = %w{
      species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
      belonging sea endangered critically hawksbill
    }

    assert_equal expected, @article.keywords
  end

  # Previously this spec reused the description of the keywords spec above,
  # which made a failure report point at the wrong behaviour.
  it 'should summarize given document' do
    # Whitespace is normalized so the comparison does not depend on how the
    # sample text is wrapped or indented.
    lines    = @article.summarize(lines: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
    expected = [
      ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
      ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
    ]

    assert_equal expected, lines
  end

  it 'should utf8 encode strings properly' do
    text    = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
    article = OTS.parse(text)
    summary = article.summarize(lines: 1).first[:sentence]
    assert_equal text, summary
  end

  describe 'dictionaries' do
    it 'should load the french dictionary' do
      article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
      assert_equal "j'ai besoin de la crème glacée.", article.summarize(lines: 1).first[:sentence]
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'helper'
2
+
3
# Specs for the module-level OTS API: parse() and dictionaries.
describe 'OTS' do
  it 'parse() should return an article instance' do
    # Plain assertion, matching the rest of the suite; calling must_* directly
    # on an arbitrary object is deprecated in Minitest 5.
    assert_kind_of OTS::Article, OTS.parse("hello world")
  end

  it 'parse() should raise ArgumentError on invalid text' do
    assert_raises(ArgumentError) do
      OTS.parse(1)
    end
  end

  it 'should return a list of dictionaries' do
    dictionaries = OTS.dictionaries

    %w(en fr it es de ru).each do |name|
      assert dictionaries.include?(name), "has #{name} dictionary"
    end

    # Every dictionary name is expected to be a two-letter language code.
    assert_empty dictionaries.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
  end
end
metadata CHANGED
@@ -1,64 +1,148 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: ots
3
- version: !ruby/object:Gem::Version
4
- version: 0.4.3
5
- prerelease:
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 4
9
+ version: 0.4.4
6
10
  platform: ruby
7
- authors:
11
+ authors:
8
12
  - Bharanee Rathna
9
13
  autorequire:
10
14
  bindir: bin
11
15
  cert_chain: []
12
- date: 2011-05-23 00:00:00.000000000Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: shoulda
16
- requirement: &17368280 !ruby/object:Gem::Requirement
16
+
17
+ date: 2012-01-09 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rake
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
17
24
  none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '2.10'
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
22
31
  type: :development
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rake-compiler
23
35
  prerelease: false
24
- version_requirements: *17368280
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :development
45
+ version_requirements: *id002
25
46
  description: Ruby interface to libots libraries for unix.
26
- email: deepfryed@gmail.com
47
+ email:
48
+ - deepfryed@gmail.com
27
49
  executables: []
28
- extensions:
50
+
51
+ extensions:
29
52
  - ext/extconf.rb
30
- extra_rdoc_files:
31
- - README
32
- files:
33
- - README
34
- - VERSION
53
+ extra_rdoc_files: []
54
+
55
+ files:
56
+ - ext/text.c
57
+ - ext/grader-tf.c
58
+ - ext/stemmer.c
59
+ - ext/article.c
60
+ - ext/grader-tc.c
61
+ - ext/html.c
62
+ - ext/grader.c
35
63
  - ext/ots.c
36
- - lib/ots.rb
37
- - test/ots_test.rb
64
+ - ext/relations.c
65
+ - ext/parser.c
66
+ - ext/dictionary.c
67
+ - ext/highlighter.c
68
+ - ext/wordlist.c
69
+ - ext/grader-tc.h
70
+ - ext/ots.h
71
+ - ext/libots.h
72
+ - ext/version.h
38
73
  - ext/extconf.rb
74
+ - test/test_article.rb
75
+ - test/test_ots.rb
76
+ - test/helper.rb
77
+ - README.md
78
+ - dictionaries/cy.xml
79
+ - dictionaries/tr.xml
80
+ - dictionaries/fr.xml
81
+ - dictionaries/yi.xml
82
+ - dictionaries/ms.xml
83
+ - dictionaries/ia.xml
84
+ - dictionaries/lv.xml
85
+ - dictionaries/gl.xml
86
+ - dictionaries/cs.xml
87
+ - dictionaries/sv.xml
88
+ - dictionaries/is.xml
89
+ - dictionaries/fi.xml
90
+ - dictionaries/bg.xml
91
+ - dictionaries/uk.xml
92
+ - dictionaries/et.xml
93
+ - dictionaries/tl.xml
94
+ - dictionaries/da.xml
95
+ - dictionaries/it.xml
96
+ - dictionaries/ru.xml
97
+ - dictionaries/nl.xml
98
+ - dictionaries/eo.xml
99
+ - dictionaries/mi.xml
100
+ - dictionaries/ro.xml
101
+ - dictionaries/pl.xml
102
+ - dictionaries/ga.xml
103
+ - dictionaries/he.xml
104
+ - dictionaries/mt.xml
105
+ - dictionaries/eu.xml
106
+ - dictionaries/hu.xml
107
+ - dictionaries/en.xml
108
+ - dictionaries/de.xml
109
+ - dictionaries/el.xml
110
+ - dictionaries/pt.xml
111
+ - dictionaries/ca.xml
112
+ - dictionaries/es.xml
113
+ - dictionaries/nn.xml
114
+ - dictionaries/id.xml
115
+ has_rdoc: true
39
116
  homepage: http://github.com/deepfryed/ots
40
117
  licenses: []
118
+
41
119
  post_install_message:
42
120
  rdoc_options: []
43
- require_paths:
121
+
122
+ require_paths:
44
123
  - lib
45
- required_ruby_version: !ruby/object:Gem::Requirement
124
+ required_ruby_version: !ruby/object:Gem::Requirement
46
125
  none: false
47
- requirements:
48
- - - ! '>='
49
- - !ruby/object:Gem::Version
50
- version: '0'
51
- required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ segments:
130
+ - 0
131
+ version: "0"
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
133
  none: false
53
- requirements:
54
- - - ! '>='
55
- - !ruby/object:Gem::Version
56
- version: '0'
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ segments:
138
+ - 0
139
+ version: "0"
57
140
  requirements: []
141
+
58
142
  rubyforge_project:
59
- rubygems_version: 1.8.2
143
+ rubygems_version: 1.3.7
60
144
  signing_key:
61
145
  specification_version: 3
62
146
  summary: Open Text Summarizer interface for Ruby.
63
- test_files:
64
- - test/ots_test.rb
147
+ test_files: []
148
+