ots 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. data/README.md +80 -0
  2. data/dictionaries/bg.xml +101 -0
  3. data/dictionaries/ca.xml +141 -0
  4. data/dictionaries/cs.xml +161 -0
  5. data/dictionaries/cy.xml +118 -0
  6. data/dictionaries/da.xml +129 -0
  7. data/dictionaries/de.xml +354 -0
  8. data/dictionaries/el.xml +80 -0
  9. data/dictionaries/en.xml +606 -0
  10. data/dictionaries/eo.xml +171 -0
  11. data/dictionaries/es.xml +369 -0
  12. data/dictionaries/et.xml +172 -0
  13. data/dictionaries/eu.xml +77 -0
  14. data/dictionaries/fi.xml +105 -0
  15. data/dictionaries/fr.xml +199 -0
  16. data/dictionaries/ga.xml +124 -0
  17. data/dictionaries/gl.xml +290 -0
  18. data/dictionaries/he.xml +334 -0
  19. data/dictionaries/hu.xml +280 -0
  20. data/dictionaries/ia.xml +97 -0
  21. data/dictionaries/id.xml +75 -0
  22. data/dictionaries/is.xml +201 -0
  23. data/dictionaries/it.xml +206 -0
  24. data/dictionaries/lv.xml +77 -0
  25. data/dictionaries/mi.xml +76 -0
  26. data/dictionaries/ms.xml +160 -0
  27. data/dictionaries/mt.xml +73 -0
  28. data/dictionaries/nl.xml +245 -0
  29. data/dictionaries/nn.xml +264 -0
  30. data/dictionaries/pl.xml +92 -0
  31. data/dictionaries/pt.xml +365 -0
  32. data/dictionaries/ro.xml +163 -0
  33. data/dictionaries/ru.xml +150 -0
  34. data/dictionaries/sv.xml +255 -0
  35. data/dictionaries/tl.xml +67 -0
  36. data/dictionaries/tr.xml +65 -0
  37. data/dictionaries/uk.xml +98 -0
  38. data/dictionaries/yi.xml +293 -0
  39. data/ext/article.c +119 -0
  40. data/ext/dictionary.c +335 -0
  41. data/ext/extconf.rb +13 -14
  42. data/ext/grader-tc.c +185 -0
  43. data/ext/grader-tc.h +64 -0
  44. data/ext/grader-tf.c +116 -0
  45. data/ext/grader.c +85 -0
  46. data/ext/highlighter.c +128 -0
  47. data/ext/html.c +131 -0
  48. data/ext/libots.h +158 -0
  49. data/ext/ots.c +130 -151
  50. data/ext/ots.h +15 -0
  51. data/ext/parser.c +173 -0
  52. data/ext/relations.c +163 -0
  53. data/ext/stemmer.c +332 -0
  54. data/ext/text.c +98 -0
  55. data/ext/version.h +2 -0
  56. data/ext/wordlist.c +220 -0
  57. data/test/helper.rb +3 -0
  58. data/test/test_article.rb +52 -0
  59. data/test/test_ots.rb +23 -0
  60. metadata +122 -38
  61. data/README +0 -25
  62. data/VERSION +0 -1
  63. data/lib/ots.rb +0 -1
  64. data/test/ots_test.rb +0 -62
@@ -0,0 +1,98 @@
1
+ /*
2
+ * text.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+
27
+ unsigned char *
28
+ ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size)
29
+ {
30
+ GList *li;
31
+ GString *text;
32
+ unsigned char *utf8_data;
33
+
34
+ if (!(aLine))
35
+ return NULL;
36
+
37
+ text = g_string_new (NULL);
38
+
39
+ if (!only_if_selected || aLine->selected)
40
+ {
41
+ for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */
42
+ if (li->data && strlen (li->data)) /*if word exists*/
43
+ g_string_append (text, (char *) li->data);
44
+
45
+ }
46
+
47
+ if (out_size)
48
+ *out_size = text->len;
49
+
50
+ utf8_data = text->str;
51
+ g_string_free (text, FALSE);
52
+
53
+ return utf8_data;
54
+ }
55
+
56
+ static void
57
+ ots_print_line (FILE * stream, const OtsSentence * aLine)
58
+ {
59
+ unsigned char *utf8_txt;
60
+ size_t len;
61
+ utf8_txt = ots_get_line_text (aLine, TRUE, &len);
62
+ fwrite (utf8_txt, 1, len, stream);
63
+ g_free (utf8_txt);
64
+ }
65
+
66
+ unsigned char *
67
+ ots_get_doc_text (const OtsArticle * Doc, size_t * out_len)
68
+ {
69
+ GList *li;
70
+ GString *text;
71
+ unsigned char *utf8_data;
72
+ size_t line_len;
73
+
74
+ text = g_string_new (NULL);
75
+
76
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next)
77
+ {
78
+ utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len);
79
+ g_string_append_len (text, utf8_data, line_len);
80
+ g_free (utf8_data);
81
+ }
82
+
83
+ if (out_len)
84
+ *out_len = text->len;
85
+ utf8_data = text->str;
86
+
87
+ g_string_free (text, FALSE);
88
+ return utf8_data;
89
+ }
90
+
91
+ void
92
+ ots_print_doc (FILE * stream, const OtsArticle * Doc)
93
+ {
94
+ GList *li;
95
+ for (li = (GList *) Doc->lines; li != NULL; li = li->next) /* for each line in Article Do: */
96
+ ots_print_line (stream, (OtsSentence *) li->data);
97
+ fputc ('\n', stream);
98
+ }
@@ -0,0 +1,2 @@
1
+ #pragma once
2
+ #define RUBY_OTS_VERSION "0.4.4"
@@ -0,0 +1,220 @@
1
+ /*
2
+ * wordlist.c
3
+ *
4
+ * Copyright (C) 2003 Nadav Rotem <nadav256@hotmail.com>
5
+ *
6
+ * This program is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * This program is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ * GNU Library General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License
17
+ * along with this program; if not, write to the Free Software
18
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include "libots.h"
26
+ #include "grader-tc.h"
27
+
28
+ /* Word-list manipulation helpers, used mainly by grader-tc. */
29
+
30
+ OtsWordEntery *
31
+ ots_new_wordEntery_strip(unsigned const char *wordString,const OtsStemRule *rule) /*for real text use*/
32
+ {
33
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
34
+ aWord->occ = 1;
35
+ aWord->word = ots_stem_format(wordString,rule);
36
+ aWord->stem = ots_stem_strip(wordString,rule);
37
+ return aWord;
38
+ }
39
+
40
+ OtsWordEntery *
41
+ ots_new_wordEntery (unsigned const char *wordString) /*for dictionary use only, no formating here*/
42
+ {
43
+ OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1);
44
+ aWord->occ = 1;
45
+ aWord->word = g_strdup (wordString);
46
+ aWord->stem = g_strdup (wordString);
47
+ return aWord;
48
+ }
49
+
50
+
51
+ void
52
+ ots_free_wordEntery (OtsWordEntery * WC)
53
+ {
54
+ if (WC != NULL)
55
+ {
56
+ if (NULL!=WC->word) g_free (WC->word);
57
+ if (NULL!=WC->stem) g_free (WC->stem);
58
+ g_free (WC);
59
+ }
60
+ }
61
+
62
+ void
63
+ ots_free_wordlist (GList * aList)
64
+ {
65
+ if (aList != NULL)
66
+ {
67
+ g_list_foreach(aList,(GFunc)ots_free_wordEntery , NULL);
68
+ g_list_free(aList);
69
+ }
70
+ }
71
+
72
+ OtsWordEntery *
73
+ ots_copy_wordEntery (OtsWordEntery * obj)
74
+ {
75
+ OtsWordEntery *aWord;
76
+ if (obj == NULL) { return NULL;}
77
+ aWord = g_new (OtsWordEntery, 1);
78
+ aWord->occ = obj->occ;
79
+ aWord->word = g_strdup (obj->word);
80
+ if (NULL!=obj->stem)
81
+ {aWord->stem = g_strdup (obj->stem);} else {aWord->stem=NULL;}
82
+ return aWord;
83
+ }
84
+
85
+ static int
86
+ ots_sort_handler (OtsWordEntery * node1, OtsWordEntery * node2)
87
+ {
88
+ if (node1->occ > node2->occ)
89
+ return -1;
90
+ if (node1->occ < node2->occ)
91
+ return 1;
92
+ return 0;
93
+ }
94
+
95
+ GList *
96
+ ots_sort_list (GList* aList)
97
+ {
98
+ GList *newList;
99
+ newList = g_list_sort (aList, (GCompareFunc) ots_sort_handler); /* sort article */
100
+ return newList;
101
+ }
102
+
103
+ GList *
104
+ ots_union_list (const GList *aLst, const GList * bLst)
105
+ {
106
+ GList *li;
107
+ GList *di;
108
+ int insert;
109
+ GList *newLst=NULL;
110
+
111
+ for (li = (GList *) aLst; li != NULL; li = li->next)
112
+ {
113
+ insert = 1;
114
+ for (di = (GList *) bLst; di != NULL; di = di->next)
115
+ {
116
+ if(( li->data) && (di->data) && (((OtsWordEntery *) li->data)->word) && (((OtsWordEntery *) di->data)->word)) /*all defined?*/
117
+ if (0 == g_strncasecmp ((((OtsWordEntery *) li->data)->word), /*fix me: unicode issue?*/
118
+ (((OtsWordEntery *) di->data)->word), 10))
119
+ insert = 0; /* if word in B */
120
+
121
+ }
122
+ if (insert == 1)
123
+ if ((li->data))
124
+ newLst = g_list_append (newLst,ots_copy_wordEntery ((OtsWordEntery *) li->data));
125
+ }
126
+
127
+ return newLst;
128
+ }
129
+
130
+
131
+ char *
132
+ ots_word_in_list (const GList *aList,const int index) /* return the String value of the n'th word */
133
+ {
134
+ OtsWordEntery *obj = NULL;
135
+
136
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
137
+ if (item != NULL) obj = item->data;
138
+ if (obj == NULL)
139
+ {
140
+ return NULL;
141
+ }
142
+ else
143
+ return obj->word;
144
+ }
145
+
146
+ char *
147
+ ots_stem_in_list (const GList *aList,const int index) /* return the String value of stem of the n'th word */
148
+ {
149
+ OtsWordEntery *obj = NULL;
150
+
151
+ GList *item =(GList *)g_list_nth ((GList *)aList, index);
152
+ if (item != NULL) obj = item->data;
153
+ if (obj == NULL)
154
+ {
155
+ return NULL;
156
+ }
157
+ else
158
+ return obj->stem;
159
+ }
160
+
161
+ /*Adds a word to the word count of the article*/
162
+ void
163
+ ots_add_wordstat (OtsArticle * Doc,
164
+ unsigned const char *wordString)
165
+ {
166
+ GList *li;
167
+ OtsWordEntery *stat;
168
+ OtsStemRule * rule=Doc->stem;
169
+ char *tmp = NULL;
170
+
171
+ if (NULL==wordString) return;
172
+ if (NULL==Doc) return;
173
+
174
+ if (0==strlen(wordString)) return;
175
+ if (0==strcmp(wordString," ")) return;
176
+ if (0==strcmp(wordString,"\n")) return;
177
+ if (0==strcmp(wordString,"\t")) return;
178
+
179
+ if (wordString)
180
+ tmp = ots_stem_strip (wordString, rule);
181
+
182
+ for (li = (GList *) Doc->wordStat; li != NULL; li = li->next) /* search the word in current wordlist */
183
+ {
184
+ if (li->data)
185
+ if (0 == strcmp (tmp, ((OtsWordEntery *) li->data)->stem))
186
+ {
187
+ ((OtsWordEntery *) li->data)->occ++; /* occurred in another place in the text now; */
188
+ g_free (tmp);
189
+
190
+ /*printf for debug*/
191
+ /*
192
+ if (0!=strcmp(((OtsWordEntery *) li->data)->word,wordString) )
193
+ printf("[%s]==[%s]\n",((OtsWordEntery *) li->data)->word,wordString);
194
+ */
195
+
196
+ return;
197
+ }
198
+ }
199
+
200
+ stat = ots_new_wordEntery_strip (wordString, rule); /* if not in list , Add stem it to the list */
201
+ if ((stat))
202
+ Doc->wordStat = g_list_prepend (Doc->wordStat, stat);
203
+ g_free (tmp);
204
+ return;
205
+ }
206
+
207
+
208
+
209
+
210
+ void
211
+ ots_print_wordlist (FILE * stream, const GList * aList)
212
+ {
213
+ GList *li;
214
+ for (li = (GList *) aList; li != NULL; li = li->next)
215
+ fprintf (stream, "Word[%d][%s]\n", ((OtsWordEntery *) li->data)->occ,
216
+ ((OtsWordEntery *) li->data)->word);
217
+ }
218
+
219
+
220
+
@@ -0,0 +1,3 @@
1
+ require 'ots'
2
+ require 'minitest/spec'
3
+ require 'minitest/autorun'
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
# Specs for OTS::Article: title and keyword extraction, summarisation,
# UTF-8 handling and loading a non-default dictionary.
describe 'OTS::Article' do
  before do
    @sample = <<-TEXT
      The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.
      It is the only species in its genus. The species has a worldwide distribution, with Atlantic and
      Pacific subspecies.
    TEXT

    @article = OTS::Article.new(@sample)
  end

  it 'should extract title keywords from given document' do
    assert_equal 'species,turtle,subspecies,pacific,atlantic', @article.title
  end

  it 'should extract keywords from given document' do
    expect = %w{
      species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family
      belonging sea endangered critically hawksbill
    }

    assert_equal expect, @article.keywords
  end

  # This example checks summarize, not keywords, so it gets its own description.
  it 'should summarize the given document' do
    # Collapse whitespace so the comparison does not depend on heredoc layout.
    lines  = @article.summarize(lines: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]}
    expect = [
      ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48],
      ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20],
    ]

    assert_equal expect, lines
  end

  it 'should utf8 encode strings properly' do
    text    = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8')
    article = OTS.parse(text)
    summary = article.summarize(lines: 1).first[:sentence]
    assert_equal text, summary
  end

  describe 'dictionaries' do
    it 'should load the french dictionary' do
      article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", "fr")
      assert_equal "j'ai besoin de la crème glacée.", article.summarize(lines: 1).first[:sentence]
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'helper'
2
+
3
# Specs for the module-level OTS API: parse() and dictionaries.
describe 'OTS' do
  it 'parse() should return an article instance' do
    assert_kind_of OTS::Article, OTS.parse("hello world")
  end

  it 'parse() should raise ArgumentError on invalid text' do
    assert_raises(ArgumentError) do
      OTS.parse(1)
    end
  end

  it 'should return a list of dictionaries' do
    dictionaries = OTS.dictionaries

    %w(en fr it es de ru).each do |name|
      assert dictionaries.include?(name), "has #{name} dictionary"
    end

    # Every dictionary name is expected to be a two-letter language code.
    assert_empty dictionaries.reject {|name| name.size == 2}, "dictionaries path should not have other junk"
  end
end
metadata CHANGED
@@ -1,64 +1,148 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: ots
3
- version: !ruby/object:Gem::Version
4
- version: 0.4.3
5
- prerelease:
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 4
9
+ version: 0.4.4
6
10
  platform: ruby
7
- authors:
11
+ authors:
8
12
  - Bharanee Rathna
9
13
  autorequire:
10
14
  bindir: bin
11
15
  cert_chain: []
12
- date: 2011-05-23 00:00:00.000000000Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: shoulda
16
- requirement: &17368280 !ruby/object:Gem::Requirement
16
+
17
+ date: 2012-01-09 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rake
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
17
24
  none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '2.10'
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
22
31
  type: :development
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rake-compiler
23
35
  prerelease: false
24
- version_requirements: *17368280
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :development
45
+ version_requirements: *id002
25
46
  description: Ruby interface to libots libraries for unix.
26
- email: deepfryed@gmail.com
47
+ email:
48
+ - deepfryed@gmail.com
27
49
  executables: []
28
- extensions:
50
+
51
+ extensions:
29
52
  - ext/extconf.rb
30
- extra_rdoc_files:
31
- - README
32
- files:
33
- - README
34
- - VERSION
53
+ extra_rdoc_files: []
54
+
55
+ files:
56
+ - ext/text.c
57
+ - ext/grader-tf.c
58
+ - ext/stemmer.c
59
+ - ext/article.c
60
+ - ext/grader-tc.c
61
+ - ext/html.c
62
+ - ext/grader.c
35
63
  - ext/ots.c
36
- - lib/ots.rb
37
- - test/ots_test.rb
64
+ - ext/relations.c
65
+ - ext/parser.c
66
+ - ext/dictionary.c
67
+ - ext/highlighter.c
68
+ - ext/wordlist.c
69
+ - ext/grader-tc.h
70
+ - ext/ots.h
71
+ - ext/libots.h
72
+ - ext/version.h
38
73
  - ext/extconf.rb
74
+ - test/test_article.rb
75
+ - test/test_ots.rb
76
+ - test/helper.rb
77
+ - README.md
78
+ - dictionaries/cy.xml
79
+ - dictionaries/tr.xml
80
+ - dictionaries/fr.xml
81
+ - dictionaries/yi.xml
82
+ - dictionaries/ms.xml
83
+ - dictionaries/ia.xml
84
+ - dictionaries/lv.xml
85
+ - dictionaries/gl.xml
86
+ - dictionaries/cs.xml
87
+ - dictionaries/sv.xml
88
+ - dictionaries/is.xml
89
+ - dictionaries/fi.xml
90
+ - dictionaries/bg.xml
91
+ - dictionaries/uk.xml
92
+ - dictionaries/et.xml
93
+ - dictionaries/tl.xml
94
+ - dictionaries/da.xml
95
+ - dictionaries/it.xml
96
+ - dictionaries/ru.xml
97
+ - dictionaries/nl.xml
98
+ - dictionaries/eo.xml
99
+ - dictionaries/mi.xml
100
+ - dictionaries/ro.xml
101
+ - dictionaries/pl.xml
102
+ - dictionaries/ga.xml
103
+ - dictionaries/he.xml
104
+ - dictionaries/mt.xml
105
+ - dictionaries/eu.xml
106
+ - dictionaries/hu.xml
107
+ - dictionaries/en.xml
108
+ - dictionaries/de.xml
109
+ - dictionaries/el.xml
110
+ - dictionaries/pt.xml
111
+ - dictionaries/ca.xml
112
+ - dictionaries/es.xml
113
+ - dictionaries/nn.xml
114
+ - dictionaries/id.xml
115
+ has_rdoc: true
39
116
  homepage: http://github.com/deepfryed/ots
40
117
  licenses: []
118
+
41
119
  post_install_message:
42
120
  rdoc_options: []
43
- require_paths:
121
+
122
+ require_paths:
44
123
  - lib
45
- required_ruby_version: !ruby/object:Gem::Requirement
124
+ required_ruby_version: !ruby/object:Gem::Requirement
46
125
  none: false
47
- requirements:
48
- - - ! '>='
49
- - !ruby/object:Gem::Version
50
- version: '0'
51
- required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ segments:
130
+ - 0
131
+ version: "0"
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
133
  none: false
53
- requirements:
54
- - - ! '>='
55
- - !ruby/object:Gem::Version
56
- version: '0'
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ segments:
138
+ - 0
139
+ version: "0"
57
140
  requirements: []
141
+
58
142
  rubyforge_project:
59
- rubygems_version: 1.8.2
143
+ rubygems_version: 1.3.7
60
144
  signing_key:
61
145
  specification_version: 3
62
146
  summary: Open Text Summarizer interface for Ruby.
63
- test_files:
64
- - test/ots_test.rb
147
+ test_files: []
148
+